Add files via upload
Initial upload of Mastodata
This commit is contained in:
198
Mastodata_local+public.py
Normal file
198
Mastodata_local+public.py
Normal file
@@ -0,0 +1,198 @@
|
||||
###################################################
|
||||
#___ ___ _ _ _ #
|
||||
#| \/ | | | | | | | #
|
||||
#| . . | __ _ ___| |_ ___ __| | __ _| |_ __ _ #
|
||||
#| |\/| |/ _` / __| __/ _ \ / _` |/ _` | __/ _` | #
|
||||
#| | | | (_| \__ \ || (_) | (_| | (_| | || (_| | #
|
||||
#\_| |_/\__,_|___/\__\___/ \__,_|\__,_|\__\__,_| #
|
||||
###################################################
|
||||
|
||||
|
||||
# This script retrieves toots from a Mastodon instance and saves them as json files. It is intented to be run as a cron job.
|
||||
#
|
||||
|
||||
# Currently it retrieves the public and local timelines from the last hour.
|
||||
|
||||
# The Mastodon instance to query is defined in the variable 'instance'.
|
||||
|
||||
# The data is saved in the 'data' directory in a subdirectory named after the instance.
|
||||
# The data is saved in json files, one for each hour, in subdirectories 'timeline_public' and 'timeline_local'.
|
||||
|
||||
# The script uses the Mastodon.py library to interact with the Mastodon API: https://pypi.org/project/Mastodon.py/
|
||||
# For instructions on using the Mastodon.py library see: https://mastodonpy.readthedocs.io/en/stable/
|
||||
|
||||
|
||||
################### Mastodon instance to query #################################
|
||||
# Depending on the size of the querried instances, do a test run to see if the #
|
||||
# script can collect all the requested within an hour, otherwise an hourly #
|
||||
# automation might run into trouble. #
|
||||
################################################################################
|
||||
|
||||
instances = []
|
||||
|
||||
################### Params for data retrieval ##################################
|
||||
|
||||
retrieve_timeline_public = True
|
||||
retrieve_timeline_local = True
|
||||
|
||||
################### Path for log file ##########################################
|
||||
# Please enter the full (not relative) path, where the path where log file #
|
||||
# should be saved and the desired file name itself #
|
||||
################################################################################
|
||||
|
||||
log_file='mastodata_local_pubic.log'
|
||||
|
||||
|
||||
|
||||
import os
|
||||
import json
|
||||
import requests
|
||||
import logging
|
||||
import pandas as pd
|
||||
from datetime import datetime
|
||||
from mastodon import Mastodon
|
||||
|
||||
def check_dir(path):
|
||||
if not os.path.isdir(path):
|
||||
os.makedirs(path)
|
||||
|
||||
|
||||
################# Logging level ###############################################
|
||||
logging.basicConfig(filename=log_file, level=logging.INFO, format='%(asctime)s %(message)s')
|
||||
|
||||
|
||||
################# Hourly Timeline Collection ##################################
|
||||
################# Fetching local timeline #####################################
|
||||
|
||||
def get_timeline_local(mastodon, outpath_base, instance, hours=1):
|
||||
|
||||
toots = []
|
||||
now = pd.Timestamp('now', tz='utc')
|
||||
# set 'now' to zero to collect only "whole" hours
|
||||
now = now.replace(minute=0, second=0, microsecond=0)
|
||||
timeoffset = pd.DateOffset(hours=hours)
|
||||
since = now - timeoffset
|
||||
|
||||
get_more = True
|
||||
max_id=None
|
||||
outpath = os.path.join(outpath_base, 'timeline_local')
|
||||
check_dir(outpath)
|
||||
while get_more:
|
||||
# Call the correct API-method, 40 posts per request is the current max. limit
|
||||
tmp = mastodon.timeline_local(limit=40, max_id=max_id)
|
||||
if len(tmp) > 0:
|
||||
toots += tmp
|
||||
max_id = tmp[-1]['id']
|
||||
if tmp[-1]['created_at'] < since:
|
||||
get_more = False
|
||||
else:
|
||||
get_more = False
|
||||
|
||||
toots = [toot for toot in toots if toot['created_at'] >= since]
|
||||
toots = [toot for toot in toots if toot['created_at'] <= now]
|
||||
#Logging amount of retrieved toots
|
||||
logging.info(f'{datetime.now()}: Number of toots retrieved from timeline_local for {instance}: {len(toots)}.')
|
||||
if len(toots) > 0:
|
||||
outfile = os.path.join(outpath, f'{now}.json')
|
||||
# write collected toots to json files
|
||||
with open(outfile, 'w') as of:
|
||||
txt = json.dumps(toots, indent=4, default=str)
|
||||
of.write(txt)
|
||||
|
||||
|
||||
################# Fetching public timeline #####################################
|
||||
|
||||
def get_timeline_public(mastodon, outpath_base, instance, hours=1):
|
||||
toots = []
|
||||
now = pd.Timestamp('now', tz='utc')
|
||||
now = now.replace(minute=0, second=0, microsecond=0)
|
||||
timeoffset = pd.DateOffset(hours=hours)
|
||||
since = now - timeoffset
|
||||
|
||||
get_more = True
|
||||
max_id=None
|
||||
outpath = os.path.join(outpath_base, 'timeline_public')
|
||||
check_dir(outpath)
|
||||
while get_more:
|
||||
tmp = mastodon.timeline_public(limit=40, max_id=max_id)
|
||||
if len(tmp) > 0:
|
||||
toots += tmp
|
||||
max_id = tmp[-1]['id']
|
||||
if tmp[-1]['created_at'] < since:
|
||||
get_more = False
|
||||
else:
|
||||
get_more = False
|
||||
|
||||
toots = [toot for toot in toots if toot['created_at'] >= since]
|
||||
toots = [toot for toot in toots if toot['created_at'] <= now]
|
||||
|
||||
logging.info(f'{datetime.now()}: Number of toots retrieved from timeline_public for {instance}: {len(toots)}.')
|
||||
if len(toots) > 0:
|
||||
outfile = os.path.join(outpath, f'{now}.json')
|
||||
|
||||
with open(outfile, 'w') as of:
|
||||
txt = json.dumps(toots, indent=4, default=str)
|
||||
of.write(txt)
|
||||
|
||||
|
||||
|
||||
|
||||
#### The function 'query_instance' starts the collection of timelines
|
||||
|
||||
def query_instance(instances, retrieve_timeline_public=retrieve_timeline_public, retrieve_timeline_local=retrieve_timeline_local):
|
||||
|
||||
#### execute the function for every instance given in params ######
|
||||
for instance in instances:
|
||||
|
||||
|
||||
### Generate secrets
|
||||
### params #############################
|
||||
app_name = f'mastodata-{instance}'
|
||||
api_base_url = f'https://{instance}'
|
||||
|
||||
#### Push declaration of the path in the folowing of condition, since
|
||||
#### the paths to the timeline data differ
|
||||
|
||||
outpath_base = os.path.join('data_test', f'{instance}')
|
||||
#######################################
|
||||
|
||||
#os.path.join nutzen
|
||||
secrets_path = 'secrets'
|
||||
check_dir(secrets_path)
|
||||
app_secret_file = f'{secrets_path}/mastodata_{instance}.secret'
|
||||
|
||||
Mastodon.create_app(
|
||||
app_name,
|
||||
api_base_url = api_base_url,
|
||||
to_file = app_secret_file
|
||||
)
|
||||
|
||||
mastodon = Mastodon(client_id = app_secret_file)
|
||||
# Es braucht einen Test, ob eine Mastodon Instanz Authentifizierung benötigt. Workouraound mit try except implementiert
|
||||
|
||||
### Implement the if else scheme from Mastidata hashtag here as well
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
try:
|
||||
if retrieve_timeline_public:
|
||||
print(f' {datetime.now()}: Started retrieving public timeline from {instance}.') # later to be realized as log entry
|
||||
get_timeline_public(mastodon, instance, outpath_base)
|
||||
if retrieve_timeline_local:
|
||||
print(f' {datetime.now()}: Started retrieving local timeline from {instance}.') # later to be realized as log entry
|
||||
get_timeline_local(mastodon, instance, outpath_base)
|
||||
|
||||
except:
|
||||
logging.error(f'Error retrieving data from instance: {instance}. Check if instance requires authentication.')
|
||||
|
||||
|
||||
|
||||
####### Fetch data for specified instance from the previous hour #############
|
||||
|
||||
query_instance(instances, retrieve_timeline_public=retrieve_timeline_public, retrieve_timeline_local=retrieve_timeline_local)
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user