199 lines
7.2 KiB
Python
199 lines
7.2 KiB
Python
###################################################
|
|
#___ ___ _ _ _ #
|
|
#| \/ | | | | | | | #
|
|
#| . . | __ _ ___| |_ ___ __| | __ _| |_ __ _ #
|
|
#| |\/| |/ _` / __| __/ _ \ / _` |/ _` | __/ _` | #
|
|
#| | | | (_| \__ \ || (_) | (_| | (_| | || (_| | #
|
|
#\_| |_/\__,_|___/\__\___/ \__,_|\__,_|\__\__,_| #
|
|
###################################################
|
|
|
|
|
|
# This script retrieves toots from a Mastodon instance and saves them as json files. It is intended to be run as a cron job.
|
|
#
|
|
|
|
# Currently it retrieves the public and local timelines from the last hour.
|
|
|
|
# The Mastodon instances to query are defined in the list 'instances'.
|
|
|
|
# The data is saved in the 'data' directory in a subdirectory named after the instance.
|
|
# The data is saved in json files, one for each hour, in subdirectories 'timeline_public' and 'timeline_local'.
|
|
|
|
# The script uses the Mastodon.py library to interact with the Mastodon API: https://pypi.org/project/Mastodon.py/
|
|
# For instructions on using the Mastodon.py library see: https://mastodonpy.readthedocs.io/en/stable/
|
|
|
|
|
|
################### Mastodon instance to query #################################
|
|
# Depending on the size of the queried instances, do a test run to see if the  #
# script can collect all the requested data within an hour, otherwise an       #
# hourly automation might run into trouble.                                    #
|
|
################################################################################
|
|
|
|
# Host names of the Mastodon instances to query, without scheme: the script
# builds 'https://<instance>' from each entry. While the list is empty the
# script collects nothing.
instances = []
|
|
|
|
################### Params for data retrieval ##################################
|
|
|
|
# Set to False to skip the public timeline. Also used as the default of the
# query_instance() parameter of the same name.
retrieve_timeline_public = True
|
|
# Set to False to skip the local timeline. Also used as the default of the
# query_instance() parameter of the same name.
retrieve_timeline_local = True
|
|
|
|
################### Path for log file ##########################################
|
|
# Please enter the full (not relative) path where the log file should be     #
# saved, including the desired file name itself                              #
|
|
################################################################################
|
|
|
|
# Passed to logging.basicConfig(filename=...) further down.
# NOTE(review): this is a relative path, so the log lands in whatever the
# working directory of the (cron) process is; the name also looks like a typo
# for 'public' -- confirm before renaming, existing logs use this name.
log_file='mastodata_local_pubic.log'
|
|
|
|
|
|
|
|
import os
|
|
import json
|
|
import requests
|
|
import logging
|
|
import pandas as pd
|
|
from datetime import datetime
|
|
from mastodon import Mastodon
|
|
|
|
def check_dir(path):
    """Make sure the directory ``path`` exists, creating parents as needed.

    ``exist_ok=True`` replaces the former "check, then create" sequence, which
    could raise ``FileExistsError`` when two overlapping runs created the same
    directory between the check and the ``makedirs`` call.

    Raises:
        FileExistsError: if ``path`` exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)
|
|
|
|
|
|
################# Logging level ###############################################
|
|
# Configure the root logger: records of level INFO and above are written to
# `log_file`, each line prefixed with the record's timestamp (%(asctime)s).
logging.basicConfig(filename=log_file, level=logging.INFO, format='%(asctime)s %(message)s')
|
|
|
|
|
|
################# Hourly Timeline Collection ##################################
|
|
################# Fetching local timeline #####################################
|
|
|
|
def get_timeline_local(mastodon, outpath_base, instance, hours=1):
    """Save the local-timeline toots of the last ``hours`` full hour(s) as JSON.

    The collection window is ``[since, now)``, where ``now`` is the current
    UTC time truncated to the top of the hour and ``since`` lies ``hours``
    before it. The window is half-open so that a toot stamped exactly on the
    hour ends up in one hourly file only, not in two consecutive ones.

    Args:
        mastodon: Client object providing ``timeline_local(limit=, max_id=)``;
            every returned toot must offer ``['id']`` and a timezone-aware
            ``['created_at']``.
        outpath_base: Base directory; the file is written to the
            ``timeline_local`` subdirectory, which is created if missing.
        instance: Instance name, used in the log message only.
        hours: Length of the collection window in hours.

    The output file is named ``<now>.json`` (e.g.
    ``2024-01-01 12:00:00+00:00.json``) and is only written when at least one
    toot falls into the window.
    """
    # Truncate to the top of the hour to collect only "whole" hours.
    now = pd.Timestamp('now', tz='utc').replace(minute=0, second=0, microsecond=0)
    since = now - pd.DateOffset(hours=hours)

    outpath = os.path.join(outpath_base, 'timeline_local')
    os.makedirs(outpath, exist_ok=True)

    toots = []
    max_id = None
    while True:
        # 40 posts per request is the current max. limit of the API.
        page = mastodon.timeline_local(limit=40, max_id=max_id)
        if not page:
            break
        toots.extend(page)
        # Pages arrive newest first: continue below the oldest toot seen and
        # stop once that toot is older than the window.
        max_id = page[-1]['id']
        if page[-1]['created_at'] < since:
            break

    toots = [toot for toot in toots if since <= toot['created_at'] < now]

    # Logging amount of retrieved toots
    logging.info(f'{datetime.now()}: Number of toots retrieved from timeline_local for {instance}: {len(toots)}.')

    if toots:
        # Serialise first so a failure cannot leave a truncated file behind.
        txt = json.dumps(toots, indent=4, default=str)
        outfile = os.path.join(outpath, f'{now}.json')
        with open(outfile, 'w', encoding='utf-8') as of:
            of.write(txt)
|
|
|
|
|
|
################# Fetching public timeline #####################################
|
|
|
|
def get_timeline_public(mastodon, outpath_base, instance, hours=1):
    """Save the public-timeline toots of the last ``hours`` full hour(s) as JSON.

    The collection window is ``[since, now)``, where ``now`` is the current
    UTC time truncated to the top of the hour and ``since`` lies ``hours``
    before it. The window is half-open so that a toot stamped exactly on the
    hour ends up in one hourly file only, not in two consecutive ones.

    Args:
        mastodon: Client object providing ``timeline_public(limit=, max_id=)``;
            every returned toot must offer ``['id']`` and a timezone-aware
            ``['created_at']``.
        outpath_base: Base directory; the file is written to the
            ``timeline_public`` subdirectory, which is created if missing.
        instance: Instance name, used in the log message only.
        hours: Length of the collection window in hours.

    The output file is named ``<now>.json`` (e.g.
    ``2024-01-01 12:00:00+00:00.json``) and is only written when at least one
    toot falls into the window.
    """
    # Truncate to the top of the hour to collect only "whole" hours.
    now = pd.Timestamp('now', tz='utc').replace(minute=0, second=0, microsecond=0)
    since = now - pd.DateOffset(hours=hours)

    outpath = os.path.join(outpath_base, 'timeline_public')
    os.makedirs(outpath, exist_ok=True)

    toots = []
    max_id = None
    while True:
        # 40 posts per request is the current max. limit of the API.
        page = mastodon.timeline_public(limit=40, max_id=max_id)
        if not page:
            break
        toots.extend(page)
        # Pages arrive newest first: continue below the oldest toot seen and
        # stop once that toot is older than the window.
        max_id = page[-1]['id']
        if page[-1]['created_at'] < since:
            break

    toots = [toot for toot in toots if since <= toot['created_at'] < now]

    # Logging amount of retrieved toots
    logging.info(f'{datetime.now()}: Number of toots retrieved from timeline_public for {instance}: {len(toots)}.')

    if toots:
        # Serialise first so a failure cannot leave a truncated file behind.
        txt = json.dumps(toots, indent=4, default=str)
        outfile = os.path.join(outpath, f'{now}.json')
        with open(outfile, 'w', encoding='utf-8') as of:
            of.write(txt)
|
|
|
|
|
|
|
|
|
|
#### The function 'query_instance' starts the collection of timelines
|
|
|
|
def query_instance(instances, retrieve_timeline_public=retrieve_timeline_public, retrieve_timeline_local=retrieve_timeline_local):
    """Collect the enabled timelines for every instance in ``instances``.

    For each instance the app credentials are read from
    ``secrets/mastodata_<instance>.secret``; the app is registered with the
    instance only when that file does not exist yet, instead of on every run.
    The selected timelines are then written below ``data_test/<instance>``.

    Args:
        instances: Iterable of instance host names without scheme; the API
            base URL is built as ``https://<instance>``.
        retrieve_timeline_public: Collect the public timeline when true.
        retrieve_timeline_local: Collect the local timeline when true.

    Any error raised while registering the app, creating the client or
    fetching a timeline is logged together with its traceback, and the loop
    continues with the next instance.
    """
    for instance in instances:
        app_name = f'mastodata-{instance}'
        api_base_url = f'https://{instance}'

        # Base directory of this instance; the timeline functions append
        # their own subdirectory ('timeline_public' / 'timeline_local').
        outpath_base = os.path.join('data_test', f'{instance}')

        secrets_path = 'secrets'
        check_dir(secrets_path)
        app_secret_file = os.path.join(secrets_path, f'mastodata_{instance}.secret')

        # TODO: a proper test is needed for whether an instance requires
        # authentication; the try/except below is only a workaround.
        # TODO: implement the if/else scheme from the Mastodata hashtag
        # script here as well.
        try:
            # Register the app only once per instance and reuse the stored
            # credentials on later runs.
            if not os.path.isfile(app_secret_file):
                Mastodon.create_app(
                    app_name,
                    api_base_url=api_base_url,
                    to_file=app_secret_file,
                )

            mastodon = Mastodon(client_id=app_secret_file)

            if retrieve_timeline_public:
                print(f' {datetime.now()}: Started retrieving public timeline from {instance}.') # later to be realized as log entry
                # Keyword arguments: the positional call used to swap
                # outpath_base and instance.
                get_timeline_public(mastodon, outpath_base=outpath_base, instance=instance)
            if retrieve_timeline_local:
                print(f' {datetime.now()}: Started retrieving local timeline from {instance}.') # later to be realized as log entry
                get_timeline_local(mastodon, outpath_base=outpath_base, instance=instance)
        except Exception:
            # `Exception` rather than a bare except, so Ctrl-C and SystemExit
            # still stop the script; logging.exception keeps the real cause.
            logging.exception(f'Error retrieving data from instance: {instance}. Check if instance requires authentication.')
|
|
|
|
|
|
|
|
####### Fetch data for specified instance from the previous hour #############
|
|
|
|
# Script entry point: collect the configured timelines for all configured
# instances. This statement is at module level, so it also runs when the file
# is imported.
query_instance(instances, retrieve_timeline_public=retrieve_timeline_public, retrieve_timeline_local=retrieve_timeline_local)
|
|
|
|
|
|
|
|
|