Mastodata/Mastodata_local+public.py

###################################################
#___  ___          _            _       _         #
#|  \/  |         | |          | |     | |        #
#| .  . | __ _ ___| |_ ___   __| | __ _| |_ __ _  #
#| |\/| |/ _` / __| __/ _ \ / _` |/ _` | __/ _` | #
#| |  | | (_| \__ \ || (_) | (_| | (_| | || (_| | #
#\_|  |_/\__,_|___/\__\___/ \__,_|\__,_|\__\__,_| #
###################################################


# This script retrieves toots from a Mastodon instance and saves them as json files. It is intended to be run as a cron job.
#

# Currently it retrieves the public and local timelines from the last hour.

# The Mastodon instance to query is defined in the variable 'instance'.

# The data is saved in the 'data' directory in a subdirectory named after the instance.
# The data is saved in json files, one for each hour, in subdirectories 'timeline_public' and 'timeline_local'.

# The script uses the Mastodon.py library to interact with the Mastodon API: https://pypi.org/project/Mastodon.py/
# For instructions on using the Mastodon.py library see: https://mastodonpy.readthedocs.io/en/stable/


################### Mastodon instance to query #################################
# Depending on the size of the queried instances, do a test run to see if the  #
# script can collect all the requested data within an hour; otherwise an       #
# hourly automation might run into trouble.                                    #
################################################################################

# Host names of the instances to query, given without scheme (the script
# prepends 'https://' itself), e.g. ['mastodon.social'].
instances = []

################### Params for data retrieval ##################################

# Switches selecting which timelines are collected; both values are used as
# the default arguments of query_instance().
retrieve_timeline_public = True
retrieve_timeline_local = True

################### Path for log file ##########################################
# Please enter the full (not relative) path where the log file should be       #
# saved, including the desired file name itself.                               #
################################################################################

# NOTE(review): this default is a relative path, so the log file is created in
# the working directory of the process unless a full path is entered here.
log_file='mastodata_local_pubic.log'


import os
import json
import requests
import logging
import pandas as pd
from datetime import datetime
from mastodon import Mastodon

def check_dir(path):
    """Make sure the directory *path* exists, creating missing parents.

    ``exist_ok=True`` replaces the former "check, then create" sequence, which
    could fail when another process created the directory in between.

    Raises:
        FileExistsError: If *path* exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)


################# Logging level ###############################################
# Configure the root logger: records of level INFO and above are written to
# `log_file`, each line prefixed with the record's timestamp.
logging.basicConfig(filename=log_file, level=logging.INFO, format='%(asctime)s %(message)s')


################# Hourly Timeline Collection ##################################
################# Fetching local timeline #####################################

def get_timeline_local(mastodon, outpath_base, instance, hours=1):
    """Save the local-timeline toots of the last full hour(s) as a JSON file.

    The collection window ends at the start of the current hour (UTC) and
    reaches ``hours`` hours back. It is half-open,
    ``since <= created_at < now``, so a toot created exactly on an hour
    boundary ends up in one hourly file only instead of two consecutive ones.

    Args:
        mastodon: API client offering ``timeline_local(limit=..., max_id=...)``.
        outpath_base: Directory below which ``timeline_local/`` is created.
        instance: Instance name; only used in the log message.
        hours: Length of the collection window in hours.

    The result is written to ``<outpath_base>/timeline_local/<now>.json``;
    no file is written when the window contains no toots.
    """
    # Truncate to the full hour so that only "whole" hours are collected.
    now = pd.Timestamp('now', tz='utc').replace(
        minute=0, second=0, microsecond=0, nanosecond=0
    )
    since = now - pd.DateOffset(hours=hours)

    outpath = os.path.join(outpath_base, 'timeline_local')
    os.makedirs(outpath, exist_ok=True)

    toots = []
    max_id = None
    while True:
        # 40 posts per request is the current maximum of the API.
        page = mastodon.timeline_local(limit=40, max_id=max_id)
        if not page:
            break
        toots += page
        # Continue below the last toot of this page on the next request.
        max_id = page[-1]['id']
        # Assumes pages are ordered newest first: once the last toot of a page
        # is older than the window, nothing relevant can follow.
        if page[-1]['created_at'] < since:
            break

    toots = [toot for toot in toots if since <= toot['created_at'] < now]

    # Log the number of retrieved toots.
    logging.info(f'{datetime.now()}: Number of toots retrieved from timeline_local for {instance}: {len(toots)}.')
    if toots:
        outfile = os.path.join(outpath, f'{now}.json')
        # default=str stringifies values json cannot encode itself (e.g. datetimes).
        with open(outfile, 'w', encoding='utf-8') as of:
            of.write(json.dumps(toots, indent=4, default=str))


################# Fetching public timeline #####################################

def get_timeline_public(mastodon, outpath_base, instance, hours=1):
    """Save the public-timeline toots of the last full hour(s) as a JSON file.

    The collection window ends at the start of the current hour (UTC) and
    reaches ``hours`` hours back. It is half-open,
    ``since <= created_at < now``, so a toot created exactly on an hour
    boundary ends up in one hourly file only instead of two consecutive ones.

    Args:
        mastodon: API client offering ``timeline_public(limit=..., max_id=...)``.
        outpath_base: Directory below which ``timeline_public/`` is created.
        instance: Instance name; only used in the log message.
        hours: Length of the collection window in hours.

    The result is written to ``<outpath_base>/timeline_public/<now>.json``;
    no file is written when the window contains no toots.
    """
    # Truncate to the full hour so that only "whole" hours are collected.
    now = pd.Timestamp('now', tz='utc').replace(
        minute=0, second=0, microsecond=0, nanosecond=0
    )
    since = now - pd.DateOffset(hours=hours)

    outpath = os.path.join(outpath_base, 'timeline_public')
    os.makedirs(outpath, exist_ok=True)

    toots = []
    max_id = None
    while True:
        # 40 posts per request is the current maximum of the API.
        page = mastodon.timeline_public(limit=40, max_id=max_id)
        if not page:
            break
        toots += page
        # Continue below the last toot of this page on the next request.
        max_id = page[-1]['id']
        # Assumes pages are ordered newest first: once the last toot of a page
        # is older than the window, nothing relevant can follow.
        if page[-1]['created_at'] < since:
            break

    toots = [toot for toot in toots if since <= toot['created_at'] < now]

    # Log the number of retrieved toots.
    logging.info(f'{datetime.now()}: Number of toots retrieved from timeline_public for {instance}: {len(toots)}.')
    if toots:
        outfile = os.path.join(outpath, f'{now}.json')
        # default=str stringifies values json cannot encode itself (e.g. datetimes).
        with open(outfile, 'w', encoding='utf-8') as of:
            of.write(json.dumps(toots, indent=4, default=str))


#### The function 'query_instance' starts the collection of timelines

def query_instance(instances, retrieve_timeline_public=retrieve_timeline_public, retrieve_timeline_local=retrieve_timeline_local):
    """Collect the selected timelines for every instance in *instances*.

    For each instance an app is registered with ``Mastodon.create_app`` (its
    credentials are written to ``secrets/mastodata_<instance>.secret``), a
    client is created from that file and the requested timelines are stored
    below ``data_test/<instance>``.

    Args:
        instances: Iterable of instance host names (without ``https://``).
        retrieve_timeline_public: Whether to collect the public timeline.
        retrieve_timeline_local: Whether to collect the local timeline.

    Errors raised while retrieving a timeline are logged (with traceback) and
    do not stop the processing of the remaining instances.
    """
    for instance in instances:
        app_name = f'mastodata-{instance}'
        api_base_url = f'https://{instance}'

        # Base directory for this instance; the timeline functions append
        # their own sub-directory to it.
        outpath_base = os.path.join('data_test', f'{instance}')

        secrets_path = 'secrets'
        check_dir(secrets_path)
        app_secret_file = os.path.join(secrets_path, f'mastodata_{instance}.secret')

        Mastodon.create_app(
            app_name,
            api_base_url=api_base_url,
            to_file=app_secret_file,
        )

        mastodon = Mastodon(client_id=app_secret_file)

        # There is no up-front test whether an instance requires
        # authentication; as a workaround any failure is caught and logged.
        # TODO: implement the if/else scheme from the Mastodata hashtag script
        # here as well.
        try:
            # Keyword arguments on purpose: the timeline functions take
            # (mastodon, outpath_base, instance), and passing these two
            # positionally in the wrong order silently swaps path and name.
            if retrieve_timeline_public:
                print(f' {datetime.now()}: Started retrieving public timeline from {instance}.') # later to be realized as log entry
                get_timeline_public(mastodon, outpath_base=outpath_base, instance=instance)
            if retrieve_timeline_local:
                print(f' {datetime.now()}: Started retrieving local timeline from {instance}.') # later to be realized as log entry
                get_timeline_local(mastodon, outpath_base=outpath_base, instance=instance)
        except Exception:
            # `Exception` instead of a bare except, so KeyboardInterrupt and
            # SystemExit still stop the script; the traceback goes to the log.
            logging.exception(f'Error retrieving data from instance: {instance}. Check if instance requires authentication.')


####### Fetch data for specified instance from the previous hour #############

# Entry point of the script: executed whenever the file is run. With the
# default empty `instances` list this call does nothing.
query_instance(instances, retrieve_timeline_public=retrieve_timeline_public, retrieve_timeline_local=retrieve_timeline_local)