Add files via upload

Initial upload of Mastodata
This commit is contained in:
Almex
2024-09-05 17:37:42 +02:00
committed by GitHub
parent b8f02d76f3
commit 7eff3ea12d
6 changed files with 1514 additions and 1 deletions

Mastodata_local+public.py (new file, 198 lines)

@@ -0,0 +1,198 @@
###################################################
#___ ___ _ _ _ #
#| \/ | | | | | | | #
#| . . | __ _ ___| |_ ___ __| | __ _| |_ __ _ #
#| |\/| |/ _` / __| __/ _ \ / _` |/ _` | __/ _` | #
#| | | | (_| \__ \ || (_) | (_| | (_| | || (_| | #
#\_| |_/\__,_|___/\__\___/ \__,_|\__,_|\__\__,_| #
###################################################
# This script retrieves toots from a Mastodon instance and saves them as json files. It is intended to be run as a cron job.
#
# Currently it retrieves the public and local timelines from the last hour.
# The Mastodon instance to query is defined in the variable 'instance'.
# The data is saved in the 'data' directory in a subdirectory named after the instance.
# The data is saved in json files, one for each hour, in subdirectories 'timeline_public' and 'timeline_local'.
# The script uses the Mastodon.py library to interact with the Mastodon API: https://pypi.org/project/Mastodon.py/
# For instructions on using the Mastodon.py library see: https://mastodonpy.readthedocs.io/en/stable/
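# A minimal example of an hourly crontab entry for this purpose (the interpreter
# and script paths below are placeholders and must be adapted to your setup):
#   0 * * * * /usr/bin/python3 /path/to/Mastodata_local+public.py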
################### Mastodon instances to query ################################
# Depending on the size of the queried instances, do a test run to check that #
# the script can collect all the requested data within an hour; otherwise an  #
# hourly automation might run into trouble.                                    #
################################################################################
instances = []
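# Example (the instance names below are placeholders used only for illustration):
# instances = ['mastodon.social', 'fosstodon.org']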
################### Params for data retrieval ##################################
retrieve_timeline_public = True
retrieve_timeline_local = True
################### Path for log file ##########################################
# Please enter the full (not relative) path where the log file should be      #
# saved, including the desired file name.                                     #
################################################################################
log_file = 'mastodata_local_public.log'
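# For example (hypothetical path): log_file = '/var/log/mastodata/mastodata_local_public.log'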
import os
import json
import requests
import logging
import pandas as pd
from datetime import datetime
from mastodon import Mastodon
def check_dir(path):
    if not os.path.isdir(path):
        os.makedirs(path)
################# Logging level ###############################################
logging.basicConfig(filename=log_file, level=logging.INFO, format='%(asctime)s %(message)s')
################# Hourly Timeline Collection ##################################
################# Fetching local timeline #####################################
def get_timeline_local(mastodon, outpath_base, instance, hours=1):
    toots = []
    now = pd.Timestamp('now', tz='utc')
    # set 'now' to zero to collect only "whole" hours
    now = now.replace(minute=0, second=0, microsecond=0)
    timeoffset = pd.DateOffset(hours=hours)
    since = now - timeoffset
    get_more = True
    max_id = None
    outpath = os.path.join(outpath_base, 'timeline_local')
    check_dir(outpath)
    while get_more:
        # Call the correct API method; 40 posts per request is the current max. limit
        tmp = mastodon.timeline_local(limit=40, max_id=max_id)
        if len(tmp) > 0:
            toots += tmp
            max_id = tmp[-1]['id']
            if tmp[-1]['created_at'] < since:
                get_more = False
        else:
            get_more = False
    toots = [toot for toot in toots if toot['created_at'] >= since]
    toots = [toot for toot in toots if toot['created_at'] <= now]
    # Logging amount of retrieved toots
    logging.info(f'{datetime.now()}: Number of toots retrieved from timeline_local for {instance}: {len(toots)}.')
    if len(toots) > 0:
        outfile = os.path.join(outpath, f'{now}.json')
        # write collected toots to json files
        with open(outfile, 'w') as of:
            txt = json.dumps(toots, indent=4, default=str)
            of.write(txt)
################# Fetching public timeline #####################################
def get_timeline_public(mastodon, outpath_base, instance, hours=1):
    toots = []
    now = pd.Timestamp('now', tz='utc')
    now = now.replace(minute=0, second=0, microsecond=0)
    timeoffset = pd.DateOffset(hours=hours)
    since = now - timeoffset
    get_more = True
    max_id = None
    outpath = os.path.join(outpath_base, 'timeline_public')
    check_dir(outpath)
    while get_more:
        tmp = mastodon.timeline_public(limit=40, max_id=max_id)
        if len(tmp) > 0:
            toots += tmp
            max_id = tmp[-1]['id']
            if tmp[-1]['created_at'] < since:
                get_more = False
        else:
            get_more = False
    toots = [toot for toot in toots if toot['created_at'] >= since]
    toots = [toot for toot in toots if toot['created_at'] <= now]
    logging.info(f'{datetime.now()}: Number of toots retrieved from timeline_public for {instance}: {len(toots)}.')
    if len(toots) > 0:
        outfile = os.path.join(outpath, f'{now}.json')
        with open(outfile, 'w') as of:
            txt = json.dumps(toots, indent=4, default=str)
            of.write(txt)
#### The function 'query_instance' starts the collection of timelines
def query_instance(instances, retrieve_timeline_public=retrieve_timeline_public, retrieve_timeline_local=retrieve_timeline_local):
    #### execute the function for every instance given in params ######
    for instance in instances:
        ### Generate secrets
        ### params #############################
        app_name = f'mastodata-{instance}'
        api_base_url = f'https://{instance}'
        #### TODO: move the declaration of the path into the following if condition,
        #### since the paths to the timeline data differ
        outpath_base = os.path.join('data_test', instance)
        #######################################
        secrets_path = 'secrets'
        check_dir(secrets_path)
        app_secret_file = os.path.join(secrets_path, f'mastodata_{instance}.secret')
        Mastodon.create_app(
            app_name,
            api_base_url = api_base_url,
            to_file = app_secret_file
        )
        mastodon = Mastodon(client_id = app_secret_file)
        # A check is still needed to determine whether a Mastodon instance requires
        # authentication; for now this is worked around with try/except.
        ### Implement the if/else scheme from the Mastodata hashtag script here as well
        try:
            if retrieve_timeline_public:
                print(f'{datetime.now()}: Started retrieving public timeline from {instance}.')  # later to be realized as a log entry
                get_timeline_public(mastodon, outpath_base, instance)
            if retrieve_timeline_local:
                print(f'{datetime.now()}: Started retrieving local timeline from {instance}.')  # later to be realized as a log entry
                get_timeline_local(mastodon, outpath_base, instance)
        except Exception:
            logging.error(f'Error retrieving data from instance: {instance}. Check if instance requires authentication.')
####### Fetch data for the specified instances from the previous hour #########
query_instance(instances, retrieve_timeline_public=retrieve_timeline_public, retrieve_timeline_local=retrieve_timeline_local)
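################### Example: loading the collected data ########################
# A minimal sketch of how the hourly JSON dumps written above can be read back
# for analysis with pandas. This assumes at least one file has been written;
# '<instance>' is a placeholder for an actual instance name. The lines are
# commented out so they do not run as part of the cron job.
#
# import glob
# files = glob.glob(os.path.join('data_test', '<instance>', 'timeline_public', '*.json'))
# frames = [pd.read_json(f) for f in files]
# if frames:
#     df = pd.concat(frames, ignore_index=True)
#     print(df[['id', 'created_at', 'url']].head())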