Add files via upload

Initial upload of Mastodata
This commit is contained in:
Almex
2024-09-05 17:37:42 +02:00
committed by GitHub
parent b8f02d76f3
commit 7eff3ea12d
6 changed files with 1514 additions and 1 deletions

335
Mastodata_hashtag.py Normal file
View File

@@ -0,0 +1,335 @@
###################################################
#___ ___ _ _ _ #
#| \/ | | | | | | | #
#| . . | __ _ ___| |_ ___ __| | __ _| |_ __ _ #
#| |\/| |/ _` / __| __/ _ \ / _` |/ _` | __/ _` | #
#| | | | (_| \__ \ || (_) | (_| | (_| | || (_| | #
#\_| |_/\__,_|___/\__\___/ \__,_|\__,_|\__\__,_| #
###################################################
##############################################################################
# This script retrieves posts from the hashtag timeline of Mastodon. Two     #
# different methods of data collection are possible. However, both methods   #
# require you to enter one or more hashtags; please insert them below in the #
# params. Each hashtag has to be in quotes and separated by a comma.         #
# You can also decide to collect data from one or multiple instances.        #
# Keep in mind, if you do not enter any instance name, every instance        #
# registered on https://fedidb.org/ will be queried!                         #
##############################################################################
# 1. Hourly requests: #
# This method is intended to be used by automating the script in a cron job. #
# Data is saved in this directory pattern: #
# 'WHERE_YOU_EXECUTE_THE_SCRIPT/data/hashtag hourly - YOUR_HASHTAGS' #
# Data is saved in json files, one for each hour. The filename contains #
# the name of the instance from where it originates. #
# Set retrieve_hourly_timeline_hashtag to True to use this method. #
# ############################################################################
# 2. Requesting older posts: #
# Here, the collection is NOT automated. For this method a starting point is #
# defined in the params below. Follow the pattern of the example date. #
# Data is saved in this directory pattern: #
# 'WHERE_YOU_EXECUTE_THE_SCRIPT/data/hashtag #
# #
##############################################################################
# The script uses the Mastodon.py library to interact with the Mastodon API: #
# https://pypi.org/project/Mastodon.py/ #
# For instructions on using the Mastodon.py library see: #
# https://mastodonpy.readthedocs.io/en/stable/ #
##############################################################################
# Imported modules
# https://pypi.org/project/Mastodon.py/
# Based on: https://mastodonpy.readthedocs.io/en/stable/
import os
import json
import logging
import requests
import pandas as pd
import timeit
import concurrent.futures
import pytz
from datetime import datetime, timedelta
from dateutil import parser
from mastodon import Mastodon
################### Params for execution #####################################
# Set the method you want to use to "True". Remember, please use the methods
# separately, not simultaneously.
retrieve_hourly_timeline_hashtag = False
retrieve_old_timeline_hashtag = True
# Insert hashtags, according to the example given.
# Works with one as well as multiple hashtags.
hashtags = ['#NAFO']
# Please enter the desired Mastodon instance domains. All instances must be
# inside the brackets and each of them in quotes.
# If left empty, all instances from fedidb are retrieved and queried (~2300).
instances = []
# Params for the retrieval of older posts.
utc = pytz.UTC
# Insert the starting point for requesting older posts, following the
# pattern of the example date.
start_date = '2024-06-20'
since = parser.parse(start_date)
# Localize to UTC so 'since' can be compared with the timezone-aware
# 'created_at' timestamps returned by the Mastodon API.
since = utc.localize(since)
###############################################################################
#### Params for retrieval of the instance list via the fedidb API #############
keep = 'Mastodon'  # only servers running this software are kept
fedidb_url = 'https://api.fedidb.org/v1/servers/'
limit = 40  # page size per fedidb request
params = {
    'limit': limit
}
###############################################################################
### Ensure a directory exists, creating it (and parents) if necessary
def check_dir(path):
    """Create directory *path* (including parents) if it does not exist.

    Uses ``os.makedirs(..., exist_ok=True)`` instead of the original
    ``isdir`` check followed by ``makedirs``: the check-then-create pair
    could raise ``FileExistsError`` if another process created the
    directory between the two calls.
    """
    os.makedirs(path, exist_ok=True)
# Logging
# Critical errors are reported via print statements in the terminal output.
# Other information, e.g. the amount of collected posts, is saved to
# 'mastodata_hashtag.log'.
# Most log messages are mirrored by print statements, so you can see them
# when executing the script from a terminal.
logging.basicConfig(filename='mastodata_hashtag.log', level=logging.INFO, format='%(asctime)s %(message)s')
#######################################################################
############ Using fedidb to get a list of mastodon domains #######
# This step is skipped, if you wish to collect data only from    #######
# specific instances.                                            #######
#######################################################################
# Collecting domains from fedidb and extracting Mastodon # #######
# domains is done in two steps. First query_fedidb collects #######
# the domains of registered instances with fedidb's API. #######
# More info here: https://fedidb.org/docs/api/v1 #######
# Domains are saved in 'fediverse'. #######
# Afterwards 'get_instances' extracts only the Mastodon domains. #######
# Domains are saved in 'instances'. #######
#######################################################################
######### Getting fedidb instances#####################################
def query_fedidb(url, params=None):
    """Collect all server records from the paginated fedidb API.

    Parameters
    ----------
    url : str
        fedidb endpoint to start from (e.g. the module-level fedidb_url).
    params : dict, optional
        Query parameters for the first request only; follow-up requests
        use the fully-qualified 'next' link returned by the API.

    Returns
    -------
    list of dict
        Raw server entries from every page. A failed page (non-200
        response) contributes nothing and ends the pagination.
    """
    # Avoid a mutable default argument; the original used params=dict().
    if params is None:
        params = {}
    fediverse = []
    more = True
    while more:
        # The 'next' links returned by fedidb already embed the limit;
        # only the very first request needs explicit query parameters.
        if 'limit=' not in url:
            response = requests.get(url, params=params)
        else:
            response = requests.get(url)
        data_raw = response.json() if response.status_code == 200 else {}
        # Overwrite the initial request link with the next-page link;
        # stop when there is no next page.
        if 'links' in data_raw and 'next' in data_raw['links']:
            url = data_raw['links']['next']
        else:
            more = False
        if url is None:
            more = False
        # BUG FIX: the original did data_raw['data'] unconditionally,
        # which raised KeyError whenever a request failed (data_raw was
        # then an empty dict).
        fediverse.extend(data_raw.get('data', []))
    return fediverse
######## Extracting Mastodon domains and writing them to 'instances' #####
def get_instances(url, params=None):
    """Return the domains of all Mastodon servers registered on fedidb.

    Flattens the raw fedidb server list and keeps only entries whose
    'software.name' equals the module-level `keep` ('Mastodon').

    Returns an empty list when fedidb returned no entries — the original
    raised KeyError in that case, because pd.json_normalize([]) produces
    a DataFrame without the 'software.name' column.
    """
    # Avoid a mutable default argument; the original used params=dict().
    if params is None:
        params = {}
    fediverse = query_fedidb(url, params=params)
    fediverse_df = pd.json_normalize(fediverse)
    if 'software.name' not in fediverse_df.columns:
        return []
    mastodon_df = fediverse_df[fediverse_df['software.name'] == keep]
    return mastodon_df['domain'].tolist()
##########################################################################
################Getting old hashtag timeline##############################
##########################################################################
def get_old_timeline_hashtag(instance, hashtags, outpath_base, since=None, limit=40):
    """Retrieve posts for one or more hashtags from one instance, paging
    back in time until `since`, and write one JSON file per hashtag.

    Parameters
    ----------
    instance : str
        Instance domain, e.g. 'mastodon.social'.
    hashtags : str or list of str
        Hashtag(s) to query (leading '#' included, as in the params above).
    outpath_base : str
        Directory where result files are written (created if missing).
    since : str or datetime, optional
        Inclusive lower bound for post creation time. Defaults to 24 hours
        before the call. BUG FIX: the original default was evaluated once
        at import time and was timezone-naive, so comparing it with the
        timezone-aware 'created_at' values raised TypeError; naive values
        are now localized to UTC.
    limit : int
        Page size per API request (the original used 40).
    """
    app_name = f'mastodata-{instance}'
    api_base_url = f'https://{instance}'
    secrets_path = 'secrets'
    check_dir(secrets_path)
    # Original carried a "os.path.join nutzen" ("use os.path.join") note;
    # done here.
    app_secret_file = os.path.join(secrets_path, f'mastodata_{instance}.secret')
    # Register an app per instance; the access hash is cached in 'secrets'.
    Mastodon.create_app(
        app_name,
        api_base_url=api_base_url,
        to_file=app_secret_file
    )
    mastodon = Mastodon(client_id=app_secret_file)
    # Accept a single hashtag given as a plain string.
    if isinstance(hashtags, str):
        hashtags = [hashtags]
    elif not isinstance(hashtags, list):
        print('Hashtags must be a string or a list of strings')
        return
    now = pd.Timestamp('now', tz='utc')
    # Normalize 'since' to a timezone-aware UTC datetime.
    if since is None:
        since = datetime.now() - timedelta(days=1)
    if isinstance(since, str):
        since = parser.parse(since)
    elif not isinstance(since, datetime):
        print('Since must be a string or a datetime object')
        return
    if since.tzinfo is None:
        since = utc.localize(since)
    outpath = os.path.join(outpath_base)
    check_dir(outpath)
    for hashtag in hashtags:
        # BUG FIX: collect per hashtag. The original accumulated toots
        # across hashtags, so each later hashtag's file also contained
        # every earlier hashtag's posts.
        toots = []
        get_more = True
        max_id = None
        while get_more:
            tmp = mastodon.timeline_hashtag(hashtag, limit=limit, max_id=max_id)
            if len(tmp) > 0:
                toots += tmp
                # Page backwards in time from the oldest post seen so far.
                max_id = tmp[-1]['id']
                if tmp[-1]['created_at'] < since:
                    get_more = False
            else:
                get_more = False
        # Drop the overshoot of the last page.
        toots = [toot for toot in toots if toot['created_at'] >= since]
        logging.info(f'{datetime.now()}: Number of toots retrieved from {instance} for #{hashtag}: {len(toots)} ')
        print(f'{datetime.now()}: Number of toots retrieved from {instance} for #{hashtag}: {len(toots)} ')
        if len(toots) > 0:
            outfile = os.path.join(outpath, f'{hashtag}_{instance}_start_date_{since}_end_date_{now}.json')
            with open(outfile, 'w') as of:
                of.write(json.dumps(toots, indent=4, default=str))
#########################################################################
#################### Getting hourly hashtag timeline ####################
#########################################################################
def get_hourly_timeline_hashtag(instance, hashtags, outpath_base, limit=40):
    """Retrieve the previous full hour of posts per hashtag from one
    instance and write one JSON file per hashtag.

    Intended to be run hourly from a cron job; the collection window is
    the hour preceding the current full hour.

    Parameters
    ----------
    instance : str
        Instance domain, e.g. 'mastodon.social'.
    hashtags : str or list of str
        Hashtag(s) to query (leading '#' included, as in the params above).
    outpath_base : str
        Directory where result files are written (created if missing).
    limit : int
        Page size per API request (the original used 40).
    """
    app_name = f'mastodata-{instance}'
    api_base_url = f'https://{instance}'
    secrets_path = 'secrets'
    check_dir(secrets_path)
    # Original carried a "os.path.join nutzen" ("use os.path.join") note;
    # done here.
    app_secret_file = os.path.join(secrets_path, f'mastodata_{instance}.secret')
    # Register an app per instance; the access hash is cached in 'secrets'.
    Mastodon.create_app(
        app_name,
        api_base_url=api_base_url,
        to_file=app_secret_file
    )
    mastodon = Mastodon(client_id=app_secret_file)
    # Accept a single hashtag given as a plain string.
    # (Original comment was German: "check whether hashtags is a string
    # or a list".)
    if isinstance(hashtags, str):
        hashtags = [hashtags]
    elif not isinstance(hashtags, list):
        print('Hashtags must be a string or a list of strings')
        return
    # Collection window: from the previous full hour to the current one.
    now = pd.Timestamp('now', tz='utc')
    now = now.replace(minute=0, second=0, microsecond=0)
    since = now - pd.DateOffset(hours=1)
    outpath = os.path.join(outpath_base)
    check_dir(outpath)
    for hashtag in hashtags:
        # BUG FIX: collect per hashtag. The original accumulated toots
        # across hashtags, so each later hashtag's file also contained
        # every earlier hashtag's posts.
        toots = []
        get_more = True
        max_id = None
        while get_more:
            tmp = mastodon.timeline_hashtag(hashtag, limit=limit, max_id=max_id)
            if len(tmp) > 0:
                toots += tmp
                # Page backwards in time from the oldest post seen so far.
                max_id = tmp[-1]['id']
                if tmp[-1]['created_at'] < since:
                    get_more = False
            else:
                get_more = False
        # Keep only posts inside the one-hour window.
        toots = [toot for toot in toots if toot['created_at'] >= since]
        # Logging amount of retrieved toots per instance
        logging.info(f'{datetime.now()}: Number of toots retrieved from {instance} for #{hashtag}: {len(toots)} ')
        print(f'{datetime.now()}: Number of toots retrieved from {instance} for #{hashtag}: {len(toots)} ')
        if len(toots) > 0:
            outfile = os.path.join(outpath, f'{hashtag}_{instance}_{now}.json')
            with open(outfile, 'w') as of:
                of.write(json.dumps(toots, indent=4, default=str))
def runner(instances):
    """Entry point: resolve the instance list, then dispatch to the
    collection method selected in the params at the top of the file.

    Parameters
    ----------
    instances : list of str
        Instance domains to query. When empty, all Mastodon domains
        registered on fedidb are fetched and used instead.
    """
    # Get Mastodon domains from fedidb when no instances were configured.
    if len(instances) == 0:
        print('Query fedidb')
        instances = get_instances(fedidb_url, params=params)
    # The output folder name encodes the queried hashtags and is the same
    # for every instance, so build the label once outside the loops.
    hashtag_label = hashtags if isinstance(hashtags, str) else ", ".join(hashtags)
    if retrieve_old_timeline_hashtag:
        outpath_base = os.path.join('data', f'hashtag search {hashtag_label}')
        for instance in instances:
            # Some instances block data collection via generic API access.
            # Catch per-instance errors so one failure does not abort the
            # entire collection.
            try:
                get_old_timeline_hashtag(instance, hashtags, outpath_base=outpath_base, since=since, limit=40)
            except Exception:
                # BUG FIX: was a bare 'except:', which also swallowed
                # KeyboardInterrupt and SystemExit.
                print(f'Error retrieving data from instance: {instance}. Check if instance requires authentication.')
                logging.error(f'Error retrieving data from instance: {instance}. Check if instance requires authentication.')
    elif retrieve_hourly_timeline_hashtag:
        outpath_base = os.path.join('data', f'hashtag hourly - {hashtag_label}')
        for instance in instances:
            try:
                get_hourly_timeline_hashtag(instance, hashtags, outpath_base=outpath_base)
            except Exception:
                print(f'Error retrieving data from instance: {instance}. Check if instance requires authentication.')
                logging.error(f'Error retrieving data from instance: {instance}. Check if instance requires authentication.')
    else:
        print('Please set one of the timeline retrievals to "True" in the parameters at the very top.')


# Invokes every other function; the script starts collecting on execution.
runner(instances)