###################################################
#___  ___          _            _       _         #
#|  \/  |         | |          | |     | |        #
#| .  . | __ _ ___| |_ ___   __| | __ _| |_ __ _  #
#| |\/| |/ _` / __| __/ _ \ / _` |/ _` | __/ _` | #
#| |  | | (_| \__ \ || (_) | (_| | (_| | || (_| | #
#\_|  |_/\__,_|___/\__\___/ \__,_|\__,_|\__\__,_| #
###################################################
##############################################################################
# This script retrieves posts from the hashtag timeline of Mastodon. Two     #
# different methods of data collection are possible. However, both methods   #
# need you to enter one or more hashtags; please insert them below in the    #
# params. Each hashtag has to be in quotes and separated by a comma.         #
# You can also decide to collect data from one or multiple instances.        #
# Keep in mind: if you do not enter any instance name, every instance        #
# registered on https://fedidb.org/ will be queried!                         #
##############################################################################
# 1. Hourly requests:                                                        #
# This method is intended to be used by automating the script in a cron job. #
# Data is saved in this directory pattern:                                   #
# 'WHERE_YOU_EXECUTE_THE_SCRIPT/data/hashtag hourly - YOUR_HASHTAGS'         #
# Data is saved in json files, one for each hour. The filename contains      #
# the name of the instance from where it originates.                         #
# Set retrieve_hourly_timeline_hashtag to True to use this method.           #
##############################################################################
# 2. Requesting older posts:                                                 #
# Here, the collection is NOT automated. For this method a starting point is #
# defined in the params below. Follow the pattern of the example date.       #
# Data is saved in this directory pattern:                                   #
# 'WHERE_YOU_EXECUTE_THE_SCRIPT/data/hashtag search – YOUR_HASHTAGS'         #
# Set retrieve_old_timeline_hashtag to True to use this method.              #
##############################################################################
# The script uses the Mastodon.py library to interact with the Mastodon API: #
# https://pypi.org/project/Mastodon.py/                                      #
# For instructions on using the Mastodon.py library see:                     #
# https://mastodonpy.readthedocs.io/en/stable/                               #
##############################################################################
|
||
|
||
|
||
# Imported modules
|
||
# https://pypi.org/project/Mastodon.py/
|
||
# Based on: https://mastodonpy.readthedocs.io/en/stable/
|
||
import os
|
||
import json
|
||
import logging
|
||
import requests
|
||
import pandas as pd
|
||
import timeit
|
||
import concurrent.futures
|
||
import pytz
|
||
from datetime import datetime, timedelta
|
||
from dateutil import parser
|
||
from mastodon import Mastodon
|
||
|
||
################### Params for execution #####################################

# Switch exactly one of the two collection methods to True. Use the methods
# separately, not simultaneously (if both are True, the retrieval of older
# posts is the one that runs).
retrieve_hourly_timeline_hashtag = False
retrieve_old_timeline_hashtag = True

# Hashtags to collect: one or several, each in quotes, separated by commas.
hashtags = ['#NAFO']

# Domains of the Mastodon instances to query, each in quotes inside the
# brackets. If the list is left empty, the instance list is retrieved from
# fedidb instead (~2300 instances).
instances = []

### Params for the retrieval of older posts.
utc = pytz.UTC
# Starting point for requesting older posts; it is parsed and marked as UTC.
start_date = '2024-06-20'
since = utc.localize(parser.parse(start_date))

###############################################################################

#### Params for retrieval of instance list via fedidb API #####################

# Software name used to pick the Mastodon servers out of the fedidb list.
keep = 'Mastodon'
fedidb_url = 'https://api.fedidb.org/v1/servers/'

# Sent to the fedidb API as the 'limit' query parameter.
limit = 40
params = dict(limit=limit)
|
||
|
||
###############################################################################
|
||
### Create a directory if it does not exist yet
|
||
|
||
def check_dir(path):
    """Make sure the directory ``path`` exists, creating parents as needed.

    ``exist_ok=True`` replaces the former "check, then create" sequence, so
    the call no longer fails if the directory appears between the check and
    the creation (e.g. when several collectors start at the same time).

    Args:
        path: Directory to create if it is missing.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)
|
||
|
||
|
||
# Logging
# Critical errors are reported with print statements in the terminal output.
# Other information, e.g. the number of collected posts, is written to
# 'mastodata_hashtag.log'. Most log entries are printed as well, so they are
# visible when the script is executed from a terminal.
logging.basicConfig(
    filename='mastodata_hashtag.log',
    level=logging.INFO,
    format='%(asctime)s %(message)s',
)
|
||
|
||
|
||
#######################################################################
############ Using fedidb to get a list of Mastodon domains     #######
# This step is skipped if you wish to collect data only from    #######
# specific instances.                                           #######
#######################################################################
# Collecting domains from fedidb and extracting Mastodon        #######
# domains is done in two steps. First query_fedidb collects     #######
# the domains of registered instances with fedidb's API.        #######
# More info here: https://fedidb.org/docs/api/v1                #######
# Domains are saved in 'fediverse'.                             #######
# Afterwards 'get_instances' keeps only the Mastodon domains.   #######
# Domains are saved in 'instances'.                             #######
#######################################################################
|
||
|
||
######### Getting fedidb instances#####################################
|
||
|
||
def query_fedidb(url, params=None):
    """Collect the server records of all pages of the fedidb API.

    Args:
        url: Address of the first page to request.
        params: Optional query parameters for the request (e.g.
            ``{'limit': 40}``). They are only sent while the URL does not
            already contain ``limit=``; follow-up pages are requested
            through the 'next' link returned in the response.

    Returns:
        list: The concatenated ``'data'`` entries of every page retrieved.
        Paging stops at the first response that is not HTTP 200; whatever
        was collected up to that point is still returned (previously such a
        response raised ``KeyError`` and all collected records were lost).
    """
    fediverse = []
    while url is not None:
        if 'limit=' not in url:
            response = requests.get(url, params=params)
        else:
            response = requests.get(url)

        if response.status_code != 200:
            message = (f'fedidb request to {url} returned status '
                       f'{response.status_code}; stopping pagination.')
            print(message)
            logging.warning(message)
            break

        data_raw = response.json()
        fediverse.extend(data_raw.get('data', []))

        # Continue with the link to the next page; a missing or empty
        # 'next' entry ends the loop.
        url = (data_raw.get('links') or {}).get('next')
    return fediverse
|
||
|
||
######## Extracting Mastodon domains and writing them to 'instances' #####
|
||
|
||
def get_instances(url, params=None):
    """Return the domains of the servers in fedidb that run ``keep``.

    The records returned by ``query_fedidb`` are flattened into a DataFrame
    and filtered on the 'software.name' column using the module-level
    ``keep`` value.

    Args:
        url: fedidb API address handed to ``query_fedidb``.
        params: Optional query parameters handed to ``query_fedidb``.

    Returns:
        list: Values of the 'domain' column for the matching servers. An
        empty list is returned when fedidb delivered no records or the
        expected columns are missing (this used to raise ``KeyError``).
    """
    fediverse = query_fedidb(url, params=params)
    if not fediverse:
        return []

    fediverse_df = pd.json_normalize(fediverse)
    if not {'software.name', 'domain'}.issubset(fediverse_df.columns):
        return []

    mastodon_df = fediverse_df[fediverse_df['software.name'] == keep]
    return mastodon_df['domain'].tolist()
|
||
|
||
##########################################################################
|
||
################Getting old hashtag timeline##############################
|
||
##########################################################################
|
||
def get_old_timeline_hashtag(instance, hashtags, outpath_base, since=None, limit=40):
    """Collect posts for the given hashtags from ``instance`` back to ``since``.

    For every hashtag the hashtag timeline is paged backwards (newest first,
    using ``max_id``) until a page ends with a post older than ``since`` or
    no more posts are returned. Posts created at or after ``since`` are
    written to one JSON file per hashtag inside ``outpath_base``.

    Args:
        instance: Domain of the Mastodon instance, e.g. 'mastodon.social'.
        hashtags: A single hashtag or a list of hashtags. A leading '#' is
            removed, because Mastodon.py's ``timeline_hashtag`` expects the
            tag without it.
        outpath_base: Directory for the output files; created if missing.
        since: Start of the collection window, as a datetime or a string
            understood by ``dateutil.parser``. Defaults to 24 hours before
            the call. A value without time zone is interpreted as UTC.
        limit: Page size passed to ``timeline_hashtag``.

    Returns:
        None. Invalid ``hashtags`` or ``since`` types are reported with a
        print statement and end the function early.
    """
    # Function-scope import: the file only imports datetime and timedelta.
    from datetime import timezone

    # Accept a single hashtag as well as a list of hashtags.
    if isinstance(hashtags, str):
        hashtags = [hashtags]
    elif not isinstance(hashtags, list):
        print('Hashtags must be a string or a list of strings')
        return

    # Resolve the starting point. The default is computed per call; a
    # default expression in the signature would be frozen at import time.
    if since is None:
        since = datetime.now(timezone.utc) - timedelta(days=1)
    elif isinstance(since, str):
        since = parser.parse(since)
    elif not isinstance(since, datetime):
        print('Since must be a string or a datetime object')
        return
    if since.tzinfo is None:
        # 'created_at' values are assumed to be timezone-aware (confirm
        # against Mastodon.py); a naive value cannot be compared with them.
        since = since.replace(tzinfo=timezone.utc)

    ### params #############################
    app_name = f'mastodata-{instance}'
    api_base_url = f'https://{instance}'

    secrets_path = 'secrets'
    check_dir(secrets_path)
    app_secret_file = os.path.join(secrets_path, f'mastodata_{instance}.secret')

    # The app credentials are stored in the folder "secrets", one file per
    # instance. Registration only has to happen once, so an existing file is
    # reused instead of registering a new app on every run.
    if not os.path.isfile(app_secret_file):
        Mastodon.create_app(
            app_name,
            api_base_url=api_base_url,
            to_file=app_secret_file,
        )

    mastodon = Mastodon(client_id=app_secret_file)

    now = pd.Timestamp('now', tz='utc')

    outpath = outpath_base
    check_dir(outpath)

    for hashtag in hashtags:
        tag = hashtag.lstrip('#')

        # Start with an empty list for each hashtag; otherwise posts of the
        # previous hashtags end up in this hashtag's count and file.
        toots = []
        max_id = None
        while True:
            page = mastodon.timeline_hashtag(tag, limit=limit, max_id=max_id)
            if not page:
                break
            toots.extend(page)
            max_id = page[-1]['id']
            # Stop once the last post of the page lies before the window.
            if page[-1]['created_at'] < since:
                break

        # The last page may reach past the window; drop the older posts.
        toots = [toot for toot in toots if toot['created_at'] >= since]

        message = (f'{datetime.now()}: Number of toots retrieved from '
                   f'{instance} for #{tag}: {len(toots)} ')
        logging.info(message)
        print(message)

        if toots:
            outfile = os.path.join(
                outpath,
                f'{tag}_{instance}_start_date_{since}_end_date_{now}.json',
            )
            with open(outfile, 'w', encoding='utf-8') as of:
                # default=str turns datetimes and other non-JSON values
                # into their string representation.
                json.dump(toots, of, indent=4, default=str)
|
||
|
||
#########################################################################
|
||
#################### Getting hourly hashtag timeline ####################
|
||
#########################################################################
|
||
|
||
def get_hourly_timeline_hashtag(instance, hashtags, outpath_base, limit=40):
    """Collect the posts of the last full hour for the given hashtags.

    The window starts one hour before the current hour (UTC, minutes and
    seconds set to zero). For every hashtag the hashtag timeline is paged
    backwards using ``max_id`` until a page ends with a post older than the
    window start or no more posts are returned. Posts created at or after
    the window start are written to one JSON file per hashtag.

    Args:
        instance: Domain of the Mastodon instance, e.g. 'mastodon.social'.
        hashtags: A single hashtag or a list of hashtags. A leading '#' is
            removed, because Mastodon.py's ``timeline_hashtag`` expects the
            tag without it.
        outpath_base: Directory for the output files; created if missing.
        limit: Page size passed to ``timeline_hashtag``.

    Returns:
        None. An invalid ``hashtags`` type is reported with a print
        statement and ends the function early.
    """
    # Check whether hashtags is a string or a list.
    if isinstance(hashtags, str):
        hashtags = [hashtags]
    elif not isinstance(hashtags, list):
        print('Hashtags must be a string or a list of strings')
        return

    ### params #############################
    app_name = f'mastodata-{instance}'
    api_base_url = f'https://{instance}'

    secrets_path = 'secrets'
    check_dir(secrets_path)
    app_secret_file = os.path.join(secrets_path, f'mastodata_{instance}.secret')

    # Registration only has to happen once per instance, so an existing
    # secret file is reused instead of registering a new app on every run
    # (this function is meant to be called hourly).
    if not os.path.isfile(app_secret_file):
        Mastodon.create_app(
            app_name,
            api_base_url=api_base_url,
            to_file=app_secret_file,
        )

    mastodon = Mastodon(client_id=app_secret_file)

    # Current hour in UTC and the start of the hour before it.
    now = pd.Timestamp('now', tz='utc')
    now = now.replace(minute=0, second=0, microsecond=0)
    since = now - pd.DateOffset(hours=1)
    # NOTE(review): only the lower bound is applied below, so posts created
    # after `now` (the running hour) are included as well - confirm whether
    # an upper bound is wanted.

    outpath = outpath_base
    check_dir(outpath)

    for hashtag in hashtags:
        tag = hashtag.lstrip('#')

        # Start with an empty list for each hashtag; otherwise posts of the
        # previous hashtags end up in this hashtag's count and file.
        toots = []
        max_id = None
        while True:
            page = mastodon.timeline_hashtag(tag, limit=limit, max_id=max_id)
            if not page:
                break
            toots.extend(page)
            max_id = page[-1]['id']
            # Stop once the last post of the page lies before the window.
            if page[-1]['created_at'] < since:
                break

        # The last page may reach past the window; drop the older posts.
        toots = [toot for toot in toots if toot['created_at'] >= since]

        # Logging amount of retrieved toots per instance
        message = (f'{datetime.now()}: Number of toots retrieved from '
                   f'{instance} for #{tag}: {len(toots)} ')
        logging.info(message)
        print(message)

        if toots:
            outfile = os.path.join(outpath, f'{tag}_{instance}_{now}.json')
            with open(outfile, 'w', encoding='utf-8') as of:
                # default=str turns datetimes and other non-JSON values
                # into their string representation.
                json.dump(toots, of, indent=4, default=str)
|
||
|
||
|
||
def runner(instances):
    """Run the configured collection method for every instance.

    Reads the module-level settings ``retrieve_old_timeline_hashtag``,
    ``retrieve_hourly_timeline_hashtag``, ``hashtags``, ``since``,
    ``fedidb_url`` and ``params``. If both method switches are set, the
    retrieval of older posts is used.

    Args:
        instances: List of instance domains. If it is empty, the list is
            retrieved from fedidb via ``get_instances``.
    """
    ##### Get Mastodon domains from fedidb if none were given
    if not instances:
        print('Query fedidb')
        instances = get_instances(fedidb_url, params=params)

    # The output directory only depends on the hashtags, so the label is
    # built once instead of once per instance.
    hashtag_label = hashtags if isinstance(hashtags, str) else ", ".join(hashtags)

    ##### Retrieval of older posts
    if retrieve_old_timeline_hashtag:
        outpath_base = os.path.join('data', f'hashtag search – {hashtag_label}')
        for instance in instances:
            # Some instances block data collection via generic API access.
            # To prevent the entire collection from crashing "try" is used.
            # Only Exception is caught, so Ctrl-C still stops the run.
            try:
                get_old_timeline_hashtag(instance, hashtags, outpath_base=outpath_base, since=since, limit=40)
            except Exception:
                message = f'Error retrieving data from instance: {instance}. Check if instance requires authentication.'
                print(message)
                # logging.exception adds the traceback, so the real cause
                # can be looked up in the log file.
                logging.exception(message)

    ##### Hourly collection
    elif retrieve_hourly_timeline_hashtag:
        outpath_base = os.path.join('data', f'hashtag hourly - {hashtag_label}')
        for instance in instances:
            try:
                get_hourly_timeline_hashtag(instance, hashtags, outpath_base=outpath_base)
            except Exception:
                message = f'Error retrieving data from instance: {instance}. Check if instance requires authentication.'
                print(message)
                logging.exception(message)

    else:
        print('Please set one of the timeline retrievals to "True" in the parameters at the very top.')
|
||
|
||
# Invokes every other function
|
||
# Runs at module level (there is no __main__ guard), so executing or
# importing this file starts a collection with the parameters set above.
runner(instances)
|
||
|