diff --git a/Mastodata_hashtag.py b/Mastodata_hashtag.py
new file mode 100644
index 0000000..e88a314
--- /dev/null
+++ b/Mastodata_hashtag.py
@@ -0,0 +1,335 @@
+###################################################
+#___  ___          _            _       _         #
+#|  \/  |         | |          | |     | |        #
+#| .  . | __ _ ___| |_ ___   __| | __ _| |_ __ _  #
+#| |\/| |/ _` / __| __/ _ \ / _` |/ _` | __/ _` | #
+#| |  | | (_| \__ \ || (_) | (_| | (_| | || (_| | #
+#\_|  |_/\__,_|___/\__\___/ \__,_|\__,_|\__\__,_| #
+###################################################
+##############################################################################
+# This script retrieves posts from the hashtag timeline of Mastodon. Two    #
+# different methods of data collection are possible. Both methods require   #
+# you to enter one or more hashtags; please insert them below in the        #
+# params. Each hashtag has to be in quotes and separated by commas.         #
+# You can also decide to collect data from one or multiple instances.       #
+# Keep in mind: if you do not enter any instance name, every instance       #
+# registered on https://fedidb.org/ will be queried!                        #
+##############################################################################
+# 1. Hourly requests:                                                       #
+# This method is intended to be used by automating the script in a cron    #
+# job. Data is saved in this directory pattern:                             #
+# 'WHERE_YOU_EXECUTE_THE_SCRIPT/data/hashtag hourly - YOUR_HASHTAGS'        #
+# Data is saved in json files, one for each hour. The filename contains    #
+# the name of the instance from where it originates.                       #
+# Set retrieve_hourly_timeline_hashtag to True to use this method.         #
+##############################################################################
+# 2. Requesting older posts:                                                #
+# Here, the collection is NOT automated. For this method a starting point  #
+# is defined in the params below. Follow the pattern of the example date.  #
+# Data is saved in this directory pattern:                                  #
+# 'WHERE_YOU_EXECUTE_THE_SCRIPT/data/hashtag search – YOUR_HASHTAGS'        #
+# Set retrieve_old_timeline_hashtag to True to use this method.             #
+##############################################################################
+# The script uses the Mastodon.py library to interact with the Mastodon    #
+# API: https://pypi.org/project/Mastodon.py/                                #
+# For instructions on using the Mastodon.py library see:                    #
+# https://mastodonpy.readthedocs.io/en/stable/                              #
+##############################################################################
+
+
+# Imported modules
+# https://pypi.org/project/Mastodon.py/
+# Based on: https://mastodonpy.readthedocs.io/en/stable/
+import os
+import json
+import logging
+import requests
+import pandas as pd
+import timeit
+import concurrent.futures
+import pytz
+from datetime import datetime, timedelta
+from dateutil import parser
+from mastodon import Mastodon
+
+################### Params for execution #####################################
+
+# Set the method you want to use to "True". Remember, please use the methods
+# separately, not simultaneously.
+retrieve_hourly_timeline_hashtag = False
+retrieve_old_timeline_hashtag = True
+
+### Insert hashtags, according to the example given. ##
+### Works with one as well as multiple hashtags. ######
+
+hashtags = ['#NAFO']
+
+# Please enter the domains of the desired Mastodon instances. All instances
+# must be inside the brackets and each of them in quotes.
+# If left empty, all instances from fedidb are retrieved and queried (~2300).
+instances = []
+
+### Params for the retrieval of older posts.
+utc = pytz.UTC
+# Insert the starting point for requesting older posts.
+start_date = '2024-06-20'
+since = parser.parse(start_date)
+since = utc.localize(since)
+
+###############################################################################
+
+#### Params for retrieval of instance list via fedidb API #####################
+
+keep = 'Mastodon'
+fedidb_url = 'https://api.fedidb.org/v1/servers/'
+
+limit = 40
+params = {
+    'limit': limit
+}
+
+###############################################################################
+### Check whether a directory exists and create it if it does not
+
+def check_dir(path):
+    if not os.path.isdir(path):
+        os.makedirs(path)
+
+
+# Logging
+# This script uses print statements in the terminal output to inform you if a
+# critical error occurs. Other information, e.g. the number of collected
+# posts, is saved to 'mastodata_hashtag.log'.
+# Most log messages are also printed, so you can see them if you execute the
+# script via the terminal.
+logging.basicConfig(filename='mastodata_hashtag.log', level=logging.INFO, format='%(asctime)s %(message)s')
+
+
+#######################################################################
+############ Using fedidb to get a list of Mastodon domains ##########
+# This step is skipped if you wish to collect data only from   #######
+# specific instances.                                          #######
+#######################################################################
+# Collecting domains from fedidb and extracting Mastodon       #######
+# domains is done in two steps. First, query_fedidb collects   #######
+# the domains of registered instances with fedidb's API.       #######
+# More info here: https://fedidb.org/docs/api/v1               #######
+# Domains are saved in 'fediverse'.                            #######
+# Afterwards, 'get_instances' extracts only the Mastodon       #######
+# domains. Domains are saved in 'instances'.                   #######
+#######################################################################
+
+######### Getting fedidb instances ####################################
+
+def query_fedidb(url, params=dict()):
+    fediverse = []
+    more = True
+    while more:
+
+        # fedidb's 'next' links already contain the limit, so the params
+        # are only needed for the initial request.
+        if 'limit=' not in url:
+            response = requests.get(url, params=params)
+        else:
+            response = requests.get(url)
+
+        if response.status_code == 200:
+            data_raw = response.json()
+        else:
+            data_raw = dict()
+
+        # Overwrite the initial request link with the new link
+        if 'links' in data_raw and 'next' in data_raw['links']:
+            url = data_raw['links']['next']
+        else:
+            more = False
+
+        if url is None:
+            more = False
+
+        # .get() guards against failed requests that return no 'data' key
+        fediverse.extend(data_raw.get('data', []))
+    return fediverse
+
+######## Extracting Mastodon domains and writing them to 'instances' #####
+
+def get_instances(url, params=dict()):
+    fediverse = query_fedidb(url, params=params)
+    fediverse_df = pd.json_normalize(fediverse)
+    mastodon_df = fediverse_df[fediverse_df['software.name'] == keep]
+    return mastodon_df['domain'].tolist()
+
+##########################################################################
+################ Getting old hashtag timeline ############################
+##########################################################################
+def get_old_timeline_hashtag(instance, hashtags, outpath_base, since=datetime.now()-timedelta(days=1), limit=40):
+
+    ### params #############################
+    app_name = f'mastodata-{instance}'
+    api_base_url = f'https://{instance}'
+
+    secrets_path = 'secrets'
+    check_dir(secrets_path)
+    app_secret_file = os.path.join(secrets_path, f'mastodata_{instance}.secret')
+
+    # Registering the app and generating the access secret, saved in the
+    # folder "secrets", for each instance.
+    Mastodon.create_app(
+        app_name,
+        api_base_url = api_base_url,
+        to_file = app_secret_file
+    )
+
+    mastodon = Mastodon(client_id = app_secret_file)
+
+    # Checking if "hashtags" is a string or a list
+    if isinstance(hashtags, str):
+        hashtags = [hashtags]
+    elif not isinstance(hashtags, list):
+        print('Hashtags must be a string or a list of strings')
+        return
+
+    now = pd.Timestamp('now', tz='utc')
+
+    # Handing over your date choice
+    if isinstance(since, str):
+        since = parser.parse(since)
+    elif not isinstance(since, datetime):
+        print('Since must be a string or a datetime object')
+        return
+
+    # Collecting toots and writing them to 'toots'
+    for hashtag in hashtags:
+
+        # Reset the collection per hashtag, so every output file only
+        # contains toots for its own hashtag.
+        toots = []
+        get_more = True
+        max_id = None
+        outpath = os.path.join(outpath_base)
+        check_dir(outpath)
+        while get_more:
+            tmp = mastodon.timeline_hashtag(hashtag, limit=limit, max_id=max_id)
+            if len(tmp) > 0:
+                toots += tmp
+                max_id = tmp[-1]['id']
+                if tmp[-1]['created_at'] < since:
+                    get_more = False
+            else:
+                get_more = False
+
+        toots = [toot for toot in toots if toot['created_at'] >= since]
+
+        logging.info(f'{datetime.now()}: Number of toots retrieved from {instance} for #{hashtag}: {len(toots)}')
+        print(f'{datetime.now()}: Number of toots retrieved from {instance} for #{hashtag}: {len(toots)}')
+        if len(toots) > 0:
+            outfile = os.path.join(outpath, f'{hashtag}_{instance}_start_date_{since}_end_date_{now}.json')
+
+            with open(outfile, 'w') as of:
+                txt = json.dumps(toots, indent=4, default=str)
+                of.write(txt)
+
+#########################################################################
+#################### Getting hourly hashtag timeline ####################
+#########################################################################
+
+def get_hourly_timeline_hashtag(instance, hashtags, outpath_base, limit=40):
+
+    ### params #############################
+    app_name = f'mastodata-{instance}'
+    api_base_url = f'https://{instance}'
+
+    secrets_path = 'secrets'
+    check_dir(secrets_path)
+    app_secret_file = os.path.join(secrets_path, f'mastodata_{instance}.secret')
+
+    Mastodon.create_app(
+        app_name,
+        api_base_url = api_base_url,
+        to_file = app_secret_file
+    )
+
+    mastodon = Mastodon(client_id = app_secret_file)
+
+    # Checking if "hashtags" is a string or a list
+    if isinstance(hashtags, str):
+        hashtags = [hashtags]
+    elif not isinstance(hashtags, list):
+        print('Hashtags must be a string or a list of strings')
+        return
+
+    # Collect only "whole" hours: from the start of the previous hour to
+    # the start of the current hour.
+    now = pd.Timestamp('now', tz='utc')
+    now = now.replace(minute=0, second=0, microsecond=0)
+    timeoffset = pd.DateOffset(hours=1)
+    since = now - timeoffset
+
+    for hashtag in hashtags:
+
+        toots = []
+        get_more = True
+        max_id = None
+        outpath = os.path.join(outpath_base)
+        check_dir(outpath)
+        while get_more:
+            tmp = mastodon.timeline_hashtag(hashtag, limit=limit, max_id=max_id)
+            if len(tmp) > 0:
+                toots += tmp
+                max_id = tmp[-1]['id']
+                if tmp[-1]['created_at'] < since:
+                    get_more = False
+            else:
+                get_more = False
+
+        toots = [toot for toot in toots if toot['created_at'] >= since]
+        # Logging the number of retrieved toots per instance
+        logging.info(f'{datetime.now()}: Number of toots retrieved from {instance} for #{hashtag}: {len(toots)}')
+        print(f'{datetime.now()}: Number of toots retrieved from {instance} for #{hashtag}: {len(toots)}')
+        if len(toots) > 0:
+            outfile = os.path.join(outpath, f'{hashtag}_{instance}_{now}.json')
+
+            with open(outfile, 'w') as of:
+                txt = json.dumps(toots, indent=4, default=str)
+                of.write(txt)
+
+
+def runner(instances):
+
+    ##### Get Mastodon domains for 'instances' ##############
+    if len(instances) == 0:
+        print('Query fedidb')
+        instances = get_instances(fedidb_url, params=params)
+
+    ##### if-condition for the retrieval of older posts
+    if retrieve_old_timeline_hashtag == True:
+        for instance in instances:
+            outpath_base = os.path.join('data', f'hashtag search – {hashtags if isinstance(hashtags, str) else ", ".join(hashtags)}')
+            # Some instances block data collection via generic API access.
+            # To prevent the entire collection from crashing, "try" is used.
+            try:
+                get_old_timeline_hashtag(instance, hashtags, outpath_base=outpath_base, since=since, limit=40)
+            except Exception:
+                print(f'Error retrieving data from instance: {instance}. Check if instance requires authentication.')
+                logging.error(f'Error retrieving data from instance: {instance}. Check if instance requires authentication.')
+
+    ##### if-condition for hourly collection
+    elif retrieve_hourly_timeline_hashtag == True:
+        for instance in instances:
+            outpath_base = os.path.join('data', f'hashtag hourly - {hashtags if isinstance(hashtags, str) else ", ".join(hashtags)}')
+            try:
+                get_hourly_timeline_hashtag(instance, hashtags, outpath_base=outpath_base)
+            except Exception:
+                print(f'Error retrieving data from instance: {instance}. Check if instance requires authentication.')
+                logging.error(f'Error retrieving data from instance: {instance}. Check if instance requires authentication.')
+
+    else:
+        print('Please set one of the timeline retrievals to "True" in the parameters at the very top.')
+
+# Invokes every other function
+runner(instances)
+
diff --git a/Mastodata_local+public.py b/Mastodata_local+public.py
new file mode 100644
index 0000000..3da71cb
--- /dev/null
+++ b/Mastodata_local+public.py
@@ -0,0 +1,198 @@
+###################################################
+#___  ___          _            _       _         #
+#|  \/  |         | |          | |     | |        #
+#| .  . | __ _ ___| |_ ___   __| | __ _| |_ __ _  #
+#| |\/| |/ _` / __| __/ _ \ / _` |/ _` | __/ _` | #
+#| |  | | (_| \__ \ || (_) | (_| | (_| | || (_| | #
+#\_|  |_/\__,_|___/\__\___/ \__,_|\__,_|\__\__,_| #
+###################################################
+
+
+# This script retrieves toots from Mastodon instances and saves them as json files. It is intended to be run as a cron job.
+
+# Currently it retrieves the public and local timelines from the last hour.
+
+# The Mastodon instances to query are defined in the variable 'instances'.
+
+# The data is saved in the 'data' directory in a subdirectory named after the instance.
+# The data is saved in json files, one for each hour, in subdirectories 'timeline_public' and 'timeline_local'.
+
+# The script uses the Mastodon.py library to interact with the Mastodon API: https://pypi.org/project/Mastodon.py/
+# For instructions on using the Mastodon.py library see: https://mastodonpy.readthedocs.io/en/stable/
+
+
+################### Mastodon instances to query ################################
+# Depending on the size of the queried instances, do a test run to see if the  #
+# script can collect all the requested data within an hour; otherwise an       #
+# hourly automation might run into trouble.                                    #
+################################################################################
+
+instances = []
+
+################### Params for data retrieval ##################################
+
+retrieve_timeline_public = True
+retrieve_timeline_local = True
+
+################### Path for log file ##########################################
+# Please enter the full (not relative) path where the log file should be       #
+# saved, including the desired file name itself.                               #
+################################################################################
+
+log_file='mastodata_local_public.log'
+
+
+
+import os
+import json
+import requests
+import logging
+import pandas as pd
+from datetime import datetime
+from mastodon import Mastodon
+
+def check_dir(path):
+    if not os.path.isdir(path):
+        os.makedirs(path)
+
+
+################# Logging level ###############################################
+logging.basicConfig(filename=log_file, level=logging.INFO, format='%(asctime)s %(message)s')
+
+
+################# Hourly Timeline Collection ##################################
+################# Fetching local timeline #####################################
+
+def get_timeline_local(mastodon, outpath_base, instance, hours=1):
+
+    toots = []
+    now = pd.Timestamp('now', tz='utc')
+    # Set 'now' to the full hour to collect only "whole" hours
+    now = now.replace(minute=0, second=0, microsecond=0)
+    timeoffset = pd.DateOffset(hours=hours)
+    since = now - timeoffset
+
+    get_more = True
+    max_id = None
+    outpath = os.path.join(outpath_base, 'timeline_local')
+    check_dir(outpath)
+    while get_more:
+        # Call the matching API method; 40 posts per request is the current max. limit
+        tmp = mastodon.timeline_local(limit=40, max_id=max_id)
+        if len(tmp) > 0:
+            toots += tmp
+            max_id = tmp[-1]['id']
+            if tmp[-1]['created_at'] < since:
+                get_more = False
+        else:
+            get_more = False
+
+    toots = [toot for toot in toots if toot['created_at'] >= since]
+    toots = [toot for toot in toots if toot['created_at'] <= now]
+    # Logging the number of retrieved toots
+    logging.info(f'{datetime.now()}: Number of toots retrieved from timeline_local for {instance}: {len(toots)}.')
+    if len(toots) > 0:
+        outfile = os.path.join(outpath, f'{now}.json')
+        # Write the collected toots to json files
+        with open(outfile, 'w') as of:
+            txt = json.dumps(toots, indent=4, default=str)
+            of.write(txt)
+
+
+################# Fetching public timeline #####################################
+
+def get_timeline_public(mastodon, outpath_base, instance, hours=1):
+    toots = []
+    now = pd.Timestamp('now', tz='utc')
+    now = now.replace(minute=0, second=0, microsecond=0)
+    timeoffset = pd.DateOffset(hours=hours)
+    since = now - timeoffset
+
+    get_more = True
+    max_id = None
+    outpath = os.path.join(outpath_base, 'timeline_public')
+    check_dir(outpath)
+    while get_more:
+        tmp = mastodon.timeline_public(limit=40, max_id=max_id)
+        if len(tmp) > 0:
+            toots += tmp
+            max_id = tmp[-1]['id']
+            if tmp[-1]['created_at'] < since:
+                get_more = False
+        else:
+            get_more = False
+
+    toots = [toot for toot in toots if toot['created_at'] >= since]
+    toots = [toot for toot in toots if toot['created_at'] <= now]
+
+    logging.info(f'{datetime.now()}: Number of toots retrieved from timeline_public for {instance}: {len(toots)}.')
+    if len(toots) > 0:
+        outfile = os.path.join(outpath, f'{now}.json')
+
+        with open(outfile, 'w') as of:
+            txt = json.dumps(toots, indent=4, default=str)
+            of.write(txt)
+
+
+#### The function 'query_instance' starts the collection of timelines
+
+def query_instance(instances, retrieve_timeline_public=retrieve_timeline_public, retrieve_timeline_local=retrieve_timeline_local):
+
+    #### Execute the collection for every instance given in the params ######
+    for instance in instances:
+
+        ### Generate secrets
+        ### params #############################
+        app_name = f'mastodata-{instance}'
+        api_base_url = f'https://{instance}'
+
+        # TODO: move this path declaration into the if-conditions below,
+        # since the paths to the timeline data differ
+        outpath_base = os.path.join('data', f'{instance}')
+        #######################################
+
+        secrets_path = 'secrets'
+        check_dir(secrets_path)
+        app_secret_file = os.path.join(secrets_path, f'mastodata_{instance}.secret')
+
+        Mastodon.create_app(
+            app_name,
+            api_base_url = api_base_url,
+            to_file = app_secret_file
+        )
+
+        mastodon = Mastodon(client_id = app_secret_file)
+        # A test is needed for whether a Mastodon instance requires
+        # authentication. As a workaround, try/except is used below.
+
+        # TODO: implement the if/else scheme from Mastodata_hashtag here as well
+
+        try:
+            if retrieve_timeline_public:
+                print(f' {datetime.now()}: Started retrieving public timeline from {instance}.') # later to be realized as log entry
+                get_timeline_public(mastodon, outpath_base, instance)
+            if retrieve_timeline_local:
+                print(f' {datetime.now()}: Started retrieving local timeline from {instance}.') # later to be realized as log entry
+                get_timeline_local(mastodon, outpath_base, instance)
+
+        except Exception:
+            logging.error(f'Error retrieving data from instance: {instance}. Check if instance requires authentication.')
+
+
+####### Fetch data for the specified instances from the previous hour #########
+
+query_instance(instances, retrieve_timeline_public=retrieve_timeline_public, retrieve_timeline_local=retrieve_timeline_local)
+
diff --git a/README.md b/README.md
index 518dd47..d709c42 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,42 @@
-# Mastodata
\ No newline at end of file
+Mastodata is a tool set to collect timeline data from Mastodon instances and to display and analyze the collected data. Data collection is based on the Mastodon API and its Python wrapper (Mastodon.py).
+
+1) Data collection:
+Mastodata_hashtag.py and Mastodata_local+public.py are used to collect data and save it in json files. While the time frame of the collected data is flexible for Mastodata_hashtag.py, Mastodata_local+public.py needs to be automated. In test runs, requests were conducted on an hourly basis, automated by cron (see the example below).
+
+2) Data display and analysis:
+The analysis notebook files visualize data in different ways. analysis_notebook_hashtag.ipynb and analysis_notebook_local+public.ipynb visualize the data of their counterparts from data collection; only the path of the folder containing the json files needs to be provided.
+analysis_notebook_fedidb.ipynb, however, does not display saved files. It requests all self-registered servers/instances listed on 'fedidb.org'. This also includes non-Mastodon sites, e.g. those using Pleroma, Lemmy, etc. It was mainly used to extract Mastodon domains but can also be modified to filter for domains of any social media type on the Fediverse.
+
+Files:
+
+analysis_notebook_fedidb.ipynb
+
+analysis_notebook_hashtag.ipynb
+
+analysis_notebook_local+public.ipynb
+
+Mastodata_hashtag.py
+
+Mastodata_local+public.py
+
+Installation:
+After installing the requirements from the requirements.txt file, the Mastodata files work "right out of the box". Mastodata was only tested on Python 3.10.14 and above.
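+
+For example, a minimal installation and test run could look like this (a sketch, assuming pip is available; adjust to your environment):
+
+    pip install -r requirements.txt
+    python Mastodata_hashtag.py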
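+
+For the hourly collections, a crontab entry along these lines can be used (a sketch only; the interpreter and repository paths are placeholders for your setup):
+
+    # Run at the start of every hour (edit with 'crontab -e')
+    0 * * * * /usr/bin/python3 /path/to/Mastodata/Mastodata_local+public.py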
+
+Usage:
+
+A detailed description can be found within the first section of each file.
+
+
+Logs:
+
+Logs are only created for Mastodata_local+public.py (mastodata_local_public.log) and Mastodata_hashtag.py (mastodata_hashtag.log).
+For Mastodata_local+public.py the log contains the number of collected posts for each specified instance and timeline (local and public), as well as the time of the code's execution.
+The Mastodata_hashtag.py log contains the same information. However, the more instances posts get collected from, the larger the log gets. Keep that in mind when doing Mastodon-wide requests.
+Error messages contain the name of the instance where the request failed. This usually happens when an instance has its public API access disabled.
+
diff --git a/analysis_notebook_fedidb.ipynb b/analysis_notebook_fedidb.ipynb
new file mode 100644
index 0000000..682acad
--- /dev/null
+++ b/analysis_notebook_fedidb.ipynb
@@ -0,0 +1,203 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "506cc9c0-cb5d-4db9-b37d-e487947f9e1b",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "###################################################\n",
+    "#___  ___          _            _       _         #\n",
+    "#|  \\/  |         | |          | |     | |        #\n",
+    "#| .  . | __ _ ___| |_ ___   __| | __ _| |_ __ _  #\n",
+    "#| |\\/| |/ _` / __| __/ _ \\ / _` |/ _` | __/ _` | #\n",
+    "#| |  | | (_| \\__ \\ || (_) | (_| | (_| | || (_| | #\n",
+    "#\\_|  |_/\\__,_|___/\\__\\___/ \\__,_|\\__,_|\\__\\__,_| #\n",
+    "###################################################\n",
+    "# Mastodata is divided into data retrieval scripts\n",
+    "# and data loading scripts.\n",
+    "# This script analyzes data hosted by 'fedidb.org', while\n",
+    "# also retrieving it before the analysis.\n",
+    "# Fedidb.org itself already has an analytical section.\n",
+    "# Because of that, the analysis here is very basic.\n",
+    "# For further analysis please visit fedidb.org itself.\n",
+    "\n",
+    "\n",
+    "#### Retrieve all fediverse instances listed in fedidb and filter for Mastodon domains ######\n",
+    "\n",
+    "import requests\n",
+    "import json\n",
+    "import pandas as pd\n",
+    "\n",
+    "keep = 'Mastodon'\n",
+    "url = 'https://api.fedidb.org/v1/servers/'\n",
+    "\n",
+    "limit = 40\n",
+    "params = {\n",
+    "    'limit': limit\n",
+    "}\n",
+    "\n",
+    "\n",
+    "def query_fedidb(url, params=dict()):\n",
+    "    fediverse = []\n",
+    "    more = True\n",
+    "    while more:\n",
+    "\n",
+    "        if 'limit=' not in url:\n",
+    "            response = requests.get(url, params=params)\n",
+    "        else:\n",
+    "            response = requests.get(url)\n",
+    "\n",
+    "        if response.status_code == 200:\n",
+    "            data_raw = response.json()\n",
+    "        else:\n",
+    "            data_raw = dict()\n",
+    "\n",
+    "        #### Overwrite the initial link with the new request link ###\n",
+    "        if 'links' in data_raw and 'next' in data_raw['links']:\n",
+    "            url = (data_raw['links']['next'])\n",
+    "        else:\n",
+    "            more = False\n",
+    "\n",
+    "        if url is None:\n",
+    "            print('NONE')\n",
+    "            more = False\n",
+    "\n",
+    "        # .get() guards against failed requests without a 'data' key\n",
+    "        fediverse.extend(data_raw.get('data', []))\n",
+    "    return fediverse\n",
+    "\n",
+    "fediverse = query_fedidb(url, params=params)\n",
+    "fediverse_df = pd.json_normalize(fediverse)\n",
+    "### Rename the dotted column names to avoid problems later on\n",
+    "fediverse_df.rename(columns = {'location.city':'location_city', 'location.country':'location_country',\n",
+    "                               'software.id':'software_id', 'software.name':'software_name', 'software.url':'software_url', 'software.version':'software_version',\n",
+    "                               'stats.status_count':'status_count', 'stats.user_count':'user_count', 'stats.monthly_active_users':'monthly_active_users'}, inplace=True)\n",
+    "mastodon_df = fediverse_df[fediverse_df['software_name'] == keep]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "263bd2cd-2587-4a40-baff-f5bf1054fd03",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Show categories\n",
+    "fediverse_df.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7486c203-1ef9-452f-b629-abd0889321fe",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# The columns were already renamed on 'fediverse_df' above, so 'mastodon_df'\n",
+    "# inherits the renamed columns and no further renaming is needed here.\n",
+    "mastodon_df.columns"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6bb4122a-5540-40ea-82ba-d5da33e1ac7f",
+   "metadata": {},
+   "source": [
+    "## Analysis\n",
+    "\n",
+    "Quantitative information on users, monthly active users and statuses:\n",
+    " 1.1 Average users\n",
+    " 1.2 Average (monthly) active users\n",
+    " 1.3 Average toots\n",
+    " 1.4 Average toots per user\n",
+    " 1.5 Average toots per monthly active user\n",
+    "\n",
+    "For additional graphs etc., please visit 'fedidb.org'.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "69014130-9237-4fb2-96ab-5f1a83371b95",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Print the stats of the entire Mastodon data frame\n",
+    "print('Fediverse:', len(fediverse_df))\n",
+    "print('Mastodon:', len(mastodon_df))\n",
+    "print('Mastodon stats from FediDB')\n",
+    "print('Average users on Mastodon per instance:', mastodon_df['user_count'].sum()/len(mastodon_df))\n",
+    "print('Average of (monthly) active users on Mastodon per instance:', mastodon_df['monthly_active_users'].sum()/len(mastodon_df))\n",
+    "print('Average toots per instance:', mastodon_df['status_count'].sum() / len(mastodon_df))\n",
+    "print('Average toots per user:', mastodon_df['status_count'].sum() / mastodon_df['user_count'].sum())\n",
+    "print('Average toots per monthly active user (MAU):', mastodon_df['status_count'].sum() / mastodon_df['monthly_active_users'].sum())\n",
+    "print('Instances with open registration:', mastodon_df['open_registration'].sum())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "03a1c204-16c0-4e79-a5b6-fe4ce82b67b2",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# If you want to export one of the data frames as a csv, please uncomment the corresponding command below.\n",
+    "# Exports are saved in the same folder as where this script is executed.\n",
+    "\n",
+    "# Uncomment below to receive a csv-file from 'fediverse_df', containing information on EVERY registered service on fedidb.org.\n",
+    "# Remember to enter the correct path before the filename.\n",
+    "#fediverse_df.to_csv('fediverse_df.csv')\n",
+    "\n",
+    "# Uncomment below to receive a csv-file from 'mastodon_df', containing information ONLY on Mastodon instances.\n",
+    "#mastodon_df.to_csv('mastodon_df.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b6796ed5-e456-4ee7-a6dd-b19953f7c865",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# List Mastodon instances from highest post count to lowest, with a selection of categories.\n",
+    "mastodon_df[['id', 'domain', 'status_count', 'user_count', 'monthly_active_users', 'location_country', 'open_registration']].sort_values(by='status_count', ascending=False).head(50)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/analysis_notebook_hashtag.ipynb b/analysis_notebook_hashtag.ipynb
new file mode 100644
index 0000000..97e058b
--- /dev/null
+++ b/analysis_notebook_hashtag.ipynb
@@ -0,0 +1,265 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9493a46f-12c3-4582-9723-7352e6aaeb95",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "###################################################\n",
+    "#___  ___          _            _       _         #\n",
+    "#|  \\/  |         | |          | |     | |        #\n",
+    "#| .  . | __ _ ___| |_ ___   __| | __ _| |_ __ _  #\n",
+    "#| |\\/| |/ _` / __| __/ _ \\ / _` |/ _` | __/ _` | #\n",
+    "#| |  | | (_| \\__ \\ || (_) | (_| | (_| | || (_| | #\n",
+    "#\\_|  |_/\\__,_|___/\\__\\___/ \\__,_|\\__,_|\\__\\__,_| #\n",
+    "###################################################\n",
+    "# This script loads data from the hashtag timeline.\n",
+    "# Please fill in the correct path where your data\n",
+    "# is stored.\n",
+    "#######################################################\n",
+    "# At the current stage, Mastodata data loaders are meant\n",
+    "# for briefly exploring the collected data, which can\n",
+    "# then be filtered and exported. You can filter by date\n",
+    "# and hashtag, for the public and local timelines.\n",
+    "#######################################################\n",
+    "# This notebook loads data collected from the hashtag\n",
+    "# timeline. Currently there exist 3 collected hashtags:\n",
+    "# #russia, #ukraine and #fediblock.\n",
+    "# Posts were collected from EVERY instance available on\n",
+    "# 'fedidb.org'.\n",
+    "#######################################################\n",
+    "# There are three analytical sections in this notebook.\n",
+    "# 1. Descriptive overview of the material\n",
+    "# 2. Co-Hashtags\n",
+    "# 3. Federation\n",
+    "#######################################################\n",
+    "\n",
+    "\n",
+    "import os\n",
+    "import json\n",
+    "from urllib.parse import urlparse\n",
+    "import pandas as pd\n",
+    "from glob import glob\n",
+    "import matplotlib.pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4c2d95da-e123-45ef-b419-618550bd26a6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ATTENTION: In this environment relative paths don't work with glob.\n",
+    "# Loads a sample of randomly selected servers covering one month for\n",
+    "# one tag, from 12-06-2024 to 12-07-2024.\n",
+    "path = '/var/jupyter-data/jupyter-mastodata/data/hashtag search – russia/*'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "72310631-a1d1-42e1-a5c4-3cc5f3812a1a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Loading data for analysis\n",
+    "# Load data into the variable 'data'\n",
+    "data = []\n",
+    "for fi in glob(path):\n",
+    "    if os.path.isfile(fi) and fi.endswith('.json'):\n",
+    "        with open(fi, 'r') as infile:\n",
+    "            data += json.load(infile)\n",
+    "\n",
+    "# Load 'data' into a data frame\n",
+    "df = pd.json_normalize(data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "975a131c-d7df-4fb3-8b51-749355bdeaed",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Changes 'created_at' to the dtype datetime64[ns, UTC].\n",
+    "# That is necessary for date-specific filtering.\n",
+    "df['created_at'] = pd.to_datetime(df['created_at'], utc=True)\n",
+    "\n",
+    "# Adding the column 'instance' by extracting the domain name (and suffix) from the column 'url' of the post\n",
+    "df['instance'] = df['url'].apply(lambda x: urlparse(x)[1])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6c98352d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Top 25 posters by number of posts\n",
+    "\n",
+    "top_25_users = (df['account.username'].value_counts()).iloc[:25]\n",
+    "top_25_users.plot(kind='bar')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ad8f6e0e-8e64-4d71-8f3e-b02b25517d03",
+   "metadata": {},
+   "source": [
+    "## Stats"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8c41b3b1-6bce-4f6b-8bfb-8a6ebc072776",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "print('Posts in total:', len(df))\n",
+    "print('Number of individual users:', len(pd.unique(df['account.username'])))\n",
+    "print('Average posts per user:', len(df)/len(pd.unique(df['account.username'])))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "34109a10-061a-46ec-a7fd-b1f2ab9a4d8c",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Visualization\n",
+    "# Show the number of toots sorted by (calendar) week\n",
+    "df_dates = df['created_at'].apply(lambda x: pd.to_datetime(x).tz_convert('UTC'))\n",
+    "df_dates.groupby(df_dates.dt.isocalendar().week).count().plot(kind=\"bar\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2c3177c7-9af8-47de-9f84-97c2f930e59a",
+   "metadata": {},
+   "source": [
+    "## Hashtags"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d83dd089-fdd3-43fd-98ac-c2972540c1b4",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Analysis of hashtags\n",
+    "# Create the 'tags' data frame\n",
+    "tags = []\n",
+    "for row in df.iterrows():\n",
+    "    for tag in row[1]['tags']:\n",
+    "        tag['toot_id'] = row[1]['id']\n",
+    "        tag['toot_created_at'] = row[1]['created_at']\n",
+    "        tag['toot_account.id'] = row[1]['account.id']\n",
+    "        tags.append(tag)\n",
+    "\n",
+    "tags = pd.json_normalize(tags)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c7a66a41-6913-4cea-8dbf-489a0a055530",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Co-hashtags sorted by frequency\n",
+    "print(tags.groupby('name').size().sort_values(ascending=False))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5b58f3df-0890-4022-a02e-d849d35f104d",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Show the top 25 co-hashtags as a bar plot\n",
+    "\n",
+    "top_25_tags = (tags['name'].value_counts()).iloc[:25]\n",
+    "top_25_tags.plot(kind='bar')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "39227e98-8e93-415e-8d83-e0f832e0bb13",
+   "metadata": {},
+   "source": [
+    "## Federation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "977fd226-ee9d-4f5c-a102-df513bc8d7fc",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Domains (and suffixes) were extracted from 'url' into 'instance' above.\n",
+    "# Rank them by the number of posts.\n",
+    "print(df.groupby('instance').size().sort_values(ascending=False))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ba8d4f89-e24a-4133-a801-c3fe0104371d",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "top_25_instances = (df['instance'].value_counts()).iloc[:25]\n",
+    "top_25_instances.plot(kind='bar')\n",
+    "\n",
+    "# Mastodon-wide analyses can now become more detailed."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/analysis_notebook_local+public.ipynb b/analysis_notebook_local+public.ipynb
new file mode 100644
index 0000000..0c087d7
--- /dev/null
+++ b/analysis_notebook_local+public.ipynb
@@ -0,0 +1,471 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6ad03636-9c4b-48f1-b85d-6ae8527d9984",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "###################################################\n",
+    "#___  ___          _            _       _         #\n",
+    "#|  \\/  |         | |          | |     | |        #\n",
+    "#| .  . | __ _ ___| |_ ___   __| | __ _| |_ __ _  #\n",
+    "#| |\\/| |/ _` / __| __/ _ \\ / _` |/ _` | __/ _` | #\n",
+    "#| |  | | (_| \\__ \\ || (_) | (_| | (_| | || (_| | #\n",
+    "#\\_|  |_/\\__,_|___/\\__\\___/ \\__,_|\\__,_|\\__\\__,_| #\n",
+    "###################################################\n",
+    "# This script loads data from the public and local timelines.\n",
+    "# Please fill in the correct path where your data\n",
+    "# is stored.\n",
+    "#######################################################\n",
+    "# At the current stage, Mastodata data loaders are meant\n",
+    "# for briefly exploring the collected data, which can\n",
+    "# then be filtered and exported. You can filter by date\n",
+    "# and hashtag, for the public and local timelines.\n",
+    "#######################################################\n",
+    "# There are three analytical sections in this notebook\n",
+    "# for each timeline.\n",
+    "# 1. Descriptive overview of the material\n",
+    "# 2. Co-Hashtags\n",
+    "# 3. Federation\n",
+    "#######################################################\n",
+    "\n",
+    "import os\n",
+    "import json\n",
+    "import pandas as pd\n",
+    "from glob import glob\n",
+    "import matplotlib.pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e8bc49e9-341e-4507-9221-a1bf590921e5",
+   "metadata": {},
+   "source": [
+    "## Data for 'timeline_public'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "de8f017c-3d2c-4c97-a9bc-c586aebeb20c",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Loading data: public data is loaded into the variable 'df_public'.\n",
+    "# The given path must lead directly to the json files from the collection and end with '/*',\n",
+    "# e.g. 'YOUR/PATH/timeline_public/*'.\n",
+    "path_public = ''  # please insert your path here\n",
+    "\n",
+    "data_public = []\n",
+    "for fi in glob(path_public):\n",
+    "    if os.path.isfile(fi) and fi.endswith('.json'):\n",
+    "        with open(fi, 'r') as infile:\n",
+    "            data_public += json.load(infile)\n",
+    "\n",
+    "df_public = pd.json_normalize(data_public)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e64f6782-35a9-406d-9cca-40ac40f35127",
+   "metadata": {},
+   "source": [
+    "## Stats"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2de303b6-25d7-4212-9580-54cbd9c43cfe",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# TODO: add average posts per MAU\n",
+    "print('Posts in total:', len(df_public))\n",
+    "print('Number of individual users:', len(pd.unique(df_public['account.username'])))\n",
+    "print('Average posts per user:', len(df_public)/len(pd.unique(df_public['account.username'])))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9e8e561d-1783-4cd5-accd-1a8f3de6c973",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Show the number of toots sorted by (calendar) week.\n",
+    "# Note: week 23 was not fully archived and is therefore incomplete. Keep that in mind during analysis.\n",
+    "df_public_dates = pd.to_datetime(df_public[\"created_at\"], utc=True)\n",
+    "df_public_dates.groupby(df_public_dates.dt.isocalendar().week).count().plot(kind=\"bar\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b54a52b0-da74-4119-a08b-a6b35bb1bb6d",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Postings distributed by weekday\n",
+    "df_public_dates.groupby(df_public_dates.dt.isocalendar().day).count().plot(kind=\"bar\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3c9c0fe9",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1fd7e75d-83e3-4073-b39f-16b93b82a694",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Date-specific filtering\n",
+    "############################\n",
+    "# Please insert the time frame by\n",
+    "# specifying a start date and an end date.\n",
+    "\n",
+    "#start_date = \n",
+    "#end_date = \n",
+    "\n",
+    "\n",
+    "#mask = (df_public['created_at'] > start_date) & (df_public['created_at'] <= end_date)\n",
+    "\n",
+    "# Uncomment below to overwrite the existing data frame\n",
+    "#df_public = df_public.loc[mask]\n",
+    "\n",
+    "# Uncomment below to create a subset called 'filtered_df'\n",
+    "#filtered_df = df_public.loc[mask]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8497e393-ac7f-45e8-824e-3cbc874fb2e2",
+   "metadata": {},
+   "source": [
+    "## Hashtag analysis"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d299dd8a-ca4c-4069-94d7-b4dfd5540897",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# To analyze the tags, it makes sense to create a separate table that contains all tags and the information about them.\n",
+    "# Here is an example of how to create such a table. Besides the tag, the toot ID, the toot's created_at and the account are carried along in each row.\n",
+    "# This can be extended depending on your analytical interest. To save storage, the toot ID alone would suffice; the rest can also be joined in\n",
+    "# from the other table with pd.merge(df, tags, left_on='id', right_on='toot_id').\n",
+    "\n",
+    "tags_public = []\n",
+    "for row in df_public.iterrows():\n",
+    "    for tag in row[1]['tags']:\n",
+    "        tag['toot_id'] = row[1]['id']\n",
+    "        tag['toot_created_at'] = row[1]['created_at']\n",
+    "        tag['toot_account.id'] = row[1]['account.id']\n",
+    "        tags_public.append(tag)\n",
+    "\n",
+    "tags_public = pd.json_normalize(tags_public)\n",
+    "# For a first look\n",
+    "tags_public.head(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7c9d406d-458e-4cc4-b6b7-85434e66dc98",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Hashtags sorted by frequency\n",
+    "print(tags_public.groupby('name').size().sort_values(ascending=False))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2345142d-b9a2-4011-822e-81dca9000034",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Show the top 10 hashtags as a bar plot\n",
+    "top_10 = (tags_public['name'].value_counts()).iloc[:10]\n",
+    "top_10.plot(kind='bar')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "80d37aab",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Show the top 25 users by number of posts\n",
+    "\n",
+    "top_25_users = (df_public['account.username'].value_counts()).iloc[:25]\n",
+    "top_25_users.plot(kind='bar')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "262aa947-9b5c-4550-8b36-b09ed81a2ae7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Extract toots containing certain hashtags.\n",
+    "# Insert the hashtags to extract into a separate data set.\n",
+    "# Hashtag names must be inserted within quotes!\n",
+    "\n",
+    "my_tags_public = []\n",
+    "\n",
+    "df_my_tags_public = tags_public[tags_public['name'].isin(my_tags_public)]\n",
+    "\n",
+    "df_my_tags_public.head()\n",
+    "\n",
+    "# Uncomment the following command to create a csv-file for further examination\n",
+    "#df_my_tags_public.to_csv('exports/my_tags_public.csv')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "adab07a1-1ba4-407a-8151-cc890fe6603a",
+   "metadata": {},
+   "source": [
+    "## Data for 'timeline_local'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3131ef1f-6f8d-4dcb-a801-553019a79e90",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Loading data: local data is loaded into the variable 'df_local'.\n",
+    "# ATTENTION: In this environment relative paths don't work with glob.\n",
+    "path_local = '/var/jupyter-data/jupyter-mastodata/data/mastodon.social/timeline_local/*'\n",
+    "\n",
+    "data_local = []\n",
+    "for fi in glob(path_local):\n",
+    "    if os.path.isfile(fi) and fi.endswith('.json'):\n",
+    "        with open(fi, 'r') as infile:\n",
+    "            data_local += json.load(infile)\n",
+    "\n",
+    "df_local = pd.json_normalize(data_local)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "10a6cafb-861e-4c8b-bc1f-e0fd5f67e7ce",
+   "metadata": {},
+   "source": [
+    "## Stats"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c5133131-1a74-4c61-8f3c-4044612a0394",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# TODO: add average posts per MAU\n",
+    "print('Posts in total:', len(df_local))\n",
+    "print('Number of individual users:', len(pd.unique(df_local['account.username'])))\n",
+    "print('Average posts per user:', len(df_local)/len(pd.unique(df_local['account.username'])))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c3c20029-0d0a-48e8-9632-5bab4b21d99b",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Change the dtype of 'created_at'\n",
+    "df_local_dates = pd.to_datetime(df_local[\"created_at\"], utc=True)\n",
+    "# Show the number of toots sorted by (calendar) week\n",
+    "df_local_dates.groupby(df_local_dates.dt.isocalendar().week).count().plot(kind=\"bar\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dfbd773c-7cbc-4fe8-8343-934fe747faa2",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Postings distributed by weekday\n",
+    "df_local_dates.groupby(df_local_dates.dt.isocalendar().day).count().plot(kind=\"bar\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5b99b490",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Show the top 25 users by number of posts\n",
+    "\n",
+    "top_25_users = (df_local['account.username'].value_counts()).iloc[:25]\n",
+    "top_25_users.plot(kind='bar')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f24fccb8-0cb6-4241-89a8-36cd714e235e",
+   "metadata": {},
+   "source": [
+    "## Hashtag analysis"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3f0c3c70-aeb5-4359-9319-f3467a8b7162",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# To analyze the tags, it makes sense to create a separate table that contains all tags and the information about them.\n",
+    "# Here is an example of how to create such a table. Besides the tag, the toot ID, the toot's created_at and the account are carried along in each row.\n",
+    "# This can be extended depending on your analytical interest. To save storage, the toot ID alone would suffice; the rest can also be joined in\n",
+    "# from the other table with pd.merge(df, tags, left_on='id', right_on='toot_id').\n",
+    "\n",
+    "tags_local = []\n",
+    "for row in df_local.iterrows():\n",
+    "    for tag in row[1]['tags']:\n",
+    "        tag['toot_id'] = row[1]['id']\n",
+    "        tag['toot_created_at'] = row[1]['created_at']\n",
+    "        tag['toot_account.id'] = row[1]['account.id']\n",
+    "        tags_local.append(tag)\n",
+    "\n",
+    "tags_local = pd.json_normalize(tags_local)\n",
+    "tags_local.head(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "561054f2-d8db-47d9-aa15-c0ea5f06388d",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Hashtags sorted by frequency\n",
+    "tags_local.groupby('name').size().sort_values(ascending=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "69bba3a4-03b0-4cd1-8172-3418ad06b446",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Show the top 10 hashtags as a bar plot\n",
+    "top_10_tags = (tags_local['name'].value_counts()).iloc[:10]\n",
+    "top_10_tags.plot(kind='bar')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "90785572",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "54ed0e60-9349-4cf1-b987-04aa3aa07037",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Extract toots containing certain hashtags.\n",
+    "# Insert the hashtags to extract into a separate data set.\n",
+    "# Hashtag names must be inserted within quotes!\n",
+    "\n",
+    "my_tags_local = []\n",
+    "\n",
+    "df_my_tags_local = tags_local[tags_local['name'].isin(my_tags_local)]\n",
+    "\n",
+    "df_my_tags_local.head()\n",
+    "\n",
+    "# Uncomment the following command to create a csv-file for further examination\n",
+    "#df_my_tags_local.to_csv('exports/my_tags_local.csv')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}