In [None]:
###################################################
#___  ___          _            _       _         #
#|  \/  |         | |          | |     | |        #
#| .  . | __ _ ___| |_ ___   __| | __ _| |_ __ _  #
#| |\/| |/ _` / __| __/ _ \ / _` |/ _` | __/ _` | #
#| |  | | (_| \__ \ || (_) | (_| | (_| | || (_| | #
#\_|  |_/\__,_|___/\__\___/ \__,_|\__,_|\__\__,_| #
################################################### 
# written by Alexander Martin and Marcus Burkhardt #
# This script loads data from the public and local tag timeline.
# Please fill in the correct path, where your data 
# is stored.
#######################################################
# At the current stage, the mastodata data loaders are meant
# for briefly exploring the collected data, which can then be
# filtered and exported. You can filter by date and hashtag,
# for both the public and the local timeline.
#######################################################
# There are three analytical sections in this notebook
# for each timeline.
# 1. Descriptive overview of the material
# 2. Co-Hashtags
# 3. Federation
#######################################################

import os
import json
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt

## Data for 'timeline_public'

In [None]:
# Loading data: the public timeline is loaded into the variable 'df_public'.
# 'path_public' is a glob pattern that must lead directly to the JSON files
# of the collection and end with '/*', e.g. 'YOUR/PATH/timeline_public/*'.
# Replace the placeholder below with the location of your own data.
path_public = 'YOUR/PATH/timeline_public/*'

# The '+=' below extends the list, so each file is assumed to contain a
# JSON array of toots; all arrays are concatenated into one flat list.
data_public = []
# Sorted so the row order does not depend on the file system's listing order.
for fi in sorted(glob(path_public)):
    if os.path.isfile(fi) and fi.endswith('.json'):
        # Read explicitly as UTF-8 instead of relying on the platform default encoding.
        with open(fi, 'r', encoding='utf-8') as infile:
            data_public += json.load(infile)

# Make a wrong or unfilled path visible here instead of failing in a later cell.
if not data_public:
    print('No toots found for', repr(path_public), '- please check the path.')

# Flatten nested objects into dotted column names (e.g. 'account.username').
df_public = pd.json_normalize(data_public)

## Stats

In [None]:
# Basic descriptive statistics for the public timeline.
# TODO (from the original notebook): add average posts per MAU
#
# Users are counted by 'account.id' instead of 'account.username':
# NOTE(review): the public timeline presumably contains accounts from several
# instances, and the same username can exist on more than one of them, so
# counting usernames would merge distinct accounts - confirm against the data.
n_posts_public = len(df_public)
n_users_public = df_public['account.id'].nunique()

print('Posts in Total:', n_posts_public)
print('Amount of individual users:', n_users_public)
# Guard against an empty frame (e.g. after date filtering) to avoid a ZeroDivisionError.
print('Average posts per user:', n_posts_public / n_users_public if n_users_public else float('nan'))

In [None]:
# Show the number of toots per (ISO calendar) week.
# Note: Week twenty three is not fully part of the data set. Therefore that week
# is incomplete. Keep that in mind during analysis.
#
# pd.to_datetime is used instead of .astype("datetime64"): casting to a unit-less
# 'datetime64' dtype raises a TypeError in pandas >= 2.0. utc=True yields one
# consistent timezone-aware dtype even if the timestamps carry UTC offsets.
df_public_dates = pd.to_datetime(df_public["created_at"], utc=True)
df_public_dates.groupby(df_public_dates.dt.isocalendar().week).count().plot(kind="bar")

## Keep in mind, week 23 was not fully archived. 

In [None]:
# Number of toots per ISO weekday, drawn as a bar chart.
(
    df_public_dates
    .groupby(df_public_dates.dt.isocalendar().day)
    .count()
    .plot.bar()
)

In [None]:
# Date specific filtering
############################
# Please insert the time frame by 
# specifying a start date and an end date 

#start_date = 
#end_date = 


#mask = (df_public['created_at'] > start_date) & (df_public['created_at'] <= end_date)

# Uncomment below to overwrite the existing data frame

#df_public = df_public.loc[mask]

# Uncomment below to create subset called 'filtered_df'
#filtered_df = df_public.loc[mask]

## Hashtag analysis

In [None]:
# To analyse the tags it makes sense to build a separate table that contains all
# tags together with information about them. This is one example of how to build
# such a table: besides the tag itself, every row carries the toot ID, the toot's
# created_at and the account ID. This can be extended depending on the analytical
# interest. To save space when storing, the toot ID alone would suffice; the rest
# can be joined from the other table with
# pd.merge(df, tags, left_on='id', right_on='toot_id').

tags_public = []
# Iterate over the needed columns directly; this avoids the per-row Series that
# iterrows() builds.
for toot_id, created_at, account_id, toot_tags in zip(
    df_public['id'], df_public['created_at'], df_public['account.id'], df_public['tags']
):
    # A toot without a tag list (e.g. a missing value) contributes no rows.
    if not isinstance(toot_tags, list):
        continue
    for tag in toot_tags:
        # Build a new dict instead of adding keys to 'tag': the tag dicts are the
        # very objects stored in df_public['tags'] and must stay untouched.
        tags_public.append({
            **tag,
            'toot_id': toot_id,
            'toot_created_at': created_at,
            'toot_account.id': account_id,
        })

tags_public = pd.json_normalize(tags_public)
# For a first look
tags_public.head(5)

In [None]:
# Count how often each hashtag name occurs and print the counts, most frequent first.
print(
    tags_public
    .groupby('name')
    .size()
    .sort_values(ascending=False)
)

In [None]:
# Bar plot of the ten most frequent hashtags.
# value_counts() sorts by frequency in descending order, so the first ten
# entries are the top ten.
top_10 = tags_public['name'].value_counts().head(10)
top_10.plot.bar()



In [None]:
# Bar plot of the 25 usernames with the most toots in the public timeline.
# NOTE(review): toots are grouped by 'account.username'; if the same username
# exists on several instances, those accounts are counted together - confirm
# whether that is intended.

top_25_users = df_public['account.username'].value_counts().head(25)
top_25_users.plot.bar()

In [None]:
# Select the rows of 'tags_public' that belong to specific hashtags.
# List the hashtag names of interest in 'my_tags_public'; the selection goes
# into a separate data set. Every hashtag name must be written in quotes!

my_tags_public = []

df_my_tags_public = tags_public.loc[tags_public['name'].isin(my_tags_public)]

# Preview of the first rows of the selection
df_my_tags_public.head(5)

# Uncomment the following command to write a CSV file for further examination

#df_my_tags_public.to_csv('exports/my_tags_public.csv')

## Data for 'timeline_local'

In [None]:
# Loading data: the local timeline is loaded into the variable 'df_local'.
# ATTENTION: In this environment relative paths don't work with glob,
# which is why an absolute path is used here.
path_local = '/var/jupyter-data/jupyter-mastodata/data/mastodon.social/timeline_local/*'

# The '+=' below extends the list, so each file is assumed to contain a
# JSON array of toots; all arrays are concatenated into one flat list.
data_local = []
# Sorted so the row order does not depend on the file system's listing order.
for fi in sorted(glob(path_local)):
    if os.path.isfile(fi) and fi.endswith('.json'):
        # Read explicitly as UTF-8 instead of relying on the platform default encoding.
        with open(fi, 'r', encoding='utf-8') as infile:
            data_local += json.load(infile)

# Make a wrong path visible here instead of failing in a later cell.
if not data_local:
    print('No toots found for', repr(path_local), '- please check the path.')

# Flatten nested objects into dotted column names (e.g. 'account.username').
df_local = pd.json_normalize(data_local)

## Stats

In [None]:
# Basic descriptive statistics for the local timeline.
# Open item from the original notebook: add average posts per MAU.
# nunique(dropna=False) counts distinct usernames and, like pd.unique,
# treats a missing value as one distinct entry.
print('Posts in Total:', df_local.shape[0])
print('Amount of individual users:', df_local['account.username'].nunique(dropna=False))
print('Average posts per user:', df_local.shape[0] / df_local['account.username'].nunique(dropna=False))

In [None]:
# Convert 'created_at' to datetimes.
# pd.to_datetime is used instead of .astype("datetime64"): casting to a unit-less
# 'datetime64' dtype raises a TypeError in pandas >= 2.0. utc=True yields one
# consistent timezone-aware dtype even if the timestamps carry UTC offsets.
df_local_dates = pd.to_datetime(df_local["created_at"], utc=True)
# Show the number of toots per (ISO calendar) week
df_local_dates.groupby(df_local_dates.dt.isocalendar().week).count().plot(kind="bar")

In [None]:
# Number of toots per ISO weekday, drawn as a bar chart.
(
    df_local_dates
    .groupby(df_local_dates.dt.isocalendar().day)
    .count()
    .plot.bar()
)

In [None]:
# Bar plot of the 25 usernames with the most toots in the local timeline.
# Note: this reassigns 'top_25_users', which the public-timeline section also sets.

top_25_users = df_local['account.username'].value_counts().head(25)
top_25_users.plot.bar()

## Hashtag analysis

In [None]:
# To analyse the tags it makes sense to build a separate table that contains all
# tags together with information about them. This is one example of how to build
# such a table: besides the tag itself, every row carries the toot ID, the toot's
# created_at and the account ID. This can be extended depending on the analytical
# interest. To save space when storing, the toot ID alone would suffice; the rest
# can be joined from the other table with
# pd.merge(df, tags, left_on='id', right_on='toot_id').

tags_local = []
# Iterate over the needed columns directly; this avoids the per-row Series that
# iterrows() builds.
for toot_id, created_at, account_id, toot_tags in zip(
    df_local['id'], df_local['created_at'], df_local['account.id'], df_local['tags']
):
    # A toot without a tag list (e.g. a missing value) contributes no rows.
    if not isinstance(toot_tags, list):
        continue
    for tag in toot_tags:
        # Build a new dict instead of adding keys to 'tag': the tag dicts are the
        # very objects stored in df_local['tags'] and must stay untouched.
        tags_local.append({
            **tag,
            'toot_id': toot_id,
            'toot_created_at': created_at,
            'toot_account.id': account_id,
        })

tags_local = pd.json_normalize(tags_local)
# For a first look
tags_local.head(5)

In [None]:
# Count how often each hashtag name occurs; the result is shown most frequent first.
(
    tags_local
    .groupby('name')
    .size()
    .sort_values(ascending=False)
)

In [None]:
# Bar plot of the ten most frequent hashtags.
# value_counts() sorts by frequency in descending order, so the first ten
# entries are the top ten.
top_10_tags = tags_local['name'].value_counts().head(10)
top_10_tags.plot.bar()



In [None]:
# Select the rows of 'tags_local' that belong to specific hashtags.
# List the hashtag names of interest in 'my_tags_local'; the selection goes
# into a separate data set. Every hashtag name must be written in quotes!

my_tags_local = []

df_my_tags_local = tags_local.loc[tags_local['name'].isin(my_tags_local)]

# Preview of the first rows of the selection
df_my_tags_local.head(5)

# Uncomment the following command to write a CSV file for further examination

#df_my_tags_local.to_csv('exports/my_tags_local.csv')