In [None]:
###################################################
#___  ___          _            _       _         #
#|  \/  |         | |          | |     | |        #
#| .  . | __ _ ___| |_ ___   __| | __ _| |_ __ _  #
#| |\/| |/ _` / __| __/ _ \ / _` |/ _` | __/ _` | #
#| |  | | (_| \__ \ || (_) | (_| | (_| | || (_| | #
#\_|  |_/\__,_|___/\__\___/ \__,_|\__,_|\__\__,_| #
####################################################### 
# This script loads data from the hash tag timeline.
# Please fill in the correct path, where your data 
# is stored.
#######################################################
# At the current stage, mastodata data loaders are meant
# for briefly exploring collected data, which can then be
# filtered and exported. You can filter by date and
# hashtag, for the public and local timelines.
#######################################################
# This notebook loads data collected from the hashtag
# timeline. Currently there exist 3 collected hashtags:
# #russia, #ukraine and #fediblock.
# Posts were collected from EVERY instance available on
# 'fedidb.org'.
#######################################################
# Three analytical sections are in this notebook.
# 1. Descriptive overview of the material
# 2. Co-Hashtags
# 3. Federation
#######################################################


import os
import json
from urllib.parse import urlparse
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt

In [None]:
# ATTENTION: In this environment relative paths don't work with glob,
# so the pattern below is absolute.
# Glob pattern matching every entry in the 'hashtag search – russia' folder.
# NOTE(review): this was previously described as a sample of randomly selected
# servers of one month for the tag 'ukraine' (12-06-2024 - 12-07-2024), but the
# folder name says 'russia' — confirm which data set is intended.
path = '/var/jupyter-data/jupyter-mastodata/data/hashtag search – russia/*'

In [None]:
# Loading data for analysis
# Collect the contents of every '.json' file matched by 'path' in 'data'.
# Each file's parsed content is concatenated onto the list, so every file is
# presumably a JSON array of posts — NOTE(review): confirm against the collector.
data = []
# sorted() makes the load order (and thus the row order of 'df') reproducible;
# glob() itself returns matches in arbitrary order.
for fi in sorted(glob(path)):
    if os.path.isfile(fi) and fi.endswith('.json'):
        # Read explicitly as UTF-8 instead of relying on the locale default.
        with open(fi, 'r', encoding='utf-8') as infile:
            data += json.load(infile)

# Fail early with a clear message instead of a confusing KeyError
# when columns of an empty frame are accessed later on.
if not data:
    raise FileNotFoundError(f'No posts could be loaded from {path!r}')

# Load 'data' in a data frame; nested objects are flattened into
# dot-separated columns such as 'account.username'.
df = pd.json_normalize(data)

In [None]:
# Parse 'created_at' into timezone-aware UTC timestamps;
# date-specific filtering needs real datetimes.
created_at_utc = pd.to_datetime(df['created_at'], utc=True)
df['created_at'] = created_at_utc

# Add the column 'instance': the network location (domain name and suffix)
# taken from the post's 'url'.
df['instance'] = df['url'].apply(lambda post_url: urlparse(post_url).netloc)

In [None]:
## Top 25 posters regarding amount of messages
# NOTE(review): counting is done on 'account.username' alone; presumably the
# same username can exist on several instances and would be merged here —
# confirm whether that is intended.

top_25_posters = df['account.username'].value_counts().head(25)
ax = top_25_posters.plot(kind='bar')
# Label the figure so it can be read on its own.
ax.set_title('Top 25 posters by number of posts')
ax.set_xlabel('Username')
ax.set_ylabel('Number of posts');

## Stats 

In [None]:
# Summary figures: total posts, distinct usernames and their ratio.
total_posts = len(df)
print('Posts in Total:', total_posts)
distinct_users = len(pd.unique(df['account.username']))
print('Amount of individual users:', distinct_users)
print('Average posts per user:', total_posts / distinct_users)

In [None]:
# Visualization
# Show the number of toots per ISO calendar week.
# Vectorized and idempotent: works whether 'created_at' still holds strings
# or has already been converted to datetimes.
df_dates = pd.to_datetime(df['created_at'], utc=True)
# Group by ISO year AND week so that the same week number from different
# years is not merged into a single bar.
iso_calendar = df_dates.dt.isocalendar()
toots_per_week = df_dates.groupby([iso_calendar['year'], iso_calendar['week']]).count()
ax = toots_per_week.plot(kind='bar')
# Label the figure so it can be read on its own.
ax.set_title('Toots per calendar week')
ax.set_xlabel('ISO year, week')
ax.set_ylabel('Number of toots');

## Hashtags

In [None]:
# Analysis of Hashtags
# Create 'tags' data frame: one row per (toot, hashtag) pair, carrying the
# toot's id, creation time and account id alongside the tag's own fields.
tag_records = []
for toot_tags, toot_id, toot_created_at, toot_account_id in zip(
    df['tags'], df['id'], df['created_at'], df['account.id']
):
    # Rows whose 'tags' value is not a list (e.g. missing) contribute no rows.
    if not isinstance(toot_tags, list):
        continue
    for tag in toot_tags:
        # Build a new dict instead of writing into 'tag': the tag dicts are the
        # very objects stored in df['tags'] and should stay untouched.
        tag_records.append({
            **tag,
            'toot_id': toot_id,
            'toot_created_at': toot_created_at,
            'toot_account.id': toot_account_id,
        })

tags = pd.json_normalize(tag_records)

In [None]:
# Co-hashtags ranked by how often they occur, most frequent first
tag_frequency = tags.groupby('name').size()
print(tag_frequency.sort_values(ascending=False))

In [None]:
# Show the 25 most frequent hashtags as a bar plot

top_25_tags = tags['name'].value_counts().head(25)
ax = top_25_tags.plot(kind='bar')
# Label the figure so it can be read on its own.
ax.set_title('Top 25 hashtags by number of occurrences')
ax.set_xlabel('Hashtag')
ax.set_ylabel('Number of occurrences');

## Federation

In [None]:
# Number of posts per 'instance' (domain and suffix taken from the post URL),
# ranked by amount with the largest first
posts_per_instance = df.groupby('instance').size()
print(posts_per_instance.sort_values(ascending=False))

In [None]:


# Bar plot of the 25 instances with the most posts
top_25_instances = df['instance'].value_counts().head(25)
ax = top_25_instances.plot(kind='bar')
# Label the figure so it can be read on its own.
ax.set_title('Top 25 instances by number of posts')
ax.set_xlabel('Instance')
ax.set_ylabel('Number of posts');

# Mastodon-wide analyses can now become more detailed.