{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9493a46f-12c3-4582-9723-7352e6aaeb95",
   "metadata": {},
   "outputs": [],
   "source": [
    "###################################################\n",
    "#___ ___ _ _ _ #\n",
    "#| \\/ | | | | | | | #\n",
    "#| . . | __ _ ___| |_ ___ __| | __ _| |_ __ _ #\n",
    "#| |\\/| |/ _` / __| __/ _ \\ / _` |/ _` | __/ _` | #\n",
    "#| | | | (_| \\__ \\ || (_) | (_| | (_| | || (_| | #\n",
    "#\\_| |_/\\__,_|___/\\__\\___/ \\__,_|\\__,_|\\__\\__,_| #\n",
    "####################################################### \n",
    "# written by Alexander Martin and Marcus Burkhardt #\n",
    "# This script loads data from the hash tag timeline.\n",
    "# Please fill in the correct path, where your data \n",
    "# is stored.\n",
    "#######################################################\n",
    "# At the current stage, mastodata data loaders are meant\n",
    "# to briefly explore collected data, filtered and then \n",
    "# exported. You can filter by date and hashtag, for \n",
    "# public and local timeline.\n",
    "#######################################################\n",
    "# This notebook loads data collected from the hashtag\n",
    "# timeline. Currently there exist 3 collected hashtags:\n",
    "# #russia, #ukraine and #fediblock.\n",
    "# Posts were collected from EVERY instance available on\n",
    "#'fedidb.org'.\n",
    "#######################################################\n",
    "# Three analytical sections are in this notebook.\n",
    "# 1. Descriptive overview of the material\n",
    "# 2. Co-Hashtags\n",
    "# 3. Federation\n",
    "#######################################################\n",
    "\n",
    "\n",
    "import os\n",
    "import json\n",
    "from urllib.parse import urlparse\n",
    "import pandas as pd\n",
    "from glob import glob\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4c2d95da-e123-45ef-b419-618550bd26a6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ATTENTION: In this environment relative paths don't work with glob.\n",
    "# Glob pattern for a sample of randomly selected servers of one month\n",
    "# (from 12-06-2024 - 12-07-2024). The directory name selects the hashtag;\n",
    "# this path points at the 'russia' collection.\n",
    "path = '/var/jupyter-data/jupyter-mastodata/data/hashtag search – russia/*'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "72310631-a1d1-42e1-a5c4-3cc5f3812a1a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Loading data for analysis\n",
    "# Collect the content of every '.json' file matched by 'path' in the list 'data'.\n",
    "# Files are read in sorted order so that the row order is reproducible.\n",
    "data = []\n",
    "for fi in sorted(glob(path)):\n",
    "    if os.path.isfile(fi) and fi.endswith('.json'):\n",
    "        # JSON is UTF-8 by default; do not rely on the platform's default encoding.\n",
    "        with open(fi, 'r', encoding='utf-8') as infile:\n",
    "            # assumes each file holds a JSON array of posts - TODO confirm\n",
    "            data += json.load(infile)\n",
    "\n",
    "# Fail early with a clear message instead of a KeyError in a later cell.\n",
    "if not data:\n",
    "    raise FileNotFoundError(f'No posts found in JSON files matching: {path}')\n",
    "\n",
    "# Load 'data' in a data frame (nested fields become dotted column names)\n",
    "df = pd.json_normalize(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "975a131c-d7df-4fb3-8b51-749355bdeaed",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Parses 'created_at' into timezone-aware UTC timestamps; values that cannot\n",
    "# be parsed become NaT. That is necessary for date specific filtering.\n",
    "df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce', utc=True)\n",
    "\n",
    "# Adding the column 'instance' by extracting the network location (host name)\n",
    "# from the column 'url' of the post. Rows whose 'url' is not a string\n",
    "# (missing value) get None instead of raising inside urlparse.\n",
    "df['instance'] = df['url'].apply(\n",
    "    lambda x: urlparse(x).netloc if isinstance(x, str) else None\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c98352d",
   "metadata": {},
   "outputs": [],
   "source": [
    "## Top 25 posters regarding amount of messages\n",
    "\n",
    "top_25_posters = df['account.username'].value_counts().iloc[:25]\n",
    "ax = top_25_posters.plot(kind='bar')\n",
    "ax.set(title='Top 25 posters by number of posts', xlabel='Username', ylabel='Posts');"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ad8f6e0e-8e64-4d71-8f3e-b02b25517d03",
   "metadata": {},
   "source": [
    "## Stats "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8c41b3b1-6bce-4f6b-8bfb-8a6ebc072776",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# NOTE(review): users are counted by 'account.username'; presumably this\n",
    "# does not include the instance domain, so equal names on different\n",
    "# instances would count as one user - confirm against the data.\n",
    "n_posts = len(df)\n",
    "n_users = df['account.username'].nunique(dropna=False)\n",
    "\n",
    "print('Posts in Total:', n_posts)\n",
    "print('Amount of individual users:', n_users)\n",
    "print('Average posts per user:', n_posts / n_users)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "34109a10-061a-46ec-a7fd-b1f2ab9a4d8c",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Visualization\n",
    "# Show amount of toots per ISO calendar week.\n",
    "# 'created_at' already holds UTC timestamps, so no further conversion is needed.\n",
    "df_dates = df['created_at']\n",
    "iso = df_dates.dt.isocalendar()\n",
    "\n",
    "# Group by ISO year AND week so that the same week number of different\n",
    "# years is not merged into one bar. Rows with NaT dates are left out.\n",
    "toots_per_week = df_dates.groupby([iso['year'], iso['week']]).count()\n",
    "ax = toots_per_week.plot(kind='bar')\n",
    "ax.set(title='Toots per calendar week', xlabel='ISO (year, week)', ylabel='Toots');"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2c3177c7-9af8-47de-9f84-97c2f930e59a",
   "metadata": {},
   "source": [
    "## Hashtags"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d83dd089-fdd3-43fd-98ac-c2972540c1b4",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Analysis of Hashtags\n",
    "# Create 'tags' data frame: one row per hashtag of every toot, extended by\n",
    "# the id, creation date and account id of the toot it belongs to.\n",
    "tag_rows = []\n",
    "toot_columns = zip(df['tags'], df['id'], df['created_at'], df['account.id'])\n",
    "for toot_tags, toot_id, toot_created_at, toot_account_id in toot_columns:\n",
    "    # Toots without a tag list (missing value) contribute no rows.\n",
    "    if not isinstance(toot_tags, list):\n",
    "        continue\n",
    "    for tag in toot_tags:\n",
    "        # Copy the tag dict so the records held by 'df' / 'data' stay\n",
    "        # untouched and re-running this cell always gives the same result.\n",
    "        tag_rows.append({\n",
    "            **tag,\n",
    "            'toot_id': toot_id,\n",
    "            'toot_created_at': toot_created_at,\n",
    "            'toot_account.id': toot_account_id,\n",
    "        })\n",
    "\n",
    "tags = pd.json_normalize(tag_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c7a66a41-6913-4cea-8dbf-489a0a055530",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Co-Hashtags sorted by frequency\n",
    "print(tags.groupby('name').size().sort_values(ascending=False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b58f3df-0890-4022-a02e-d849d35f104d",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Show the 25 most frequent hashtags as bar plot\n",
    "\n",
    "top_25_tags = tags['name'].value_counts().iloc[:25]\n",
    "ax = top_25_tags.plot(kind='bar')\n",
    "ax.set(title='Top 25 hashtags', xlabel='Hashtag', ylabel='Occurrences');"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "39227e98-8e93-415e-8d83-e0f832e0bb13",
   "metadata": {},
   "source": [
    "## Federation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "977fd226-ee9d-4f5c-a102-df513bc8d7fc",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Number of posts per instance (the 'instance' column holds the network\n",
    "# location extracted from the post 'url'), ranked by amount\n",
    "print(df.groupby('instance').size().sort_values(ascending=False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba8d4f89-e24a-4133-a801-c3fe0104371d",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Show the 25 instances with the most posts as bar plot\n",
    "\n",
    "top_25_instances = df['instance'].value_counts().iloc[:25]\n",
    "ax = top_25_instances.plot(kind='bar')\n",
    "ax.set(title='Top 25 instances by number of posts', xlabel='Instance', ylabel='Posts');\n",
    "\n",
    "# Mastodon-wide analyses can now become more detailed."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}