Add files via upload

Initial upload of Mastodata
2024-09-05 17:37:42 +02:00
parent b8f02d76f3
commit 7eff3ea12d
6 changed files with 1514 additions and 1 deletions
--- a/analysis_notebook_hashtag.ipynb
+++ b/analysis_notebook_hashtag.ipynb
@@ -0,0 +1,265 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9493a46f-12c3-4582-9723-7352e6aaeb95",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "###################################################\n",
+    "#___  ___          _            _       _         #\n",
+    "#|  \\/  |         | |          | |     | |        #\n",
+    "#| .  . | __ _ ___| |_ ___   __| | __ _| |_ __ _  #\n",
+    "#| |\\/| |/ _` / __| __/ _ \\ / _` |/ _` | __/ _` | #\n",
+    "#| |  | | (_| \\__ \\ || (_) | (_| | (_| | || (_| | #\n",
+    "#\\_|  |_/\\__,_|___/\\__\\___/ \\__,_|\\__,_|\\__\\__,_| #\n",
+    "####################################################### \n",
+    "# This script loads data from the hash tag timeline.\n",
+    "# Please fill in the correct path, where your data \n",
+    "# is stored.\n",
+    "#######################################################\n",
+    "# At the current stage, mastodata data loaders are meant\n",
+    "# for briefly exploring collected data, which can then be\n",
+    "# filtered and exported. You can filter by date and\n",
+    "# hashtag, for the public and local timelines.\n",
+    "#######################################################\n",
+    "# This notebook loads data collected from the hashtag\n",
+    "# timeline. Currently there exist 3 collected hashtags:\n",
+    "# #russia, #ukraine and #fediblock.\n",
+    "# Posts were collected from EVERY instance available on\n",
+    "# 'fedidb.org'.\n",
+    "#######################################################\n",
+    "# Three analytical sections are in this notebook.\n",
+    "# 1. Descriptive overview of the material\n",
+    "# 2. Co-Hashtags\n",
+    "# 3. Federation\n",
+    "#######################################################\n",
+    "\n",
+    "\n",
+    "import os\n",
+    "import json\n",
+    "from urllib.parse import urlparse\n",
+    "import pandas as pd\n",
+    "from glob import glob\n",
+    "import matplotlib.pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4c2d95da-e123-45ef-b419-618550bd26a6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ATTENTION: In this environment relative paths don't work with glob,\n",
+    "# so an absolute path is used.\n",
+    "# Glob pattern for the collected hashtag-timeline files. The folder name\n",
+    "# selects the hashtag (here: 'russia'); point it to another folder to\n",
+    "# load a different tag.\n",
+    "# NOTE(review): the previous comment described a one-month sample of randomly\n",
+    "# selected servers (12-06-2024 - 12-07-2024) for the tag 'ukraine' - confirm\n",
+    "# which tag and period this folder actually contains.\n",
+    "path = '/var/jupyter-data/jupyter-mastodata/data/hashtag search – russia/*'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "72310631-a1d1-42e1-a5c4-3cc5f3812a1a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Loading data for analysis\n",
+    "# Collect the records of every JSON file matched by 'path' in the list 'data'.\n",
+    "# sorted() makes the load order (and thus the row order) reproducible,\n",
+    "# since glob returns its matches in arbitrary order.\n",
+    "data = []\n",
+    "for fi in sorted(glob(path)):\n",
+    "    if os.path.isfile(fi) and fi.endswith('.json'):\n",
+    "        # Read as UTF-8 explicitly instead of relying on the platform default encoding\n",
+    "        with open(fi, 'r', encoding='utf-8') as infile:\n",
+    "            # assumes every file holds a JSON array of posts - TODO confirm\n",
+    "            data += json.load(infile)\n",
+    "\n",
+    "# Fail early with a clear message instead of a KeyError in a later cell\n",
+    "if not data:\n",
+    "    raise FileNotFoundError(f'No .json records found for pattern: {path}')\n",
+    "\n",
+    "# Load 'data' in a data frame\n",
+    "df = pd.json_normalize(data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "975a131c-d7df-4fb3-8b51-749355bdeaed",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Parse 'created_at' into timezone-aware timestamps normalised to UTC.\n",
+    "# That is necessary for date specific filtering.\n",
+    "df['created_at'] = pd.to_datetime(df['created_at'], utc=True)\n",
+    "\n",
+    "# Add the column 'instance': the network location (domain name and suffix)\n",
+    "# taken from the post's 'url'. Entries whose 'url' is not a string get None\n",
+    "# instead of being passed to urlparse.\n",
+    "df['instance'] = df['url'].apply(lambda x: urlparse(x).netloc if isinstance(x, str) else None)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6c98352d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Top 25 posters regarding amount of messages\n",
+    "# NOTE(review): posters are counted by 'account.username' only; presumably this\n",
+    "# carries no instance domain, so equal usernames on different instances would be\n",
+    "# counted together - confirm.\n",
+    "\n",
+    "top_25_posters = df['account.username'].value_counts().head(25)\n",
+    "top_25_posters.plot(kind='bar', title='Top 25 posters by number of posts')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ad8f6e0e-8e64-4d71-8f3e-b02b25517d03",
+   "metadata": {},
+   "source": [
+    "## Stats "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8c41b3b1-6bce-4f6b-8bfb-8a6ebc072776",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Headline numbers: post count, distinct usernames and their ratio\n",
+    "n_posts = len(df)\n",
+    "n_users = len(pd.unique(df['account.username']))\n",
+    "\n",
+    "print('Posts in Total:', n_posts)\n",
+    "print('Amount of individual users:', n_users)\n",
+    "print('Average posts per user:', n_posts / n_users)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "34109a10-061a-46ec-a7fd-b1f2ab9a4d8c",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Visualization\n",
+    "# Show amount of toots per ISO calendar week.\n",
+    "# The UTC conversion is done in one vectorised call instead of a per-row apply.\n",
+    "# NOTE: toots are grouped by week number only, so data spanning more than one\n",
+    "# year would merge equal week numbers of different years.\n",
+    "df_dates = pd.to_datetime(df['created_at'], utc=True)\n",
+    "df_dates.groupby(df_dates.dt.isocalendar().week).count().plot(kind='bar')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2c3177c7-9af8-47de-9f84-97c2f930e59a",
+   "metadata": {},
+   "source": [
+    "## Hashtags"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d83dd089-fdd3-43fd-98ac-c2972540c1b4",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Analysis of Hashtags\n",
+    "# Create the 'tags' data frame: one row per (toot, hashtag) pair.\n",
+    "# Each tag dict is copied before the toot fields are added, so the dicts\n",
+    "# stored in df['tags'] stay unmodified.\n",
+    "tag_records = []\n",
+    "for toot_id, created_at, account_id, toot_tags in zip(\n",
+    "    df['id'], df['created_at'], df['account.id'], df['tags']\n",
+    "):\n",
+    "    # Skip toots whose 'tags' entry is not a list (e.g. a missing value)\n",
+    "    if not isinstance(toot_tags, list):\n",
+    "        continue\n",
+    "    for tag in toot_tags:\n",
+    "        tag_records.append({\n",
+    "            **tag,\n",
+    "            'toot_id': toot_id,\n",
+    "            'toot_created_at': created_at,\n",
+    "            'toot_account.id': account_id,\n",
+    "        })\n",
+    "\n",
+    "tags = pd.json_normalize(tag_records)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c7a66a41-6913-4cea-8dbf-489a0a055530",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Co-Hashtags: number of occurrences per hashtag name, most frequent first\n",
+    "tag_counts = tags.groupby('name').size()\n",
+    "print(tag_counts.sort_values(ascending=False))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5b58f3df-0890-4022-a02e-d849d35f104d",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Show the 25 most frequent hashtags as bar plot\n",
+    "# (value_counts sorts by frequency, so the first 25 entries are the top 25)\n",
+    "\n",
+    "top_25_tags = tags['name'].value_counts().head(25)\n",
+    "top_25_tags.plot(kind='bar')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "39227e98-8e93-415e-8d83-e0f832e0bb13",
+   "metadata": {},
+   "source": [
+    "## Federation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "977fd226-ee9d-4f5c-a102-df513bc8d7fc",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Number of posts per value of the 'instance' column,\n",
+    "# ranked from the largest to the smallest amount\n",
+    "posts_per_instance = df.groupby('instance').size()\n",
+    "print(posts_per_instance.sort_values(ascending=False))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ba8d4f89-e24a-4133-a801-c3fe0104371d",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Bar plot of the 25 instances with the most posts\n",
+    "instance_counts = df['instance'].value_counts()\n",
+    "top_25_instances = instance_counts.iloc[:25]\n",
+    "top_25_instances.plot(kind='bar')\n",
+    "\n",
+    "# Mastodon-wide analyses can now become more detailed."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}