Mastodata/analysis_notebook_hashtag.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9493a46f-12c3-4582-9723-7352e6aaeb95",
   "metadata": {},
   "outputs": [],
   "source": [
    "###################################################\n",
    "#___  ___          _            _       _         #\n",
    "#|  \\/  |         | |          | |     | |        #\n",
    "#| .  . | __ _ ___| |_ ___   __| | __ _| |_ __ _  #\n",
    "#| |\\/| |/ _` / __| __/ _ \\ / _` |/ _` | __/ _` | #\n",
    "#| |  | | (_| \\__ \\ || (_) | (_| | (_| | || (_| | #\n",
    "#\\_|  |_/\\__,_|___/\\__\\___/ \\__,_|\\__,_|\\__\\__,_| #\n",
    "####################################################### \n",
    "# written by Alexander Martin and Marcus Burkhardt #\n",
    "# This script loads data from the hash tag timeline.\n",
    "# Please fill in the correct path, where your data \n",
    "# is stored.\n",
    "#######################################################\n",
    "# At the current stage, mastodata data loaders are meant\n",
    "# to briefly explore collected data, filtered and then \n",
    "# exported. You can filter by date and hashtag, for \n",
    "# public and local timeline.\n",
    "#######################################################\n",
    "# This notebook loads data collected from the hashtag\n",
    "# timeline. Currently there exist 3 collected hashtags:\n",
    "# #russia, #ukraine and #fediblock.\n",
    "# Posts were collected from EVERY instance avilable on\n",
    "#'fedidb.org'.\n",
    "#######################################################\n",
    "# Three analytical sections are in this notebook.\n",
    "# 1. Descriptive overview of the material\n",
    "# 2. Co-Hashtags\n",
    "# 3. Federation\n",
    "#######################################################\n",
    "\n",
    "\n",
    "import os\n",
    "import json\n",
    "from urllib.parse import urlparse\n",
    "import pandas as pd\n",
    "from glob import glob\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4c2d95da-e123-45ef-b419-618550bd26a6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ATTENTION: In this environment relative paths don't work with glob.\n",
    "# Glob pattern matching every entry of the 'russia' hashtag collection folder.\n",
    "# NOTE(review): this comment previously described a one-month sample of randomly\n",
    "# selected servers for the tag 'ukraine' (12-06-2024 - 12-07-2024), but the path\n",
    "# below points to the 'russia' folder -- confirm which collection is intended.\n",
    "# The folder name contains an en dash (U+2013), not an ASCII hyphen.\n",
    "path = '/var/jupyter-data/jupyter-mastodata/data/hashtag search – russia/*'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "72310631-a1d1-42e1-a5c4-3cc5f3812a1a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load every collected JSON file into one flat list of post records ('data').\n",
    "# Files are visited in sorted order so that the row order of 'df' is\n",
    "# reproducible across runs (glob returns matches in arbitrary order).\n",
    "data = []\n",
    "for fi in sorted(glob(path)):\n",
    "    if os.path.isfile(fi) and fi.endswith('.json'):\n",
    "        # Read as UTF-8 explicitly instead of relying on the platform's locale encoding.\n",
    "        with open(fi, 'r', encoding='utf-8') as infile:\n",
    "            posts = json.load(infile)\n",
    "        # Extending a list with a dict would silently append the dict's keys,\n",
    "        # so fail loudly if a file does not hold a JSON array.\n",
    "        if not isinstance(posts, list):\n",
    "            raise ValueError(f'Expected a JSON array of posts in {fi!r}, got {type(posts).__name__}')\n",
    "        data.extend(posts)\n",
    "\n",
    "# Flatten the records into a data frame; nested fields become dotted\n",
    "# column names such as 'account.username'.\n",
    "df = pd.json_normalize(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "975a131c-d7df-4fb3-8b51-749355bdeaed",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Parse 'created_at' into timezone-aware UTC timestamps; values that cannot be\n",
    "# parsed become NaT because of errors='coerce'.\n",
    "# This is necessary for date-specific filtering.\n",
    "df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce', utc=True)\n",
    "\n",
    "# Add the column 'instance': the network location (host name) of the post's 'url'.\n",
    "# Posts whose 'url' is not a string (None / NaN) get a missing value instead of\n",
    "# making urlparse return an empty bytes value or raise.\n",
    "df['instance'] = df['url'].map(lambda url: urlparse(url).netloc if isinstance(url, str) else None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c98352d",
   "metadata": {},
   "outputs": [],
   "source": [
    "## Top 25 posters by number of posts\n",
    "# NOTE(review): presumably usernames are only unique per instance, so identically\n",
    "# named accounts on different instances would be counted together -- confirm.\n",
    "\n",
    "# value_counts() sorts by frequency in descending order, so the first 25 rows\n",
    "# are the most active posters.\n",
    "top_25_posters = df['account.username'].value_counts().head(25)\n",
    "ax = top_25_posters.plot(kind='bar', title='Top 25 posters by number of posts')\n",
    "ax.set(xlabel='Username', ylabel='Number of posts');"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ad8f6e0e-8e64-4d71-8f3e-b02b25517d03",
   "metadata": {},
   "source": [
    "## Stats "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8c41b3b1-6bce-4f6b-8bfb-8a6ebc072776",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Basic corpus statistics: number of posts, number of distinct usernames and\n",
    "# the average number of posts per username.\n",
    "n_posts = len(df)\n",
    "# dropna=False keeps a missing username as its own value, like pd.unique does.\n",
    "n_users = df['account.username'].nunique(dropna=False)\n",
    "\n",
    "print('Posts in Total:', n_posts)\n",
    "print('Amount of individual users:', n_users)\n",
    "# Guard against an empty data set, which would otherwise raise ZeroDivisionError.\n",
    "print('Average posts per user:', n_posts / n_users if n_users else float('nan'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "34109a10-061a-46ec-a7fd-b1f2ab9a4d8c",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Visualization\n",
    "# Show the number of toots per ISO calendar week.\n",
    "# The conversion is done on the whole column at once instead of row by row;\n",
    "# it yields UTC timestamps whether or not 'created_at' was converted before.\n",
    "df_dates = pd.to_datetime(df['created_at'], errors='coerce', utc=True)\n",
    "iso = df_dates.dt.isocalendar()\n",
    "\n",
    "# Group by ISO year AND week: the week number alone would merge the same week\n",
    "# of different years into a single bar.\n",
    "toots_per_week = df_dates.groupby([iso['year'], iso['week']]).count()\n",
    "ax = toots_per_week.plot(kind='bar', title='Toots per calendar week')\n",
    "ax.set(xlabel='ISO year, week', ylabel='Number of toots');"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2c3177c7-9af8-47de-9f84-97c2f930e59a",
   "metadata": {},
   "source": [
    "## Hashtags"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d83dd089-fdd3-43fd-98ac-c2972540c1b4",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Analysis of hashtags\n",
    "# Build the 'tags' data frame with one row per (toot, hashtag) pair.\n",
    "tag_records = []\n",
    "for toot_tags, toot_id, created_at, account_id in zip(\n",
    "    df['tags'], df['id'], df['created_at'], df['account.id']\n",
    "):\n",
    "    # Skip toots whose 'tags' value is missing (e.g. NaN) rather than a list.\n",
    "    if not isinstance(toot_tags, list):\n",
    "        continue\n",
    "    for tag in toot_tags:\n",
    "        # Copy the tag instead of writing into it: assigning to the original dict\n",
    "        # would alter the objects stored in df['tags'] (and possibly the raw\n",
    "        # 'data' they came from), e.g. by adding non-JSON Timestamp values.\n",
    "        tag_records.append({\n",
    "            **tag,\n",
    "            'toot_id': toot_id,\n",
    "            'toot_created_at': created_at,\n",
    "            'toot_account.id': account_id,\n",
    "        })\n",
    "\n",
    "tags = pd.json_normalize(tag_records)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c7a66a41-6913-4cea-8dbf-489a0a055530",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rank the co-occurring hashtags by how often they appear, most frequent first.\n",
    "print(\n",
    "    tags\n",
    "    .groupby('name')\n",
    "    .size()\n",
    "    .sort_values(ascending=False)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b58f3df-0890-4022-a02e-d849d35f104d",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Show the 25 most frequent hashtags as a bar plot.\n",
    "# value_counts() sorts by frequency in descending order, so the first 25 rows\n",
    "# are the most frequent values of the 'name' column.\n",
    "\n",
    "top_25_tags = tags['name'].value_counts().head(25)\n",
    "ax = top_25_tags.plot(kind='bar', title='Top 25 hashtags')\n",
    "ax.set(xlabel='Hashtag', ylabel='Number of occurrences');"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "39227e98-8e93-415e-8d83-e0f832e0bb13",
   "metadata": {},
   "source": [
    "## Federation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "977fd226-ee9d-4f5c-a102-df513bc8d7fc",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Count the posts per value of the 'instance' column and\n",
    "# print the counts ranked from largest to smallest.\n",
    "print(\n",
    "    df\n",
    "    .groupby('instance')\n",
    "    .size()\n",
    "    .sort_values(ascending=False)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba8d4f89-e24a-4133-a801-c3fe0104371d",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Bar plot of the 25 most frequent values of the 'instance' column.\n",
    "top_25_instances = (\n",
    "    df['instance']\n",
    "    .value_counts()\n",
    "    .head(25)\n",
    ")\n",
    "top_25_instances.plot(kind='bar')\n",
    "\n",
    "# Mastodon-wide analyses can now go into more detail."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}