Files
Mastodata/analysis_notebook_hashtag.ipynb
Almex c14d637b2f Update analysis_notebook_hashtag.ipynb
Fix of date time transformation for date specific filtering.
2024-09-12 09:48:05 +02:00

267 lines
7.5 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "9493a46f-12c3-4582-9723-7352e6aaeb95",
"metadata": {},
"outputs": [],
"source": [
"###################################################\n",
"#___ ___ _ _ _ #\n",
"#| \\/ | | | | | | | #\n",
"#| . . | __ _ ___| |_ ___ __| | __ _| |_ __ _ #\n",
"#| |\\/| |/ _` / __| __/ _ \\ / _` |/ _` | __/ _` | #\n",
"#| | | | (_| \\__ \\ || (_) | (_| | (_| | || (_| | #\n",
"#\\_| |_/\\__,_|___/\\__\\___/ \\__,_|\\__,_|\\__\\__,_| #\n",
"####################################################### \n",
"# written by Alexander Martin and Marcus Burkhardt #\n",
"# This script loads data from the hashtag timeline.\n",
"# Please fill in the correct path where your data \n",
"# is stored.\n",
"#######################################################\n",
"# At the current stage, mastodata data loaders are meant\n",
"# to briefly explore collected data, which can then be\n",
"# filtered and exported. You can filter by date and\n",
"# hashtag, for the public and local timelines.\n",
"#######################################################\n",
"# This notebook loads data collected from the hashtag\n",
"# timeline. Currently there exist 3 collected hashtags:\n",
"# #russia, #ukraine and #fediblock.\n",
"# Posts were collected from EVERY instance available on\n",
"# 'fedidb.org'.\n",
"#######################################################\n",
"# Three analytical sections are in this notebook.\n",
"# 1. Descriptive overview of the material\n",
"# 2. Co-Hashtags\n",
"# 3. Federation\n",
"#######################################################\n",
"\n",
"\n",
"import os\n",
"import json\n",
"from urllib.parse import urlparse\n",
"import pandas as pd\n",
"from glob import glob\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c2d95da-e123-45ef-b419-618550bd26a6",
"metadata": {},
"outputs": [],
"source": [
"# ATTENTION: In this environment relative paths don't work with glob,\n",
"# so the data directory is given as an absolute path.\n",
"# Glob pattern matching every entry of the 'hashtag search russia' directory.\n",
"# NOTE(review): this cell was previously described as a one-month sample\n",
"# (12-06-2024 - 12-07-2024) for the tag 'ukraine'; confirm which hashtag\n",
"# and date range this directory actually contains.\n",
"path = '/var/jupyter-data/jupyter-mastodata/data/hashtag search russia/*'"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "72310631-a1d1-42e1-a5c4-3cc5f3812a1a",
"metadata": {},
"outputs": [],
"source": [
"# Loading data for analysis\n",
"# Load every JSON file matched by 'path' into the list 'data'.\n",
"# Files are visited in sorted order so repeated runs build the same frame.\n",
"data = []\n",
"for fi in sorted(glob(path)):\n",
"    if os.path.isfile(fi) and fi.endswith('.json'):\n",
"        # Read as UTF-8 explicitly instead of relying on the locale default.\n",
"        with open(fi, 'r', encoding='utf-8') as infile:\n",
"            data.extend(json.load(infile))\n",
"\n",
"# Flatten the records into a data frame; nested keys become dotted\n",
"# column names such as 'account.username'.\n",
"df = pd.json_normalize(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "975a131c-d7df-4fb3-8b51-749355bdeaed",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Parse 'created_at' into timezone-aware datetimes in UTC (utc=True).\n",
"# That is necessary for date specific filtering. Values that cannot be\n",
"# parsed become NaT because of errors='coerce'.\n",
"df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce', utc=True)\n",
"\n",
"# Adding the column 'instance' by extracting the host name (network location)\n",
"# from the column 'url' of the post. Posts whose 'url' is not a string\n",
"# (e.g. a missing value) get None instead of raising or yielding a bytes value.\n",
"df['instance'] = df['url'].apply(\n",
"    lambda x: urlparse(x).netloc if isinstance(x, str) else None\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6c98352d",
"metadata": {},
"outputs": [],
"source": [
"## Top 25 posters regarding amount of messages\n",
"# value_counts() sorts by frequency, so the first 25 entries are the top 25.\n",
"# NOTE(review): 'account.username' is counted as-is; presumably the same\n",
"# username can exist on several instances - confirm whether a more specific\n",
"# account column should be used.\n",
"\n",
"top_25_posters = df['account.username'].value_counts().head(25)\n",
"ax = top_25_posters.plot(kind='bar')\n",
"ax.set(title='Top 25 posters', xlabel='Username', ylabel='Number of posts');"
]
},
{
"cell_type": "markdown",
"id": "ad8f6e0e-8e64-4d71-8f3e-b02b25517d03",
"metadata": {},
"source": [
"## Stats "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8c41b3b1-6bce-4f6b-8bfb-8a6ebc072776",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Basic counts: number of posts, number of distinct usernames and their ratio.\n",
"print('Posts in Total:', df.shape[0])\n",
"print('Amount of individual users:', df['account.username'].unique().size)\n",
"print('Average posts per user:', df.shape[0] / df['account.username'].unique().size)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "34109a10-061a-46ec-a7fd-b1f2ab9a4d8c",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Visualization\n",
"# Show amount of toots per ISO calendar week.\n",
"# The whole column is converted at once; this is idempotent, so the cell\n",
"# works whether or not 'created_at' has already been parsed.\n",
"df_dates = pd.to_datetime(df['created_at'], errors='coerce', utc=True)\n",
"# Group by ISO year AND week so that equal week numbers of different\n",
"# years are not merged into one bar.\n",
"iso = df_dates.dt.isocalendar()\n",
"df_dates.groupby([iso.year, iso.week]).count().plot(kind='bar')"
]
},
{
"cell_type": "markdown",
"id": "2c3177c7-9af8-47de-9f84-97c2f930e59a",
"metadata": {},
"source": [
"## Hashtags"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d83dd089-fdd3-43fd-98ac-c2972540c1b4",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Analysis of Hashtags\n",
"# Create 'tags' data frame: one row per (toot, hashtag) pair, carrying the\n",
"# id, creation time and account id of the toot the hashtag was used in.\n",
"tag_records = []\n",
"toot_columns = zip(df['tags'], df['id'], df['created_at'], df['account.id'])\n",
"for toot_tags, toot_id, toot_created_at, toot_account_id in toot_columns:\n",
"    # Toots without a tag list (e.g. a missing value) contribute no rows.\n",
"    if not isinstance(toot_tags, list):\n",
"        continue\n",
"    for tag in toot_tags:\n",
"        # Copy the tag dict so the objects stored in df['tags'] stay untouched.\n",
"        tag_records.append({\n",
"            **tag,\n",
"            'toot_id': toot_id,\n",
"            'toot_created_at': toot_created_at,\n",
"            'toot_account.id': toot_account_id,\n",
"        })\n",
"\n",
"tags = pd.json_normalize(tag_records)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c7a66a41-6913-4cea-8dbf-489a0a055530",
"metadata": {},
"outputs": [],
"source": [
"# Number of occurrences of every hashtag name, most frequent first.\n",
"print(\n",
"    tags.groupby(by='name')\n",
"    .size()\n",
"    .sort_values(ascending=False)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5b58f3df-0890-4022-a02e-d849d35f104d",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Show the 25 most frequent hashtags as a bar plot.\n",
"# value_counts() sorts by frequency, so the first 25 entries are the top 25.\n",
"\n",
"top_25_tags = tags['name'].value_counts().head(25)\n",
"ax = top_25_tags.plot(kind='bar')\n",
"ax.set(title='Top 25 hashtags', xlabel='Hashtag', ylabel='Number of uses');"
]
},
{
"cell_type": "markdown",
"id": "39227e98-8e93-415e-8d83-e0f832e0bb13",
"metadata": {},
"source": [
"## Federation"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "977fd226-ee9d-4f5c-a102-df513bc8d7fc",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Number of posts per value of the 'instance' column,\n",
"# ranked from most to least frequent.\n",
"print(\n",
"    df.groupby(by='instance')\n",
"    .size()\n",
"    .sort_values(ascending=False)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ba8d4f89-e24a-4133-a801-c3fe0104371d",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# The 25 most frequent values of 'instance', shown as a bar chart.\n",
"top_25_instances = df['instance'].value_counts().head(25)\n",
"top_25_instances.plot(kind='bar')\n",
"\n",
"# Mastodon-wide analyses can now become more detailed."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
}