Add files via upload

Initial upload of Mastodata
This commit is contained in:
Almex
2024-09-05 17:37:42 +02:00
committed by GitHub
parent b8f02d76f3
commit 7eff3ea12d
6 changed files with 1514 additions and 1 deletions

View File

@@ -0,0 +1,265 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "9493a46f-12c3-4582-9723-7352e6aaeb95",
"metadata": {},
"outputs": [],
"source": [
"###################################################\n",
"#___ ___ _ _ _ #\n",
"#| \\/ | | | | | | | #\n",
"#| . . | __ _ ___| |_ ___ __| | __ _| |_ __ _ #\n",
"#| |\\/| |/ _` / __| __/ _ \\ / _` |/ _` | __/ _` | #\n",
"#| | | | (_| \\__ \\ || (_) | (_| | (_| | || (_| | #\n",
"#\\_| |_/\\__,_|___/\\__\\___/ \\__,_|\\__,_|\\__\\__,_| #\n",
"####################################################### \n",
"# This script loads data from the hash tag timeline.\n",
"# Please fill in the correct path, where your data \n",
"# is stored.\n",
"#######################################################\n",
"# At the current stage, mastodata data loaders are meant\n",
"# to briefly explore collected data, filtered and then \n",
"# exported. You can filter by date and hashtag, for \n",
"# public and local timeline.\n",
"#######################################################\n",
"# This notebook loads data collected from the hashtag\n",
"# timeline. Currently there exist 3 collected hashtags:\n",
"# #russia, #ukraine and #fediblock.\n",
"# Posts were collected from EVERY instance available on\n",
"#'fedidb.org'.\n",
"#######################################################\n",
"# Three analytical sections are in this notebook.\n",
"# 1. Descriptive overview of the material\n",
"# 2. Co-Hashtags\n",
"# 3. Federation\n",
"#######################################################\n",
"\n",
"\n",
"import os\n",
"import json\n",
"from urllib.parse import urlparse\n",
"import pandas as pd\n",
"from glob import glob\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c2d95da-e123-45ef-b419-618550bd26a6",
"metadata": {},
"outputs": [],
"source": [
"# ATTENTION: In this environment relative paths don't work with glob.\n",
"# Glob pattern matching every file collected for one hashtag.\n",
"# NOTE(review): an earlier version of this comment described a one-month\n",
"# sample for the tag 'ukraine' (12-06-2024 - 12-07-2024), while the path\n",
"# points at the 'russia' directory -- confirm what this directory holds.\n",
"hashtag = 'russia'\n",
"path = f'/var/jupyter-data/jupyter-mastodata/data/hashtag search {hashtag}/*'"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "72310631-a1d1-42e1-a5c4-3cc5f3812a1a",
"metadata": {},
"outputs": [],
"source": [
"# Loading data for analysis\n",
"# Collect the content of every '.json' file matched by 'path' in 'data'.\n",
"# Assumes each file holds a JSON list of post objects -- TODO confirm.\n",
"# Files are visited in sorted order so the row order is reproducible.\n",
"data = []\n",
"for fi in sorted(glob(path)):\n",
"    if os.path.isfile(fi) and fi.endswith('.json'):\n",
"        # Read as UTF-8 explicitly instead of relying on the platform default.\n",
"        with open(fi, 'r', encoding='utf-8') as infile:\n",
"            data.extend(json.load(infile))\n",
"\n",
"# Fail early with a clear message instead of a KeyError in a later cell.\n",
"if not data:\n",
"    raise FileNotFoundError(f'No posts found in JSON files matching {path!r}')\n",
"\n",
"# Load 'data' in a data frame\n",
"df = pd.json_normalize(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "975a131c-d7df-4fb3-8b51-749355bdeaed",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Parse 'created_at' into timezone-aware UTC datetimes.\n",
"# That is necessary for date specific filtering\n",
"df['created_at'] = pd.to_datetime(df['created_at'], utc=True)\n",
"\n",
"# Add the column 'instance': the network location (host) part of the post's 'url'.\n",
"# Posts whose 'url' is not a string get None, so they are left out of the\n",
"# per-instance counts instead of producing a bytes value or an exception.\n",
"df['instance'] = df['url'].apply(lambda x: urlparse(x).netloc if isinstance(x, str) else None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6c98352d",
"metadata": {},
"outputs": [],
"source": [
"## Top 25 posters by number of posts\n",
"# NOTE(review): counting is done on 'account.username' only, so equal usernames\n",
"# on different servers would be counted together -- confirm this is intended.\n",
"\n",
"top_25_posters = df['account.username'].value_counts().iloc[:25]\n",
"ax = top_25_posters.plot(kind='bar', title='Top 25 posters by number of posts')\n",
"ax.set_xlabel('account.username')\n",
"ax.set_ylabel('Posts');"
]
},
{
"cell_type": "markdown",
"id": "ad8f6e0e-8e64-4d71-8f3e-b02b25517d03",
"metadata": {},
"source": [
"## Stats "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8c41b3b1-6bce-4f6b-8bfb-8a6ebc072776",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Basic figures: total posts, distinct usernames and their ratio\n",
"total_posts = len(df)\n",
"user_count = len(pd.unique(df['account.username']))\n",
"print('Posts in Total:', total_posts)\n",
"print('Amount of individual users:', user_count)\n",
"print('Average posts per user:', total_posts / user_count)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "34109a10-061a-46ec-a7fd-b1f2ab9a4d8c",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Visualization\n",
"# Show amount of toots per ISO calendar week.\n",
"# The conversion is vectorised; it works on parsed datetimes and on timestamp strings.\n",
"df_dates = pd.to_datetime(df['created_at'], utc=True)\n",
"\n",
"# Group by ISO year AND week so equal week numbers of different years stay separate.\n",
"iso_calendar = df_dates.dt.isocalendar()\n",
"weekly_counts = df_dates.groupby([iso_calendar['year'], iso_calendar['week']]).count()\n",
"ax = weekly_counts.plot(kind='bar', title='Posts per ISO calendar week')\n",
"ax.set_xlabel('ISO year, week')\n",
"ax.set_ylabel('Posts');"
]
},
{
"cell_type": "markdown",
"id": "2c3177c7-9af8-47de-9f84-97c2f930e59a",
"metadata": {},
"source": [
"## Hashtags"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d83dd089-fdd3-43fd-98ac-c2972540c1b4",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Analysis of Hashtags\n",
"# Build one record per (toot, hashtag) pair. Each record is a copy of the tag\n",
"# dict extended by the toot's id, timestamp and account id, so the dicts stored\n",
"# in df['tags'] are not modified.\n",
"# Assumes every value in df['tags'] is a list of dicts -- TODO confirm.\n",
"tag_records = [\n",
"    {\n",
"        **tag,\n",
"        'toot_id': toot_id,\n",
"        'toot_created_at': toot_created_at,\n",
"        'toot_account.id': toot_account_id,\n",
"    }\n",
"    for toot_id, toot_created_at, toot_account_id, toot_tags in zip(\n",
"        df['id'], df['created_at'], df['account.id'], df['tags']\n",
"    )\n",
"    for tag in toot_tags\n",
"]\n",
"\n",
"# Create 'tags' data frame\n",
"tags = pd.json_normalize(tag_records)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c7a66a41-6913-4cea-8dbf-489a0a055530",
"metadata": {},
"outputs": [],
"source": [
"# Number of occurrences per hashtag name, most frequent first\n",
"tag_counts = tags.groupby('name').size()\n",
"print(tag_counts.sort_values(ascending=False))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5b58f3df-0890-4022-a02e-d849d35f104d",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Show the 25 most frequent hashtag names as bar plot\n",
"\n",
"top_25_tags = tags['name'].value_counts().iloc[:25]\n",
"ax = top_25_tags.plot(kind='bar', title='Top 25 hashtags')\n",
"ax.set_xlabel('Hashtag')\n",
"ax.set_ylabel('Occurrences');"
]
},
{
"cell_type": "markdown",
"id": "39227e98-8e93-415e-8d83-e0f832e0bb13",
"metadata": {},
"source": [
"## Federation"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "977fd226-ee9d-4f5c-a102-df513bc8d7fc",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Number of posts per value of the 'instance' column, largest first\n",
"instance_counts = df.groupby('instance').size()\n",
"print(instance_counts.sort_values(ascending=False))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ba8d4f89-e24a-4133-a801-c3fe0104371d",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Show the 25 instances with the most posts as bar plot\n",
"top_25_instances = df['instance'].value_counts().iloc[:25]\n",
"ax = top_25_instances.plot(kind='bar', title='Top 25 instances by number of posts')\n",
"ax.set_xlabel('Instance')\n",
"ax.set_ylabel('Posts');\n",
"\n",
"# Mastodon-wide analyses can now become more detailed."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
}