266 lines
7.4 KiB
Plaintext
266 lines
7.4 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "9493a46f-12c3-4582-9723-7352e6aaeb95",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"###################################################\n",
|
||
"#___ ___ _ _ _ #\n",
|
||
"#| \\/ | | | | | | | #\n",
|
||
"#| . . | __ _ ___| |_ ___ __| | __ _| |_ __ _ #\n",
|
||
"#| |\\/| |/ _` / __| __/ _ \\ / _` |/ _` | __/ _` | #\n",
|
||
"#| | | | (_| \\__ \\ || (_) | (_| | (_| | || (_| | #\n",
|
||
"#\\_| |_/\\__,_|___/\\__\\___/ \\__,_|\\__,_|\\__\\__,_| #\n",
|
||
"####################################################### \n",
|
||
"# This script loads data from the hash tag timeline.\n",
|
||
"# Please fill in the correct path, where your data \n",
|
||
"# is stored.\n",
|
||
"#######################################################\n",
|
||
"# At the current stage, mastodata data loaders are meant\n",
|
||
"# to briefly explore collected data, which can then be\n",
|
||
"# filtered and exported. You can filter by date and\n",
|
||
"# hashtag, for the public and local timelines.\n",
|
||
"#######################################################\n",
|
||
"# This notebook loads data collected from the hashtag\n",
|
||
"# timeline. Currently there exist 3 collected hashtags:\n",
|
||
"# #russia, #ukraine and #fediblock.\n",
|
||
"# Posts were collected from EVERY instance available on\n",
|
||
"#'fedidb.org'.\n",
|
||
"#######################################################\n",
|
||
"# Three analytical sections are in this notebook.\n",
|
||
"# 1. Descriptive overview of the material\n",
|
||
"# 2. Co-Hashtags\n",
|
||
"# 3. Federation\n",
|
||
"#######################################################\n",
|
||
"\n",
|
||
"\n",
|
||
"import os\n",
|
||
"import json\n",
|
||
"from urllib.parse import urlparse\n",
|
||
"import pandas as pd\n",
|
||
"from glob import glob\n",
|
||
"import matplotlib.pyplot as plt"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "4c2d95da-e123-45ef-b419-618550bd26a6",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# ATTENTION: In this environment relative paths don't work with glob. \n",
|
||
"# Loads a sample of randomly selected servers of one month for the\n",
|
||
"# tag 'russia'. From 12-06-2024 to 12-07-2024 (DD-MM-YYYY).\n",
|
||
"path = f'/var/jupyter-data/jupyter-mastodata/data/hashtag search – russia/*'"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "72310631-a1d1-42e1-a5c4-3cc5f3812a1a",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Loading data for analysis\n",
|
||
"# Load data into variable 'data'\n",
|
||
"data = []\n",
|
||
"for fi in glob(path):\n",
|
||
" if os.path.isfile(fi) and fi.endswith('.json'):\n",
|
||
" with open(fi, 'r') as infile:\n",
|
||
" data += json.load(infile)\n",
|
||
"\n",
|
||
"# Load 'data' in a data frame \n",
|
||
"df = pd.json_normalize(data)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "975a131c-d7df-4fb3-8b51-749355bdeaed",
|
||
"metadata": {
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Converts 'created_at' to timezone-aware UTC datetimes (utc=True)\n",
|
||
"# That is necessary for date specific filtering\n",
|
||
"df['created_at'] = pd.to_datetime(df['created_at'], utc=True)\n",
|
||
"\n",
|
||
"# Adding the column 'instance' by extracting the domain name (and suffix) from the column 'url' of the post\n",
|
||
"df['instance'] = df['url'].apply(lambda x: urlparse(x)[1])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "6c98352d",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"## Top 25 posters by number of posts\n",
|
||
"\n",
|
||
"top_25_tags = (df['account.username'].value_counts()).iloc[:25]\n",
|
||
"top_25_tags.plot(kind='bar')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "ad8f6e0e-8e64-4d71-8f3e-b02b25517d03",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Stats "
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "8c41b3b1-6bce-4f6b-8bfb-8a6ebc072776",
|
||
"metadata": {
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"print('Posts in Total:', len(df)) \n",
|
||
"print('Amount of individual users:',len(pd.unique(df['account.username'])))\n",
|
||
"print('Average posts per user:',len(df)/len(pd.unique(df['account.username'])))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "34109a10-061a-46ec-a7fd-b1f2ab9a4d8c",
|
||
"metadata": {
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Visualization\n",
|
||
"# Show the number of toots per ISO calendar week\n",
|
||
"df_dates = df['created_at'].apply(lambda x: pd.to_datetime(x).tz_convert('UTC'))\n",
|
||
"df_dates.groupby(df_dates.dt.isocalendar().week).count().plot(kind=\"bar\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "2c3177c7-9af8-47de-9f84-97c2f930e59a",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Hashtags"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "d83dd089-fdd3-43fd-98ac-c2972540c1b4",
|
||
"metadata": {
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Analysis of Hashtags\n",
|
||
"# Create 'tags' data frame\n",
|
||
"tags = []\n",
|
||
"for row in df.iterrows():\n",
|
||
" for tag in row[1]['tags']:\n",
|
||
" tag['toot_id'] = row[1]['id']\n",
|
||
" tag['toot_created_at'] = row[1]['created_at']\n",
|
||
" tag['toot_account.id'] = row[1]['account.id'] \n",
|
||
" tags.append(tag)\n",
|
||
"\n",
|
||
"tags = pd.json_normalize(tags)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "c7a66a41-6913-4cea-8dbf-489a0a055530",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Co-Hashtags sorted by frequency\n",
|
||
"print(tags.groupby('name').size().sort_values(ascending=False))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "5b58f3df-0890-4022-a02e-d849d35f104d",
|
||
"metadata": {
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Show the 25 most frequent hashtags as a bar plot\n",
|
||
"# (value_counts() sorts by frequency; iloc[:25] keeps the top 25 names)\n",
|
||
"\n",
|
||
"top_25_tags = (tags['name'].value_counts()).iloc[:25]\n",
|
||
"top_25_tags.plot(kind='bar')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "39227e98-8e93-415e-8d83-e0f832e0bb13",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Federation"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "977fd226-ee9d-4f5c-a102-df513bc8d7fc",
|
||
"metadata": {
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Count posts per instance (the 'instance' column holds each post's domain)\n",
|
||
"# Rank them by amount\n",
|
||
"print(df.groupby('instance').size().sort_values(ascending=False))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "ba8d4f89-e24a-4133-a801-c3fe0104371d",
|
||
"metadata": {
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"\n",
|
||
"\n",
|
||
"top_25_instances = (df['instance'].value_counts()).iloc[:25]\n",
|
||
"top_25_instances.plot(kind='bar')\n",
|
||
"\n",
|
||
"# Mastodon-wide analyses can now become more detailed."
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.10.14"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|