Add files via upload
Initial upload of Mastodata
This commit is contained in:
471
analysis_notebook_local+public.ipynb
Normal file
471
analysis_notebook_local+public.ipynb
Normal file
@@ -0,0 +1,471 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6ad03636-9c4b-48f1-b85d-6ae8527d9984",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"###################################################\n",
|
||||
"#___ ___ _ _ _ #\n",
|
||||
"#| \\/ | | | | | | | #\n",
|
||||
"#| . . | __ _ ___| |_ ___ __| | __ _| |_ __ _ #\n",
|
||||
"#| |\\/| |/ _` / __| __/ _ \\ / _` |/ _` | __/ _` | #\n",
|
||||
"#| | | | (_| \\__ \\ || (_) | (_| | (_| | || (_| | #\n",
|
||||
"#\\_| |_/\\__,_|___/\\__\\___/ \\__,_|\\__,_|\\__\\__,_| #\n",
|
||||
"################################################### \n",
|
||||
"# This script loads data from the public and local tag timeline.\n",
|
||||
"# Please fill in the correct path, where your data \n",
|
||||
"# is stored.\n",
|
||||
"#######################################################\n",
|
||||
"# At the current stage, mastodata data loaders are meant\n",
|
||||
"# to briefly explore collected data, filtered and then \n",
|
||||
"# exported. You can filter by date and hashtag, for \n",
|
||||
"# public and local timeline.\n",
|
||||
"#######################################################\n",
|
||||
"# There are three analytical sections in this notebook\n",
|
||||
"# for each timeline.\n",
|
||||
"# 1. Descriptive overview of the material\n",
|
||||
"# 2. Co-Hashtags\n",
|
||||
"# 3. Federation\n",
|
||||
"#######################################################\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"import json\n",
|
||||
"import pandas as pd\n",
|
||||
"from glob import glob\n",
|
||||
"import matplotlib.pyplot as plt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e8bc49e9-341e-4507-9221-a1bf590921e5",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Data for 'timeline_public'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "de8f017c-3d2c-4c97-a9bc-c586aebeb20c",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Loading data: Public data is loaded to variable 'df_public'.\n",
|
||||
"# Given path must lead directly to json files from the collection and end with '/*'\n",
|
||||
"# E.g. 'YOUR/PATH/timeline_public/*' \n",
|
||||
"path_public = \n",
|
||||
"\n",
|
||||
"data_public = []\n",
|
||||
"for fi in glob(path_public):\n",
|
||||
" if os.path.isfile(fi) and fi.endswith('.json'):\n",
|
||||
" with open(fi, 'r') as infile:\n",
|
||||
" data_public += json.load(infile)\n",
|
||||
" \n",
|
||||
"df_public = pd.json_normalize(data_public)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e64f6782-35a9-406d-9cca-40ac40f35127",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Stats"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2de303b6-25d7-4212-9580-54cbd9c43cfe",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Add average posts per MAU \n",
|
||||
"print('Posts in Total:', len(df_public)) \n",
|
||||
"print('Amount of individual users:',len(pd.unique(df_public['account.username'])))\n",
|
||||
"print('Average posts per user:',len(df_public)/len(pd.unique(df_public['account.username'])))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9e8e561d-1783-4cd5-accd-1a8f3de6c973",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Show amount of toots sorted by (calendar) week\n",
|
||||
"# Note: Week twenty three is not fully part of the data set. Therefore that week is incomplete. Keep that in mind during analysis.\n",
|
||||
"df_public_dates = df_public[\"created_at\"].astype(\"datetime64\")\n",
|
||||
"df_public_dates.groupby(df_public_dates.dt.isocalendar().week).count().plot(kind=\"bar\")\n",
|
||||
"\n",
|
||||
"## Keep in mind, week 23 was not fully archived. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b54a52b0-da74-4119-a08b-a6b35bb1bb6d",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Postings distributed by weekdays \n",
|
||||
"df_public_dates.groupby(df_public_dates.dt.isocalendar().day).count().plot(kind=\"bar\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3c9c0fe9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1fd7e75d-83e3-4073-b39f-16b93b82a694",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Date specific filtering\n",
|
||||
"############################\n",
|
||||
"# Please insert the time frame by \n",
|
||||
"# specifying a start date and an end date \n",
|
||||
"\n",
|
||||
"#start_date = \n",
|
||||
"#end_date = \n",
|
||||
"\n",
|
||||
"\n",
|
||||
"#mask = (df_public['created_at'] > start_date) & (df_public['created_at'] <= end_date)\n",
|
||||
"\n",
|
||||
"# Uncommment below to overwrite existing data frame\n",
|
||||
"\n",
|
||||
"#df_public = df_public.loc[mask]\n",
|
||||
"\n",
|
||||
"# Uncomment below to create subset called 'filtered_df'\n",
|
||||
"#filtered_df = df_public.loc[mask]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8497e393-ac7f-45e8-824e-3cbc874fb2e2",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Hashtag analysis"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d299dd8a-ca4c-4069-94d7-b4dfd5540897",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Um die Tags zu analysieren macht es Sinn eine Eigene Tabelle zu erstellen, die sämtliche Tags und Informationen zu diesen enthält. \n",
|
||||
"# Hier ist ein Beispiel um eine solche Tabelle zu erstellen. Neben dem Tag wird die Toot ID, das Toot created_at sowie der account in jeder Zeile mitgeführt. \n",
|
||||
"# Das kann man je nach analytischem Interesse erweitern. Für Datensparsamkeit beim speichern würde die Toot ID reichen. Den rest kann man mit der anderen Tabelle \n",
|
||||
"# auch joinen mit pd.merge(df, tags, left_on='id', right_on='toot_id').\n",
|
||||
"\n",
|
||||
"tags_public = []\n",
|
||||
"for row in df_public.iterrows():\n",
|
||||
" for tag in row[1]['tags']:\n",
|
||||
" tag['toot_id'] = row[1]['id']\n",
|
||||
" tag['toot_created_at'] = row[1]['created_at']\n",
|
||||
" tag['toot_account.id'] = row[1]['account.id'] \n",
|
||||
" tags_public.append(tag)\n",
|
||||
"\n",
|
||||
"tags_public = pd.json_normalize(tags_public)\n",
|
||||
"# For a first look\n",
|
||||
"tags_public.head(5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7c9d406d-458e-4cc4-b6b7-85434e66dc98",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Hashtags sorted by frequency\n",
|
||||
"print(tags_public.groupby('name').size().sort_values(ascending=False))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2345142d-b9a2-4011-822e-81dca9000034",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Show Top 10 Toots as bar plot\n",
|
||||
"#find values with top 10 occurrences in 'my_column'\n",
|
||||
"top_10 = (tags_public['name'].value_counts()).iloc[:10]\n",
|
||||
"top_10.plot(kind='bar')\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "80d37aab",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Show top 25 Users by amount of posts\n",
|
||||
"\n",
|
||||
"top_25_users = (df_public['account.username'].value_counts()).iloc[:25]\n",
|
||||
"top_25_users.plot(kind='bar')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "262aa947-9b5c-4550-8b36-b09ed81a2ae7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Extract toots containing a certain hashtag\n",
|
||||
"# Insert hashtags tags to import in a seperate data set\n",
|
||||
"# Hashtag name must be inserted within quotes!\n",
|
||||
"\n",
|
||||
"my_tags_public = []\n",
|
||||
"\n",
|
||||
"df_my_tags_public = tags_public[tags_public['name'].isin(my_tags_public)]\n",
|
||||
"\n",
|
||||
"df_my_tags_public.head() \n",
|
||||
"\n",
|
||||
"# uncomment the following command to create a csv-file for further examination\n",
|
||||
"\n",
|
||||
"#df_my_tags_public.to_csv('exports/my_tags_public.csv')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "adab07a1-1ba4-407a-8151-cc890fe6603a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Data for 'timeline_local'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3131ef1f-6f8d-4dcb-a801-553019a79e90",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Loading data: Public data is loaded to variable 'df_public'.\n",
|
||||
"# ATTENTION: In this environment relative paths don't work with glob. \n",
|
||||
"path_local = '/var/jupyter-data/jupyter-mastodata/data/mastodon.social/timeline_local/*'\n",
|
||||
"\n",
|
||||
"data_local = []\n",
|
||||
"for fi in glob(path_local):\n",
|
||||
" if os.path.isfile(fi) and fi.endswith('.json'):\n",
|
||||
" with open(fi, 'r') as infile:\n",
|
||||
" data_local += json.load(infile)\n",
|
||||
" \n",
|
||||
"df_local = pd.json_normalize(data_local)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "10a6cafb-861e-4c8b-bc1f-e0fd5f67e7ce",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Stats"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c5133131-1a74-4c61-8f3c-4044612a0394",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Add average posts per MAU\n",
|
||||
"print('Posts in Total:', len(df_local)) \n",
|
||||
"print('Amount of individual users:',len(pd.unique(df_local['account.username'])))\n",
|
||||
"print('Average posts per user:',len(df_local)/len(pd.unique(df_local['account.username'])))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c3c20029-0d0a-48e8-9632-5bab4b21d99b",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Change datetype of 'created_at'\n",
|
||||
"df_local_dates = df_local[\"created_at\"].astype(\"datetime64\")\n",
|
||||
"# Show amount of toots sorted by (calendar) week\n",
|
||||
"df_local_dates.groupby(df_local_dates.dt.isocalendar().week).count().plot(kind=\"bar\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "dfbd773c-7cbc-4fe8-8343-934fe747faa2",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Postings distributed by weekdays\n",
|
||||
"df_local_dates.groupby(df_local_dates.dt.isocalendar().day).count().plot(kind=\"bar\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5b99b490",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Show top 25 Users by amount of posts\n",
|
||||
"\n",
|
||||
"top_25_users = (df_local['account.username'].value_counts()).iloc[:25]\n",
|
||||
"top_25_users.plot(kind='bar')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f24fccb8-0cb6-4241-89a8-36cd714e235e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Hashtag analysis"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3f0c3c70-aeb5-4359-9319-f3467a8b7162",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Um die Tags zu analysieren macht es Sinn eine Eigene Tabelle zu erstellen, die sämtliche Tags und Informationen zu diesen enthält. \n",
|
||||
"# Hier ist ein Beispiel um eine solche Tabelle zu erstellen. Neben dem Tag wird die Toot ID, das Toot created_at sowie der account in jeder Zeile mitgeführt. \n",
|
||||
"# Das kann man je nach analytischem Interesse erweitern. Für Datensparsamkeit beim speichern würde die Toot ID reichen. Den rest kann man mit der anderen Tabelle \n",
|
||||
"# auch joinen mit pd.merge(df, tags, left_on='id', right_on='toot_id').\n",
|
||||
"\n",
|
||||
"tags_local = []\n",
|
||||
"for row in df_local.iterrows():\n",
|
||||
" for tag in row[1]['tags']:\n",
|
||||
" tag['toot_id'] = row[1]['id']\n",
|
||||
" tag['toot_created_at'] = row[1]['created_at']\n",
|
||||
" tag['toot_account.id'] = row[1]['account.id'] \n",
|
||||
" tags_local.append(tag)\n",
|
||||
"\n",
|
||||
"tags_local = pd.json_normalize(tags_local)\n",
|
||||
"tags_local.head(5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "561054f2-d8db-47d9-aa15-c0ea5f06388d",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Hashtags sorted by frequency\n",
|
||||
"tags_local.groupby('name').size().sort_values(ascending=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "69bba3a4-03b0-4cd1-8172-3418ad06b446",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Show Top 10 Toots as bar plot\n",
|
||||
"#find values with top 10 occurrences in 'my_column'\n",
|
||||
"top_10_tags = (tags_local['name'].value_counts()).iloc[:10]\n",
|
||||
"top_10_tags.plot(kind='bar')\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "90785572",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "54ed0e60-9349-4cf1-b987-04aa3aa07037",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Extract toots containing a certain hashtag\n",
|
||||
"# Insert hashtags tags to import in a seperate data set\n",
|
||||
"# Hashtag name must be inserted within quotes!\n",
|
||||
"\n",
|
||||
"my_tags_local = []\n",
|
||||
"\n",
|
||||
"df_my_tags_local = tags_local[tags_local['name'].isin(my_tags_local)]\n",
|
||||
"\n",
|
||||
"df_my_tags_local.head() \n",
|
||||
"\n",
|
||||
"# uncomment the following command to create a csv-file for further examination\n",
|
||||
"\n",
|
||||
"#df_my_tags_local.to_csv('exports/my_tags_local.csv')"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Reference in New Issue
Block a user