{ "cells": [
{ "cell_type": "code", "execution_count": null, "id": "6ad03636-9c4b-48f1-b85d-6ae8527d9984", "metadata": { "tags": [] }, "outputs": [], "source": [
"###################################################\n",
"#___ ___ _ _ _ #\n",
"#| \\/ | | | | | | | #\n",
"#| . . | __ _ ___| |_ ___ __| | __ _| |_ __ _ #\n",
"#| |\\/| |/ _` / __| __/ _ \\ / _` |/ _` | __/ _` | #\n",
"#| | | | (_| \\__ \\ || (_) | (_| | (_| | || (_| | #\n",
"#\\_| |_/\\__,_|___/\\__\\___/ \\__,_|\\__,_|\\__\\__,_| #\n",
"###################################################\n",
"# This notebook loads data from the public and the local timeline.\n",
"# Please fill in the correct path where your data is stored.\n",
"#######################################################\n",
"# At the current stage, the Mastodata data loaders are meant\n",
"# for briefly exploring the collected data, which can then be\n",
"# filtered and exported. You can filter by date and hashtag,\n",
"# both for the public and for the local timeline.\n",
"#######################################################\n",
"# There are two analytical sections in this notebook\n",
"# for each timeline:\n",
"# 1. Descriptive overview of the material\n",
"# 2. Hashtag analysis\n",
"#######################################################\n",
"\n",
"import os\n",
"import json\n",
"import pandas as pd\n",
"from glob import glob\n",
"import matplotlib.pyplot as plt"
] },
{ "cell_type": "markdown", "id": "e8bc49e9-341e-4507-9221-a1bf590921e5", "metadata": {}, "source": [ "## Data for 'timeline_public'" ] },
{ "cell_type": "code", "execution_count": null, "id": "de8f017c-3d2c-4c97-a9bc-c586aebeb20c", "metadata": { "tags": [] }, "outputs": [], "source": [
"# Loading data: public timeline data is loaded into the variable 'df_public'.\n",
"# The given path must lead directly to the JSON files from the collection and end with '/*',\n",
"# e.g. 'YOUR/PATH/timeline_public/*'\n",
"path_public = 'YOUR/PATH/timeline_public/*'\n",
"\n",
"data_public = []\n",
"for fi in glob(path_public):\n",
"    if os.path.isfile(fi) and fi.endswith('.json'):\n",
"        with open(fi, 'r') as infile:\n",
"            data_public += json.load(infile)\n",
"\n",
"df_public = pd.json_normalize(data_public)"
] },
{ "cell_type": "markdown", "id": "e64f6782-35a9-406d-9cca-40ac40f35127", "metadata": {}, "source": [ "## Stats" ] },
{ "cell_type": "code", "execution_count": null, "id": "2de303b6-25d7-4212-9580-54cbd9c43cfe", "metadata": { "tags": [] }, "outputs": [], "source": [
"# TODO: add average posts per MAU (see the posts-per-MAU sketch in the local timeline section)\n",
"print('Posts in total:', len(df_public))\n",
"print('Number of individual users:', len(pd.unique(df_public['account.username'])))\n",
"print('Average posts per user:', len(df_public)/len(pd.unique(df_public['account.username'])))"
] },
{ "cell_type": "code", "execution_count": null, "id": "9e8e561d-1783-4cd5-accd-1a8f3de6c973", "metadata": { "tags": [] }, "outputs": [], "source": [
"# Show the number of toots per calendar week\n",
"# Note: calendar week 23 is not fully part of the data set, so the count for that week\n",
"# is incomplete. Keep that in mind during analysis.\n",
"df_public_dates = pd.to_datetime(df_public['created_at'])\n",
"df_public_dates.groupby(df_public_dates.dt.isocalendar().week).count().plot(kind='bar')"
] },
{ "cell_type": "code", "execution_count": null, "id": "b54a52b0-da74-4119-a08b-a6b35bb1bb6d", "metadata": { "tags": [] }, "outputs": [], "source": [
"# Postings distributed over weekdays (ISO: 1 = Monday, ..., 7 = Sunday)\n",
"df_public_dates.groupby(df_public_dates.dt.isocalendar().day).count().plot(kind='bar')"
] },
{ "cell_type": "code", "execution_count": null, "id": "3c9c0fe9", "metadata": {}, "outputs": [], "source": [] },
{ "cell_type": "code", "execution_count": null, "id": "1fd7e75d-83e3-4073-b39f-16b93b82a694", "metadata": { "tags": [] }, "outputs": [], "source": [
"# Date-specific filtering\n",
"############################\n",
"# Please insert the time frame by specifying a start date\n",
"# and an end date as ISO-formatted strings, e.g. '2022-05-01'\n",
"\n",
"#start_date = \n",
"#end_date = \n",
"\n",
"#mask = (df_public['created_at'] > start_date) & (df_public['created_at'] <= end_date)\n",
"\n",
"# Uncomment below to overwrite the existing data frame\n",
"#df_public = df_public.loc[mask]\n",
"\n",
"# Uncomment below to create a subset called 'filtered_df'\n",
"#filtered_df = df_public.loc[mask]"
] },
{ "cell_type": "markdown", "id": "8497e393-ac7f-45e8-824e-3cbc874fb2e2", "metadata": {}, "source": [ "## Hashtag analysis" ] },
{ "cell_type": "code", "execution_count": null, "id": "d299dd8a-ca4c-4069-94d7-b4dfd5540897", "metadata": { "tags": [] }, "outputs": [], "source": [
"# To analyse the tags it makes sense to create a separate table that contains all tags and the\n",
"# information attached to them. Here is an example of how to build such a table: besides the tag,\n",
"# each row carries the toot ID, the toot's created_at and the account ID. This can be extended\n",
"# depending on your analytical interest. For data minimisation when saving, the toot ID alone\n",
"# would suffice; the rest can be joined back in from the toot table with\n",
"# pd.merge(df_public, tags_public, left_on='id', right_on='toot_id').\n",
"\n",
"tags_public = []\n",
"for _, row in df_public.iterrows():\n",
"    for tag in row['tags']:\n",
"        tag['toot_id'] = row['id']\n",
"        tag['toot_created_at'] = row['created_at']\n",
"        tag['toot_account.id'] = row['account.id']\n",
"        tags_public.append(tag)\n",
"\n",
"tags_public = pd.json_normalize(tags_public)\n",
"# For a first look\n",
"tags_public.head(5)"
] },
{ "cell_type": "code", "execution_count": null, "id": "7c9d406d-458e-4cc4-b6b7-85434e66dc98", "metadata": { "tags": [] }, "outputs": [], "source": [
"# Hashtags sorted by frequency\n",
"print(tags_public.groupby('name').size().sort_values(ascending=False))"
] },
{ "cell_type": "code", "execution_count": null, "id": "2345142d-b9a2-4011-822e-81dca9000034", "metadata": { "tags": [] }, "outputs": [], "source": [
"# Show the top 10 hashtags as a bar plot\n",
"# Find the values with the top 10 occurrences in the 'name' column\n",
"top_10 = (tags_public['name'].value_counts()).iloc[:10]\n",
"top_10.plot(kind='bar')"
] },
{ "cell_type": "code", "execution_count": null, "id": "80d37aab", "metadata": {}, "outputs": [], "source": [
"# Show the top 25 users by number of posts\n",
"\n",
"top_25_users = (df_public['account.username'].value_counts()).iloc[:25]\n",
"top_25_users.plot(kind='bar')"
] },
{ "cell_type": "code", "execution_count": null, "id": "262aa947-9b5c-4550-8b36-b09ed81a2ae7", "metadata": {}, "outputs": [], "source": [
"# Extract toots containing certain hashtags\n",
"# Insert the hashtag names to extract into a separate data set.\n",
"# Hashtag names must be given as quoted strings (without the leading '#').\n",
"\n",
"my_tags_public = []\n",
"\n",
"df_my_tags_public = tags_public[tags_public['name'].isin(my_tags_public)]\n",
"\n",
"df_my_tags_public.head()\n",
"\n",
"# Uncomment the following command to create a CSV file for further examination\n",
"# (the 'exports' directory must exist)\n",
"\n",
"#df_my_tags_public.to_csv('exports/my_tags_public.csv')"
] },
{ "cell_type": "markdown", "id": "adab07a1-1ba4-407a-8151-cc890fe6603a", "metadata": {}, "source": [ "## Data for 'timeline_local'" ] },
{ "cell_type": "code", "execution_count": null, "id": "3131ef1f-6f8d-4dcb-a801-553019a79e90", "metadata": { "tags": [] }, "outputs": [], "source": [
"# Loading data: local timeline data is loaded into the variable 'df_local'.\n",
"# ATTENTION: In this environment relative paths don't work with glob.\n",
"path_local = '/var/jupyter-data/jupyter-mastodata/data/mastodon.social/timeline_local/*'\n",
"\n",
"data_local = []\n",
"for fi in glob(path_local):\n",
"    if os.path.isfile(fi) and fi.endswith('.json'):\n",
"        with open(fi, 'r') as infile:\n",
"            data_local += json.load(infile)\n",
"\n",
"df_local = pd.json_normalize(data_local)"
] },
{ "cell_type": "markdown", "id": "10a6cafb-861e-4c8b-bc1f-e0fd5f67e7ce", "metadata": {}, "source": [ "## Stats" ] },
{ "cell_type": "code", "execution_count": null, "id": "c5133131-1a74-4c61-8f3c-4044612a0394", "metadata": { "tags": [] }, "outputs": [], "source": [
"# TODO: add average posts per MAU (a sketch follows in the hashtag section below)\n",
"print('Posts in total:', len(df_local))\n",
"print('Number of individual users:', len(pd.unique(df_local['account.username'])))\n",
"print('Average posts per user:', len(df_local)/len(pd.unique(df_local['account.username'])))"
] },
{ "cell_type": "code", "execution_count": null, "id": "c3c20029-0d0a-48e8-9632-5bab4b21d99b", "metadata": { "tags": [] }, "outputs": [], "source": [
"# Convert 'created_at' to datetime\n",
"df_local_dates = pd.to_datetime(df_local['created_at'])\n",
"# Show the number of toots per calendar week\n",
"df_local_dates.groupby(df_local_dates.dt.isocalendar().week).count().plot(kind='bar')"
] },
{ "cell_type": "code", "execution_count": null, "id": "dfbd773c-7cbc-4fe8-8343-934fe747faa2", "metadata": { "tags": [] }, "outputs": [], "source": [
"# Postings distributed over weekdays (ISO: 1 = Monday, ..., 7 = Sunday)\n",
"df_local_dates.groupby(df_local_dates.dt.isocalendar().day).count().plot(kind='bar')"
] },
{ "cell_type": "code", "execution_count": null, "id": "5b99b490", "metadata": {}, "outputs": [], "source": [
"# Show the top 25 users by number of posts\n",
"\n",
"top_25_users = (df_local['account.username'].value_counts()).iloc[:25]\n",
"top_25_users.plot(kind='bar')"
] },
{ "cell_type": "markdown", "id": "f24fccb8-0cb6-4241-89a8-36cd714e235e", "metadata": {}, "source": [ "## Hashtag analysis" ] },
{ "cell_type": "code", "execution_count": null, "id": "3f0c3c70-aeb5-4359-9319-f3467a8b7162", "metadata": { "tags": [] }, "outputs": [], "source": [
"# To analyse the tags it makes sense to create a separate table that contains all tags and the\n",
"# information attached to them. Here is an example of how to build such a table: besides the tag,\n",
"# each row carries the toot ID, the toot's created_at and the account ID. This can be extended\n",
"# depending on your analytical interest. For data minimisation when saving, the toot ID alone\n",
"# would suffice; the rest can be joined back in from the toot table with\n",
"# pd.merge(df_local, tags_local, left_on='id', right_on='toot_id') -- see the sketch below.\n",
"\n",
"tags_local = []\n",
"for _, row in df_local.iterrows():\n",
"    for tag in row['tags']:\n",
"        tag['toot_id'] = row['id']\n",
"        tag['toot_created_at'] = row['created_at']\n",
"        tag['toot_account.id'] = row['account.id']\n",
"        tags_local.append(tag)\n",
"\n",
"tags_local = pd.json_normalize(tags_local)\n",
"tags_local.head(5)"
] },
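{ "cell_type": "markdown", "id": "merge-sketch-note", "metadata": {}, "source": [ "The comment above mentions that the tag table can be joined back to the toot table with `pd.merge`. The cell below is a minimal sketch of that join for the local timeline; the chosen suffixes and the displayed columns are illustrative, and the same call works for `df_public` / `tags_public`." ] },
{ "cell_type": "code", "execution_count": null, "id": "merge-sketch-code", "metadata": { "tags": [] }, "outputs": [], "source": [
"# Minimal sketch of the join described in the comment above: attach the full toot\n",
"# record to every tag row. Columns present in both tables (e.g. 'url') get suffixes.\n",
"tags_local_merged = pd.merge(df_local, tags_local,\n",
"                             left_on='id', right_on='toot_id',\n",
"                             suffixes=('_toot', '_tag'))\n",
"\n",
"# For a first look: the hashtag name next to the toot's content\n",
"tags_local_merged[['name', 'content']].head()"
] },
{ "cell_type": "markdown", "id": "cohashtag-sketch-note", "metadata": {}, "source": [ "Co-hashtags: the following cell is a minimal sketch of how co-occurring hashtags could be counted from the tag table built above. Tags are grouped per toot and every unordered pair within a toot is counted once; the use of `itertools.combinations` and `collections.Counter` is an illustrative choice. Replace `tags_local` with `tags_public` to run the same analysis on the public timeline." ] },
{ "cell_type": "code", "execution_count": null, "id": "cohashtag-sketch-code", "metadata": { "tags": [] }, "outputs": [], "source": [
"# Minimal co-hashtag sketch: count how often two hashtags appear together in the same toot\n",
"from itertools import combinations\n",
"from collections import Counter\n",
"\n",
"co_tags = Counter()\n",
"for _, toot_tags in tags_local.groupby('toot_id')['name']:\n",
"    # lower-case, deduplicate and sort so every unordered pair is counted once per toot\n",
"    for pair in combinations(sorted(set(toot_tags.str.lower())), 2):\n",
"        co_tags[pair] += 1\n",
"\n",
"# The 20 most frequent hashtag pairs\n",
"pd.Series(co_tags).sort_values(ascending=False).head(20)"
] },
{ "cell_type": "markdown", "id": "mau-sketch-note", "metadata": {}, "source": [ "Posts per MAU: the stats cells above leave a TODO for the average number of posts per monthly active user (MAU). The cell below is a minimal sketch that approximates MAU from the collected toots themselves, counting an account as active in every calendar month in which it posted at least once; this is an assumption, not the MAU reported by the instance. Replace `df_local` with `df_public` for the public timeline." ] },
{ "cell_type": "code", "execution_count": null, "id": "mau-sketch-code", "metadata": { "tags": [] }, "outputs": [], "source": [
"# Minimal sketch: approximate monthly active users (MAU) from the collected toots and\n",
"# relate the monthly post count to them. Assumption: 'active' = posted at least once that month.\n",
"local_month = pd.to_datetime(df_local['created_at']).dt.strftime('%Y-%m')\n",
"\n",
"posts_per_month = df_local.groupby(local_month).size()\n",
"mau_per_month = df_local.groupby(local_month)['account.id'].nunique()\n",
"\n",
"posts_per_mau = posts_per_month / mau_per_month\n",
"print(posts_per_mau)\n",
"posts_per_mau.plot(kind='bar')"
] },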
{ "cell_type": "code", "execution_count": null, "id": "561054f2-d8db-47d9-aa15-c0ea5f06388d", "metadata": { "tags": [] }, "outputs": [], "source": [
"# Hashtags sorted by frequency\n",
"tags_local.groupby('name').size().sort_values(ascending=False)"
] },
{ "cell_type": "code", "execution_count": null, "id": "69bba3a4-03b0-4cd1-8172-3418ad06b446", "metadata": { "tags": [] }, "outputs": [], "source": [
"# Show the top 10 hashtags as a bar plot\n",
"# Find the values with the top 10 occurrences in the 'name' column\n",
"top_10_tags = (tags_local['name'].value_counts()).iloc[:10]\n",
"top_10_tags.plot(kind='bar')"
] },
{ "cell_type": "code", "execution_count": null, "id": "90785572", "metadata": {}, "outputs": [], "source": [] },
{ "cell_type": "code", "execution_count": null, "id": "54ed0e60-9349-4cf1-b987-04aa3aa07037", "metadata": {}, "outputs": [], "source": [
"# Extract toots containing certain hashtags\n",
"# Insert the hashtag names to extract into a separate data set.\n",
"# Hashtag names must be given as quoted strings (without the leading '#').\n",
"\n",
"my_tags_local = []\n",
"\n",
"df_my_tags_local = tags_local[tags_local['name'].isin(my_tags_local)]\n",
"\n",
"df_my_tags_local.head()\n",
"\n",
"# Uncomment the following command to create a CSV file for further examination\n",
"# (the 'exports' directory must exist)\n",
"\n",
"#df_my_tags_local.to_csv('exports/my_tags_local.csv')"
] }
], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" } }, "nbformat": 4, "nbformat_minor": 5 }