Wikimedia Attention and Hype Decay

Executed Notebook

This notebook uses Wikimedia Analytics API pageviews to measure public attention cycles. Pageviews are interpreted as public-attention signals.

Data sources are recorded in the source audit table.

In [1]
from pathlib import Path
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from examples.hot_trends.data import (
    HotTrendDataError,
    append_real_snapshot,
    build_arxiv_monthly_counts,
    fetch_coingecko_market_chart,
    fetch_defillama_stablecoin_chains,
    fetch_github_repo_metadata,
    fetch_github_stargazers,
    fetch_huggingface_models,
    fetch_wikipedia_pageviews,
    source_audit_table,
)
from examples.hot_trends.decomposition import (
    component_summary,
    decompose_table,
    editorial_priority,
    residual_event_table,
)
from examples.hot_trends.scoring import article_publication_phrasing

# Notebook-wide display defaults: show up to 80 columns and 80 rows of any frame,
# and draw a grid on every matplotlib axes.
for option_name in ("display.max_columns", "display.max_rows"):
    pd.set_option(option_name, 80)
plt.rcParams["axes.grid"] = True

# Working directories, created up front (including parents) so later cells can
# write into them without checking for existence.
CACHE_DIR = Path("examples/hot_trends/cache")
OUTPUT_DIR = Path("examples/hot_trends/outputs")
for directory in (CACHE_DIR, OUTPUT_DIR):
    directory.mkdir(parents=True, exist_ok=True)

def save_table(df: pd.DataFrame, name: str) -> None:
    """Write ``df`` to ``OUTPUT_DIR/<name>.csv`` and report where it went.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to export; the index is not written.
    name : str
        File stem; ``.csv`` is appended.
    """
    destination = OUTPUT_DIR / f"{name}.csv"
    df.to_csv(destination, index=False)
    # Print a POSIX-style path so the message looks the same on every platform.
    print(f"saved: {destination.as_posix()}")

1. Select pages and date window

In [2]
# Wikipedia article titles whose pageviews are fetched below.
articles = [
    "Artificial intelligence", "Large language model", "ChatGPT",
    "Bitcoin", "Nvidia", "OpenAI",
]
# Bounds of the date window passed to the pageview fetch.
START, END = "2025-01-01", "2026-05-20"
# Display the selection as a one-column table.
pd.Series(articles, name="article").to_frame()

2. Fetch pageviews

In [3]
# One fetch per selected article over the START..END window; the per-article
# frames are then stacked into a single table with a fresh 0..n-1 index.
frames = [
    fetch_wikipedia_pageviews(title, start=START, end=END)
    for title in articles
]
views = pd.concat(frames, ignore_index=True)
views.head(20)

3. Audit the pageview table

In [4]
# Build the source audit table for the fetched pageviews, keyed by article and date.
audit = source_audit_table(
    views,
    value_col="views",
    entity_col="article",
    time_col="date",
)
audit

4. Decompose pageview attention

In [5]
# Decompose each article's pageview series (log1p-transformed) with the
# moving-average baseline method: period of 7 and a trend window of 21.
components = decompose_table(
    views,
    entity_col="article",
    time_col="date",
    value_col="views",
    method="MA_BASELINE",
    period=7,
    trend_window=21,
    transform="log1p",
)

# Summarise the components per article, then pass the summary through
# editorial_priority to produce the table displayed below.
component_stats = component_summary(components, entity_col="article", time_col="date")
summary = editorial_priority(component_stats, entity_col="article")
summary

Visualization: Wikimedia attention components

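Each row shows one of the first three articles in the summary table: the left panel plots the observed values against the trend, and the right panel plots the residuals (red for non-negative, blue for negative).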

In [6]
# One figure row per article taken from the first three rows of the summary:
# observed vs. trend on the left, signed residual bars on the right.
top_articles = summary["article"].head(3).tolist()
n_panels = len(top_articles)
fig_height = max(3.0, 2.6 * n_panels)
fig, axes = plt.subplots(n_panels, 2, figsize=(11, fig_height), squeeze=False)

for (series_ax, residual_ax), article in zip(axes, top_articles):
    is_article = components["article"].eq(article)
    panel = components.loc[is_article].sort_values("date").copy()
    panel["date"] = pd.to_datetime(panel["date"])

    # Left: observed series with the trend drawn on top.
    series_ax.plot(panel["date"], panel["observed"], label="observed", linewidth=1.6)
    series_ax.plot(panel["date"], panel["trend"], label="trend", linewidth=1.8)
    series_ax.set_title(article)

    # Right: residuals, red when non-negative and blue when negative.
    bar_colors = np.where(panel["residual"] >= 0, "tab:red", "tab:blue")
    residual_ax.bar(panel["date"], panel["residual"], color=bar_colors, width=1.0)
    residual_ax.set_title("residual")

# A single legend on the first panel is enough; every row uses the same two lines.
axes[0, 0].legend(loc="best")
plt.tight_layout()
plt.show()

5. Residual shock events

In [7]
# Build the residual event table from the decomposed components (top_n=25).
events = residual_event_table(
    components,
    entity_col="article",
    time_col="date",
    top_n=25,
)
events

6. Hype-decay table

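A simple decay proxy: for each article, take the largest residual event, measured as the largest absolute deviation from that article's median residual. Then count the days from that peak until the deviation first falls to half of the peak or less. If it never does within the window, the half-life is left empty.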

In [8]
DECAY_COLUMNS = ["article", "peak_date", "peak_residual_abs", "attention_half_life_days"]


def _hype_decay_row(article, panel):
    """Summarise how quickly one article's largest residual shock faded.

    The shock size is the absolute deviation of ``residual`` from the
    article's median residual. The half-life is the number of days from the
    peak to the first row at or after it whose deviation is at most half of
    the peak.

    Parameters
    ----------
    article : hashable
        Group key, copied into the output row.
    panel : pandas.DataFrame
        Rows for one article; must contain ``date`` and ``residual`` columns.

    Returns
    -------
    dict
        One row with the keys listed in ``DECAY_COLUMNS``.
        ``attention_half_life_days`` is ``None`` when the deviation never
        falls to half of the peak, or when the article has no usable residual.
    """
    # Work on a positional 0..n-1 index after sorting by date, so the peak
    # lookup and the "from the peak onwards" slice do not depend on the
    # incoming index labels being unique.
    ordered = panel.sort_values("date").reset_index(drop=True)
    dates = pd.to_datetime(ordered["date"])
    deviation = (ordered["residual"] - ordered["residual"].median()).abs()

    # No usable residual at all: report the article without a peak instead of
    # failing on the peak lookup.
    if not deviation.notna().any():
        return {
            "article": article,
            "peak_date": None,
            "peak_residual_abs": float("nan"),
            "attention_half_life_days": None,
        }

    # idxmax skips NaN and returns the first occurrence on ties; with the reset
    # index the returned label is also the row position.
    peak_pos = int(deviation.idxmax())
    peak = float(deviation.iloc[peak_pos])
    peak_date = dates.iloc[peak_pos]

    # First row at or after the peak whose deviation is at most half the peak.
    tail = deviation.iloc[peak_pos:]
    recovered = tail.index[tail.le(0.5 * peak).to_numpy()]
    if len(recovered) == 0:
        half_life_days = None
    else:
        half_life_days = int((dates.iloc[recovered[0]] - peak_date).days)

    return {
        "article": article,
        "peak_date": str(peak_date.date()),
        "peak_residual_abs": peak,
        "attention_half_life_days": half_life_days,
    }


decay_rows = [
    _hype_decay_row(article, panel)
    for article, panel in components.groupby("article")
]
# Fixed columns keep the sort (and downstream cells) working even with no rows.
decay = pd.DataFrame(decay_rows, columns=DECAY_COLUMNS).sort_values("peak_residual_abs", ascending=False)
decay

Visualization: hype decay half-life

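The left panel shows each article's largest residual spike; the right panel shows how many days that spike took to fall to half of its peak. Spikes that never reached half of their peak within the window have no half-life and are drawn as zero-length bars.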

In [9]
# Coerce the half-life to numeric (missing values become NaN) and order the
# bars by spike size.
decay_plot = decay.assign(
    attention_half_life_days=pd.to_numeric(decay["attention_half_life_days"], errors="coerce")
).sort_values("peak_residual_abs")

# Remember which articles have no half-life before the NaNs are filled for
# plotting; otherwise they are indistinguishable from a genuine 0-day half-life.
unresolved = decay_plot["attention_half_life_days"].isna().to_numpy()

fig, axes = plt.subplots(1, 2, figsize=(12, 4.2))
decay_plot.plot(
    kind="barh",
    x="article",
    y="peak_residual_abs",
    ax=axes[0],
    color="tab:red",
    legend=False,
    title="Largest residual spike",
)
decay_plot.assign(
    attention_half_life_days=decay_plot["attention_half_life_days"].fillna(0)
).plot(
    kind="barh",
    x="article",
    y="attention_half_life_days",
    ax=axes[1],
    color="tab:blue",
    legend=False,
    title="Days to half residual",
)

# pandas places the horizontal bars at y = 0..n-1 in row order, so the row
# position doubles as the y coordinate for the label.
for y_pos in np.flatnonzero(unresolved):
    axes[1].annotate(
        "unresolved",
        xy=(0, y_pos),
        xytext=(4, 0),
        textcoords="offset points",
        va="center",
        fontsize=9,
        color="dimgray",
    )

axes[0].set_xlabel("absolute residual at peak")
axes[1].set_xlabel("days")
axes[0].set_ylabel("")
axes[1].set_ylabel("")
plt.tight_layout()
plt.show()
In [10]
# Export every table built in this notebook as CSV.
tables_to_save = (
    (audit, "05_wikipedia_attention_audit"),
    (summary, "05_wikipedia_attention_summary"),
    (events, "05_wikipedia_attention_events"),
    (decay, "05_wikipedia_hype_decay"),
)
for table, stem in tables_to_save:
    save_table(table, stem)
# The phrasing table is computed here, after the tables above have been written.
save_table(article_publication_phrasing(), "05_wikipedia_publication_phrasing")