arXiv Category Pulse

Executed Notebook

This notebook counts real arXiv submissions by category and decomposes the resulting monthly count series.

Use it for articles such as:

  • Which AI fields are actually accelerating?
  • Is cs.CL still rising after the LLM boom?
  • Are biology and finance categories showing durable AI-adjacent growth?

Data source: arXiv API.

In [1]
from pathlib import Path
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from examples.hot_trends.data import (
    HotTrendDataError,
    append_real_snapshot,
    build_arxiv_monthly_counts,
    fetch_coingecko_market_chart,
    fetch_defillama_stablecoin_chains,
    fetch_github_repo_metadata,
    fetch_github_stargazers,
    fetch_huggingface_models,
    fetch_wikipedia_pageviews,
    source_audit_table,
)
from examples.hot_trends.decomposition import (
    component_summary,
    decompose_table,
    editorial_priority,
    residual_event_table,
)
from examples.hot_trends.scoring import article_publication_phrasing

# Show up to 80 columns and 80 rows when a DataFrame is displayed.
for _option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_option, 80)
# Turn the axes grid on for every figure in this notebook.
plt.rcParams["axes.grid"] = True

# Working directories: one for cached API results, one for exported CSV tables.
CACHE_DIR = Path("examples") / "hot_trends" / "cache"
OUTPUT_DIR = Path("examples") / "hot_trends" / "outputs"
for _directory in (CACHE_DIR, OUTPUT_DIR):
    # Create missing parents too; an already-existing directory is not an error.
    _directory.mkdir(parents=True, exist_ok=True)

def save_table(df, name):
    """Write ``df`` to ``OUTPUT_DIR/<name>.csv`` without its index and print the saved path.

    Args:
        df: Table to export; only its ``to_csv`` method is used.
        name: File stem; ``.csv`` is appended.
    """
    destination = OUTPUT_DIR.joinpath("{}.csv".format(name))
    df.to_csv(destination, index=False)
    # Report the path with forward slashes regardless of platform.
    print("saved: {}".format(destination.as_posix()))

1. Define the category watchlist

In [2]
# arXiv categories to watch, in display order.
_CATEGORIES = (
    "cs.AI",
    "cs.LG",
    "cs.CL",
    "cs.CV",
    "stat.ML",
    "q-bio.QM",
    "q-fin.ST",
    "econ.EM",
)
# Map each series label to its search string, which is the label prefixed with "cat:".
queries = {category: f"cat:{category}" for category in _CATEGORIES}
# One row per watched series, with its label and its query string side by side.
watchlist = pd.DataFrame(
    {
        "series": list(queries.keys()),
        "query": list(queries.values()),
    }
)
watchlist

2. Fetch monthly counts

The default window is intentionally modest so the notebook can run without hammering the API. Increase the window for a full article.

In [3]
START_MONTH = "2025-01-01"
END_MONTH = "2026-05-01"
SLEEP_SECONDS = 3.0  # forwarded to the fetcher as sleep_seconds

cache_path = CACHE_DIR / f"arxiv_category_counts_{START_MONTH}_{END_MONTH}.csv"

# The cache file name only encodes the date window, not the watchlist. Without
# the checks below, editing `queries` would silently reuse counts fetched for
# the previous set of categories.
counts = None
if cache_path.exists():
    cached = pd.read_csv(cache_path, parse_dates=["month"])
    cached_series = set(cached["series"]) if "series" in cached.columns else set()
    missing = sorted(set(queries) - cached_series)
    if missing:
        # NOTE(review): assumes the fetcher emits rows for every requested
        # series; if it can omit one, this refetches on every run - confirm.
        print(f"cache {cache_path.as_posix()} lacks series {missing}; refetching")
    else:
        # Drop cached series that are no longer on the watchlist.
        counts = cached.loc[cached["series"].isin(list(queries))].reset_index(drop=True)

if counts is None:
    counts = build_arxiv_monthly_counts(
        queries,
        start_month=START_MONTH,
        end_month=END_MONTH,
        sleep_seconds=SLEEP_SECONDS,
    )
    counts.to_csv(cache_path, index=False)
counts.head(16)

3. Audit the source table

In [4]
# Build the source audit table from the raw counts: "series" identifies the
# entity, "month" is the time column and "count" holds the value.
audit = source_audit_table(
    counts,
    entity_col="series",
    time_col="month",
    value_col="count",
)
audit

4. Decompose counts with DeTime

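For monthly counts, a period of 12 (an annual cycle) is the first cycle to test. The method defaults to MA_BASELINE because it is built into DeTime and keeps the tutorial dependency-light. Note that the default window above (2025-01 through 2026-05) holds at most 17 monthly points, which is fewer than two full annual cycles, so treat the seasonal component as provisional until the window is widened.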

In [5]
# Column roles and decomposition settings, collected in one place so they are
# easy to tweak before re-running the cell.
_decompose_settings = {
    "entity_col": "series",
    "time_col": "month",
    "value_col": "count",
    "method": "MA_BASELINE",
    "period": 12,
    "trend_window": 5,
    "transform": "log1p",
}
components = decompose_table(counts, **_decompose_settings)
components.head(16)

5. Rank fields by trend and residual shock

In [6]
# First summarize the components per series, then rank the series from that summary.
summary = component_summary(
    components,
    time_col="month",
    entity_col="series",
)
priority = editorial_priority(
    summary,
    entity_col="series",
)
priority

Visualization: arXiv category trend shock scatter

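The scatter plots each series from the editorial priority table: trend slope per step on the x-axis, maximum absolute residual z-score (shock magnitude) on the y-axis, marker size scaled by the cycle strength proxy, and color set by the editorial priority score.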

In [7]
# Columns plotted on the two axes; rows missing either value are dropped.
_x_col = "trend_slope_per_step"
_y_col = "max_abs_residual_z"
scatter_frame = priority.dropna(subset=[_x_col, _y_col]).copy()

# Marker area: base of 80 plus 260 per unit of cycle strength, where missing
# values count as 0 and negative values are clipped to 0.
_cycle_strength = scatter_frame["cycle_strength_proxy"].fillna(0).clip(lower=0)
sizes = 80 + _cycle_strength * 260

fig, ax = plt.subplots(figsize=(7.5, 4.8))
sc = ax.scatter(
    scatter_frame[_x_col],
    scatter_frame[_y_col],
    s=sizes,
    c=scatter_frame["editorial_priority_score"],
    cmap="viridis",
)

# Label only the first eight rows of the frame to keep the plot readable.
_labelled = scatter_frame.head(8)
for _name, _x, _y in zip(_labelled["series"], _labelled[_x_col], _labelled[_y_col]):
    ax.annotate(
        str(_name),
        (_x, _y),
        fontsize=8,
        xytext=(4, 4),
        textcoords="offset points",
    )

# Vertical reference line at zero slope.
ax.axvline(0, color="0.45", linewidth=0.8)
ax.set_title("Trend slope versus residual shock")
ax.set_xlabel("trend slope per step")
ax.set_ylabel("max absolute residual z")
fig.colorbar(sc, ax=ax, label="editorial priority score")
plt.tight_layout()
plt.show()

Visualization: arXiv category component panels

Observed and trend lines plus residual bars turn the component table into an inspectable trend narrative.

In [8]
# Plot the first four series of the priority table, one figure row each.
top_entities = priority["series"].head(4).tolist()
_n_panels = len(top_entities)
fig, axes = plt.subplots(
    _n_panels,
    2,
    figsize=(11, max(3.0, 2.4 * _n_panels)),
    squeeze=False,  # keep a 2-D axes array even for a single row
)
for (_line_ax, _bar_ax), entity in zip(axes, top_entities):
    _is_entity = components["series"] == entity
    panel = components.loc[_is_entity].sort_values("month").copy()
    panel["month"] = pd.to_datetime(panel["month"])

    # Left column: observed values against the fitted trend.
    _line_ax.plot(panel["month"], panel["observed"], label="observed", linewidth=1.6)
    _line_ax.plot(panel["month"], panel["trend"], label="trend", linewidth=1.8)
    _line_ax.set_title(str(entity))
    _line_ax.set_ylabel("transformed count")

    # Right column: residual bars, red when non-negative and blue otherwise.
    _bar_colors = np.where(panel["residual"] >= 0, "tab:red", "tab:blue")
    _bar_ax.bar(panel["month"], panel["residual"], color=_bar_colors, width=20)
    _bar_ax.set_title("residual")
    _bar_ax.set_ylabel("residual")

# A single legend on the first line panel is enough; the styling repeats per row.
axes[0, 0].legend(loc="best")
plt.suptitle("arXiv category observed/trend/residual panels", y=1.01)
plt.tight_layout()
plt.show()

6. Top residual events

These rows are article hooks. Causal interpretation requires additional evidence; the rows mark months that do not fit the smooth trend/cycle baseline.

In [9]
# Build the residual event table from the components, requesting the top 20 rows.
events = residual_event_table(
    components,
    top_n=20,
    entity_col="series",
    time_col="month",
)
events

Visualization: arXiv category residual heatmap

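The heatmap shows where residual shocks cluster across entities and time. Each cell is a robust z-score: the residual centered on its series median and divided by 1.4826 × the series' median absolute deviation.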

In [10]
def _robust_z(values):
    """Center on the median and scale by 1.4826 * MAD; 1e-12 keeps the divisor non-zero."""
    centered = values - values.median()
    return centered / (1.4826 * centered.abs().median() + 1e-12)


# Robust z-score of the residual, computed separately within each series.
residual_grid = components.copy()
residual_grid["residual_z"] = residual_grid.groupby("series")["residual"].transform(_robust_z)

# Rows are series (in priority order), columns are months; all-NaN rows are dropped.
heat = residual_grid.pivot_table(
    index="series",
    columns="month",
    values="residual_z",
    aggfunc="mean",
)
heat = heat.reindex(priority["series"].tolist()).dropna(how="all")
values = heat.to_numpy(dtype=float)

# Symmetric color limits around zero; fall back to 1.0 when nothing is finite.
if np.isfinite(values).any():
    absmax = float(np.nanmax(np.abs(values)))
else:
    absmax = 1.0

fig, ax = plt.subplots(figsize=(11, 4.5))
im = ax.imshow(values, aspect="auto", cmap="RdBu_r", vmin=-absmax, vmax=absmax)

_n_rows = len(heat.index)
_n_cols = len(heat.columns)
ax.set_yticks(range(_n_rows))
ax.set_yticklabels(heat.index)

# Label roughly eight evenly spaced months along the x-axis.
tick_step = max(1, _n_cols // 8)
xticks = list(range(0, _n_cols, tick_step))
_month_labels = [pd.to_datetime(heat.columns[pos]).strftime("%Y-%m") for pos in xticks]
ax.set_xticks(xticks)
ax.set_xticklabels(_month_labels, rotation=45, ha="right")

ax.set_title("arXiv category residual z-score heatmap")
fig.colorbar(im, ax=ax, label="robust residual z")
plt.tight_layout()
plt.show()

7. Article-ready language

In [11]
# Load the publication phrasing table from the scoring module; the final cell
# writes it to CSV with save_table.
# NOTE(review): presumably a table of suggested article wording - confirm
# against examples.hot_trends.scoring.
phrasing = article_publication_phrasing()
phrasing
In [12]
# Export every table produced above; all files share the "01_arxiv_category_" prefix.
_exports = (
    ("watchlist", watchlist),
    ("audit", audit),
    ("priority", priority),
    ("residual_events", events),
    ("publication_phrasing", phrasing),
)
for _suffix, _table in _exports:
    save_table(_table, f"01_arxiv_category_{_suffix}")