News Sentiment Index

This notebook builds a daily News Sentiment Index (NSI) from GDELT GKG v2 news coverage for a small set of themes: oil, crypto, cybersecurity, and semiconductors.

The pipeline downloads 15-minute GDELT tiles, filters articles by topic relevance, extracts article-level tone, and aggregates the result into a daily topic-level sentiment series. Each topic is saved as a local CSV file, and the notebook also produces a combined summary table and topic charts.

Topics covered

oil
crypto
cybersecurity
semiconductors

Outputs

per-topic CSV files saved locally
one combined CSV file across all topics
daily NSI and 7-day moving average plots

Configuration

The pipeline processes all 15-minute tiles between the start and end dates, scores article relevance by topic, and keeps a limited number of top articles per topic per day before computing the daily NSI.

In [1]:

# --- Date window (UTC calendar days, both ends included) ---
START_DAY_UTC = "2026-03-15"
END_DAY_UTC = "2026-04-01"  # inclusive

# --- What to collect ---
TOPICS = [
    "oil",
    "crypto",
    "cybersecurity",
    "semiconductors",
]
DAILY_LIMIT = 100       # max articles per topic per day
MIN_TITLE_WORDS = 4     # titles with fewer words are discarded

# --- Runtime behaviour ---
MAX_WORKERS = 48        # size of the download thread pool
PROGRESS_EVERY = 96     # print progress every N tiles (~1 day)
OUTPUT_DIR = "nsi_output"  # local directory for CSV files

Imports

In [2]:

import io
import re
import html
import time
import zipfile
import threading
from pathlib import Path
from datetime import datetime, timedelta, timezone
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from urllib3.util import Retry
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker

# Create the output directory up front so the CSV writes later in the notebook
# cannot fail on a missing path. parents=True lets OUTPUT_DIR be a nested path
# (e.g. "results/nsi"); exist_ok=True keeps re-runs of this cell harmless.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

Topic configurations

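Each topic is defined with four layers of matching logic plus a minimum score:

anchor terms that identify whether an article is relevant at all
positive terms that contribute to a topic relevance score
positive phrases that receive additional weight when they appear exactly
negative terms that subtract from the score to filter out look-alike subjects (for example, "cooking oil" or "potato chips")
a minimum score that an article must reach to be kept for the topic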

These configurations are used to decide whether an article belongs to a topic and how strongly it should be ranked relative to other candidate articles from the same day.

In [3]:

# Per-topic relevance configuration, keyed by topic id.
#
# Each topic entry has the same five keys:
#   anchors          - list of terms; an article is only considered for the
#                      topic if at least one anchor is found in its text.
#   positive_terms   - {term: weight}; every term found in the text adds its
#                      weight to the relevance score.
#   positive_phrases - {phrase: weight}; every phrase found in the text adds
#                      its weight multiplied by 1.2 (see _score).
#   negative_terms   - {term: weight} with negative weights; every term found
#                      lowers the score, to push out look-alike subjects.
#   min_score        - articles whose total score is below this are dropped.
#
# NOTE(review): _score tests terms with a plain substring check, so overlapping
# entries stack (e.g. text containing "opec+" collects "opec", "opec+" and the
# "opec+" phrase), and short terms such as "eth", "sec" or "spr" can also be
# found inside longer unrelated words. Keep that in mind when tuning weights.
TOPIC_CONFIGS = {
    # Crude oil and refined products markets.
    "oil": {
        "anchors": [
            "oil", "crude", "brent", "wti", "opec", "opec+", "eia", "iea",
            "spr", "strategic petroleum reserve", "refinery", "refineries",
            "diesel", "gasoline", "distillate", "inventory", "inventories"
        ],
        "positive_terms": {
            "oil": 2.0, "crude": 2.2, "brent": 2.0, "wti": 2.0,
            "opec": 2.4, "opec+": 2.6, "eia": 2.4, "iea": 2.3,
            "inventory": 1.8, "inventories": 1.8, "refinery": 1.5,
            "refineries": 1.5, "spr": 2.0, "sanctions": 1.5,
            "exports": 1.2, "production": 1.6, "output": 1.4,
            "demand": 1.2, "supply": 1.2, "outage": 1.8,
            "shutdown": 1.7, "attack": 1.5, "pipeline": 1.4, "terminal": 1.3
        },
        "positive_phrases": {
            "oil prices": 1.5, "crude oil": 2.0, "eia inventories": 3.5,
            "opec+": 3.0, "strategic petroleum reserve": 3.0,
            "oil demand": 2.2, "oil supply": 2.2
        },
        # Non-energy uses of the word "oil".
        "negative_terms": {
            "olive": -5.0, "essential oil": -6.0, "fish oil": -6.0,
            "cooking oil": -6.0, "beauty": -3.0, "cosmetic": -3.0
        },
        "min_score": 3.0
    },
    # Cryptocurrencies, exchanges and related regulation.
    "crypto": {
        "anchors": [
            "crypto", "cryptocurrency", "cryptocurrencies", "bitcoin", "btc",
            "ethereum", "ether", "eth", "stablecoin", "stablecoins", "token",
            "tokens", "blockchain", "binance", "coinbase", "sec crypto",
            "bitcoin etf", "crypto etf"
        ],
        "positive_terms": {
            "crypto": 2.0, "cryptocurrency": 2.2, "cryptocurrencies": 2.2,
            "bitcoin": 2.8, "btc": 2.4, "ethereum": 2.6, "ether": 2.2,
            "eth": 2.0, "stablecoin": 2.4, "stablecoins": 2.4,
            "token": 1.4, "tokens": 1.4, "blockchain": 1.6,
            "binance": 1.8, "coinbase": 1.8, "etf": 1.2,
            "exchange": 1.1, "sec": 1.0, "regulation": 1.2,
            "hack": 1.2, "custody": 1.1
        },
        "positive_phrases": {
            "bitcoin etf": 3.2, "crypto etf": 2.8, "crypto exchange": 2.0,
            "digital asset": 2.0, "digital assets": 2.0,
            "crypto regulation": 2.6, "bitcoin treasury": 2.0
        },
        # Cryptography vocabulary that shares the "crypto" stem.
        "negative_terms": {
            "cryptography": -5.0, "encryption": -2.5,
            "decrypt": -3.0, "cipher": -3.0
        },
        "min_score": 3.0
    },
    # Cyber attacks, breaches and vulnerabilities.
    "cybersecurity": {
        "anchors": [
            "cyber", "cybersecurity", "cyberattack", "cyberattacks",
            "ransomware", "malware", "data breach", "breach", "breaches",
            "hack", "hacked", "hacking", "ddos", "exploit", "vulnerability",
            "zero-day", "zero day", "phishing", "infosec"
        ],
        "positive_terms": {
            "cyber": 2.0, "cybersecurity": 2.4, "cyberattack": 2.8,
            "cyberattacks": 2.8, "ransomware": 3.0, "malware": 2.5,
            "breach": 2.4, "breaches": 2.4, "hack": 2.0,
            "hacked": 2.0, "hacking": 2.0, "ddos": 2.2,
            "exploit": 1.8, "vulnerability": 1.8, "zero-day": 2.2,
            "phishing": 1.8, "cisa": 1.4, "nsa": 0.8,
            "microsoft": 0.6, "firewall": 1.0
        },
        "positive_phrases": {
            "data breach": 3.2, "critical vulnerability": 2.8,
            "zero day": 3.0, "cyber attack": 3.0, "cyber attacks": 3.0,
            "ransomware attack": 3.2, "security breach": 2.8
        },
        # Consumer products and entertainment that share the "cyber" stem.
        "negative_terms": {
            "cybertruck": -6.0, "video game": -2.5, "gaming": -2.0
        },
        "min_score": 3.0
    },
    # Chip makers, fabrication and memory markets.
    "semiconductors": {
        "anchors": [
            "semiconductor", "semiconductors", "chip", "chips", "chipmaker",
            "chipmakers", "foundry", "foundries", "fab", "fabs", "wafer",
            "wafers", "tsmc", "intel", "nvidia", "amd", "asml", "hbm", "dram",
            "memory chip", "memory chips", "lithography"
        ],
        # NOTE(review): "export controls" is a two-word entry stored under
        # positive_terms, so it does not receive the 1.2 phrase multiplier.
        "positive_terms": {
            "semiconductor": 2.8, "semiconductors": 2.8, "chip": 1.8,
            "chips": 1.8, "chipmaker": 2.0, "chipmakers": 2.0,
            "foundry": 2.2, "foundries": 2.2, "fab": 1.8, "fabs": 1.8,
            "wafer": 1.5, "wafers": 1.5, "tsmc": 2.2, "intel": 1.5,
            "nvidia": 1.6, "amd": 1.4, "asml": 1.8, "hbm": 1.8,
            "dram": 1.8, "memory": 1.0, "lithography": 2.0,
            "export controls": 1.8, "capacity": 1.2
        },
        "positive_phrases": {
            "semiconductor industry": 3.0, "memory chip": 2.6,
            "memory chips": 2.6, "chip export": 2.2, "chip exports": 2.2,
            "chip demand": 2.2, "chip supply": 2.2
        },
        # Food and gambling senses of "chips".
        "negative_terms": {
            "potato chips": -7.0, "casino chips": -7.0,
            "tortilla chips": -7.0, "fish and chips": -7.0
        },
        "min_score": 3.0
    }
}

In [4]:

# Base URL under which the 15-minute GKG tile archives are requested.
GDELT_V2_BASE_URL = "http://data.gdeltproject.org/gdeltv2"

# (column position in the tab-separated GKG file, name used in this notebook).
# Keeping position and name side by side makes it harder for the two lists
# passed to pd.read_csv to drift apart.
_GKG_COLUMNS = (
    (0, "GKGRECORDID"),
    (1, "DATE"),
    (3, "SourceCommonName"),
    (4, "DocumentIdentifier"),
    (7, "Themes"),
    (8, "V2Themes"),
    (15, "V2Tone"),
    (26, "Extras"),
)
GKG_USECOLS = [position for position, _ in _GKG_COLUMNS]
GKG_COL_NAMES = [name for _, name in _GKG_COLUMNS]

# Per-thread storage; _session() caches one requests.Session per worker here.
THREAD_LOCAL = threading.local()

def _compile_topics(configs):
    out = {}
    for tid, cfg in configs.items():
        anchors = [re.escape(str(x).lower()) for x in cfg["anchors"]]
        out[tid] = {
            "anchor_re": re.compile(
                r"(?:" + "|".join(sorted(anchors, key=len, reverse=True)) + r")",
                flags=re.IGNORECASE
            ),
            "phrase_items":   [(str(p).lower(), float(w)) for p, w in cfg["positive_phrases"].items()],
            "positive_items": [(str(k).lower(), float(v)) for k, v in cfg["positive_terms"].items()],
            "negative_items": [(str(k).lower(), float(v)) for k, v in cfg["negative_terms"].items()],
            "min_score":      float(cfg["min_score"]),
        }
    return out

PREPARED = _compile_topics(TOPIC_CONFIGS)

def _utc_midnight(day_str):
    d = datetime.strptime(day_str, "%Y-%m-%d")
    return d.replace(tzinfo=timezone.utc)

def _iter_tiles(start_utc, end_exclusive_utc):
    cur = start_utc.replace(minute=(start_utc.minute // 15) * 15, second=0, microsecond=0)
    while cur < end_exclusive_utc:
        yield cur
        cur += timedelta(minutes=15)

def _session():
    """Return the calling thread's ``requests.Session``, creating it on first use.

    The session is stored on ``THREAD_LOCAL`` so each thread gets its own. New
    sessions retry GET requests up to three times, with backoff, on HTTP
    429/500/502/503/504.
    """
    cached = getattr(THREAD_LOCAL, "session", None)
    if cached:
        return cached

    retry_policy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
    )
    fresh = requests.Session()
    for scheme in ("http://", "https://"):
        # One adapter per scheme, both sharing the same retry policy.
        fresh.mount(scheme, HTTPAdapter(max_retries=retry_policy))

    THREAD_LOCAL.session = fresh
    return fresh

def _norm_domain(val):
    s = str(val or "").strip().lower()
    if "://" in s:
        try:
            s = urlparse(s).netloc
        except Exception:
            pass
    if ":" in s:
        s = s.split(":", 1)[0]
    return s.lstrip("www.").strip(".")

def _extract_tag(extras, tag):
    m = re.search(fr"<{tag}>(.*?)</{tag}>", str(extras or ""), re.DOTALL)
    return m.group(1).strip() if m else ""

def _parse_ts(raw):
    digits = re.sub(r"\D", "", str(raw or ""))
    if len(digits) >= 14:
        try:
            return pd.to_datetime(digits[:14], format="%Y%m%d%H%M%S", utc=True)
        except Exception:
            pass
    return pd.NaT

def _tone_first(val):
    try:
        return float(str(val or "").split(",", 1)[0])
    except Exception:
        return float("nan")

def _score(text, topic_id):
    """Score how relevant ``text`` is to ``topic_id``; ``None`` means "not relevant".

    The text is lowercased and must match the topic's anchor regex. The score
    is then the sum of:
      * the weight of every positive term found in the text,
      * 1.2 x the weight of every positive phrase found in the text,
      * the (negative) weight of every negative term found in the text,
      * a flat 0.3 bonus if the text contains at least one digit.
    Terms and phrases are found with a plain substring check. The score is
    returned as a float only when it reaches the topic's ``min_score``.
    """
    prep = PREPARED[topic_id]
    haystack = str(text or "").lower()

    if prep["anchor_re"].search(haystack) is None:
        return None

    term_points = sum(weight for term, weight in prep["positive_items"] if term in haystack)
    phrase_points = sum(weight * 1.2 for phrase, weight in prep["phrase_items"] if phrase in haystack)
    penalty_points = sum(weight for term, weight in prep["negative_items"] if term in haystack)

    # Accumulate in the same order as the three sums above so float rounding
    # stays stable.
    total = term_points
    total += phrase_points
    total += penalty_points

    has_digit = any(ch.isdigit() for ch in haystack)
    if has_digit:
        total += 0.3

    if total >= prep["min_score"]:
        return float(total)
    return None

In [5]:

def _process_tile(ts_utc, start_utc, end_excl_utc, topics):
    """Download one 15-minute GKG tile and return its qualifying articles per topic.

    Parameters
    ----------
    ts_utc : datetime
        Tile timestamp; formatted as ``YYYYMMDDHHMMSS`` to build the file URL.
    start_utc, end_excl_utc : datetime
        Articles are kept only if their effective timestamp falls in
        ``[start_utc, end_excl_utc)``.
    topics : iterable of str
        Topic ids to evaluate; each must be a key of ``PREPARED``.

    Returns
    -------
    dict
        ``{topic_id: [record, ...]}`` where each record is a dict with the
        keys ``day_key``, ``use_ts``, ``source``, ``url_norm``,
        ``title_norm``, ``title``, ``V2Tone`` and ``score``. Topics without
        any qualifying article are omitted. An empty dict is returned when
        the download, unzip or parse fails, when the HTTP status is not 200,
        or when no row survives filtering.

    NOTE(review): failures and "nothing matched" both come back as ``{}``,
    so the caller cannot tell them apart.
    """
    url = f"{GDELT_V2_BASE_URL}/{ts_utc.strftime('%Y%m%d%H%M%S')}.gkg.csv.zip"
    try:
        resp = _session().get(url, timeout=20)
        if resp.status_code != 200:
            return {}
        # The archive is read entirely in memory; only its first member is used.
        with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
            with zf.open(zf.namelist()[0]) as f:
                # NOTE(review): latin-1 decoding never raises, but it would
                # garble non-ASCII titles if the feed is actually UTF-8 — confirm.
                raw = pd.read_csv(f, sep="\t", header=None, names=GKG_COL_NAMES,
                                  usecols=GKG_USECOLS, low_memory=False, encoding="latin-1")
    except Exception:
        # Best effort: any network, zip or parse problem drops the whole tile.
        return {}

    if raw is None or raw.empty:
        return {}

    df = raw.copy()
    # Parse the DATE column as YYYYMMDDHHMMSS; anything after a "." is cut off
    # first (presumably to cope with values read as floats — TODO confirm).
    df["gdelt_ts"] = pd.to_datetime(
        df["DATE"].fillna("").astype(str).str.split(".").str[0],
        format="%Y%m%d%H%M%S", errors="coerce", utc=True
    )
    df = df.dropna(subset=["gdelt_ts"])
    if df.empty:
        return {}

    df["Extras"] = df["Extras"].fillna("").astype(str)
    def _extract_both(extras):
        """Return (HTML-unescaped page title, raw publish-timestamp text) from Extras."""
        title = html.unescape(_extract_tag(extras, "PAGE_TITLE")).strip()
        pub   = _extract_tag(extras, "PAGE_PRECISEPUBTIMESTAMP")
        return title, pub
    extracted        = df["Extras"].map(_extract_both)
    df["title"]      = extracted.map(lambda x: x[0])
    df["pub_raw"]    = extracted.map(lambda x: x[1])

    # Drop rows without a title, then rows whose title is too short to score.
    df = df[df["title"].str.len() > 0].copy()
    df = df[df["title"].str.split().str.len() >= MIN_TITLE_WORDS].copy()
    if df.empty:
        return {}

    # Effective timestamp: the page's own publish time when it parses,
    # otherwise the GDELT record time.
    df["pub_ts"]  = df["pub_raw"].map(_parse_ts)
    df["use_ts"]  = df["pub_ts"].where(df["pub_ts"].notna(), df["gdelt_ts"])
    df["use_ts"]  = pd.to_datetime(df["use_ts"], errors="coerce", utc=True)
    df = df[df["use_ts"].notna()].copy()
    # Keep only articles inside the requested window (end is exclusive).
    mask = (df["use_ts"] >= start_utc) & (df["use_ts"] < end_excl_utc)
    df = df[mask].copy()
    if df.empty:
        return {}

    # The day bucket comes from the effective timestamp, so an article may be
    # assigned to a different day than the tile it was found in.
    df["day_key"] = df["use_ts"].dt.strftime("%Y-%m-%d")
    df["source"]  = df["SourceCommonName"].fillna("").astype(str).map(_norm_domain)
    # Lowercased URL and title are carried along as de-duplication keys.
    df["url_norm"]   = df["DocumentIdentifier"].fillna("").astype(str).str.strip().str.lower()
    df["title_norm"] = df["title"].str.strip().str.lower()
    df["V2Tone"]  = df["V2Tone"].fillna("").astype(str)
    # Text that topics are matched against: title plus both theme columns,
    # all lowercased.
    df["combined"] = (
        df["title_norm"] + " " +
        df["Themes"].fillna("").astype(str).str.lower() + " " +
        df["V2Themes"].fillna("").astype(str).str.lower()
    )

    result = {}
    for tid in topics:
        # Cheap vectorised anchor pre-filter before the per-row scoring.
        anchor_mask = df["combined"].str.contains(PREPARED[tid]["anchor_re"], na=False)
        sub = df[anchor_mask].copy()
        if sub.empty:
            continue
        # _score returns None for rows that are not relevant enough; those
        # become missing values and are dropped on the next line.
        sub["score"] = sub["combined"].map(lambda x: _score(x, tid))
        sub = sub[sub["score"].notna()].copy()
        if sub.empty:
            continue
        result[tid] = sub[[
            "day_key", "use_ts", "source",
            "url_norm", "title_norm", "title",
            "V2Tone", "score"
        ]].to_dict("records")
    return result

Run pipeline

This section runs the full tile collection process across the requested period. Progress is tracked while tiles are processed in parallel, and all qualifying article rows are accumulated by topic and day.

The output of this section is an in-memory accumulator that is later transformed into daily NSI frames.

In [6]:

# If you are running this on a personal machine, consider switching its power
# settings to performance mode; that may speed up the threaded download stage.

start_utc = _utc_midnight(START_DAY_UTC)
end_utc = _utc_midnight(END_DAY_UTC) + timedelta(days=1)  # END_DAY_UTC is inclusive
tiles = list(_iter_tiles(start_utc, end_utc))
# accumulator[topic_id][day_key] -> list of article records for that day
accumulator = {tid: {} for tid in TOPICS}

print(f"Period:   {START_DAY_UTC} to {END_DAY_UTC}")
print(f"Tiles:    {len(tiles):,}")
print(f"Workers:  {MAX_WORKERS}")

t0 = time.time()
completed = 0
ok = 0       # tiles that produced at least one qualifying article
miss = 0     # tiles that returned nothing (missing, unreadable or no matches)
errors = 0   # tiles whose worker raised an unexpected exception

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
    futures = {ex.submit(_process_tile, ts, start_utc, end_utc, TOPICS): ts for ts in tiles}
    for future in as_completed(futures):
        completed += 1
        try:
            # Future.result() re-raises whatever the worker raised. One bad
            # tile must not abort the run and discard everything collected
            # so far, so it is counted and skipped instead.
            topic_rows = future.result()
        except Exception as exc:
            errors += 1
            topic_rows = {}
            if errors <= 5:  # show only the first few to avoid flooding the output
                print(f"Tile {futures[future]:%Y-%m-%d %H:%M} failed: {exc!r}")
        else:
            if topic_rows:
                ok += 1
            else:
                miss += 1

        for tid, rows in topic_rows.items():
            for row in rows:
                accumulator[tid].setdefault(row["day_key"], []).append(row)

        if completed % PROGRESS_EVERY == 0 or completed == len(tiles):
            elapsed = time.time() - t0
            pct = completed / len(tiles) * 100
            print(
                f"Completed: {completed:>5,} | {len(tiles):,} "
                f"({pct:5.1f}%)  ok: {ok:,}  empty/miss: {miss:,}  elapsed: {elapsed:,.1f}s"
            )

run_summary = pd.DataFrame([
    {"Metric": "Start day", "Value": START_DAY_UTC},
    {"Metric": "End day", "Value": END_DAY_UTC},
    {"Metric": "Tiles requested", "Value": f"{len(tiles):,}"},
    {"Metric": "Tiles with data", "Value": f"{ok:,}"},
    {"Metric": "Tiles empty or missing", "Value": f"{miss:,}"},
    {"Metric": "Tiles failed", "Value": f"{errors:,}"},
    {"Metric": "Elapsed seconds", "Value": f"{time.time() - t0:,.1f}"},
])

print("Pipeline run summary")
display(run_summary)

Period:   2026-03-15 to 2026-04-01
Tiles:    1,728
Workers:  48
Completed:    96 | 1,728 (  5.6%)  ok: 96  empty/miss: 0  elapsed: 17.4s
Completed:   192 | 1,728 ( 11.1%)  ok: 192  empty/miss: 0  elapsed: 41.6s
Completed:   288 | 1,728 ( 16.7%)  ok: 288  empty/miss: 0  elapsed: 67.7s
Completed:   384 | 1,728 ( 22.2%)  ok: 384  empty/miss: 0  elapsed: 93.5s
Completed:   480 | 1,728 ( 27.8%)  ok: 480  empty/miss: 0  elapsed: 119.5s
Completed:   576 | 1,728 ( 33.3%)  ok: 576  empty/miss: 0  elapsed: 142.3s
Completed:   672 | 1,728 ( 38.9%)  ok: 672  empty/miss: 0  elapsed: 157.1s
Completed:   768 | 1,728 ( 44.4%)  ok: 768  empty/miss: 0  elapsed: 171.0s
Completed:   864 | 1,728 ( 50.0%)  ok: 864  empty/miss: 0  elapsed: 195.1s
Completed:   960 | 1,728 ( 55.6%)  ok: 960  empty/miss: 0  elapsed: 220.5s
Completed: 1,056 | 1,728 ( 61.1%)  ok: 1,056  empty/miss: 0  elapsed: 245.7s
Completed: 1,152 | 1,728 ( 66.7%)  ok: 1,152  empty/miss: 0  elapsed: 271.7s
Completed: 1,248 | 1,728 ( 72.2%)  ok: 1,248  empty/miss: 0  elapsed: 295.2s
Completed: 1,344 | 1,728 ( 77.8%)  ok: 1,344  empty/miss: 0  elapsed: 309.8s
Completed: 1,440 | 1,728 ( 83.3%)  ok: 1,440  empty/miss: 0  elapsed: 324.2s
Completed: 1,536 | 1,728 ( 88.9%)  ok: 1,536  empty/miss: 0  elapsed: 347.9s
Completed: 1,632 | 1,728 ( 94.4%)  ok: 1,632  empty/miss: 0  elapsed: 373.4s
Completed: 1,728 | 1,728 (100.0%)  ok: 1,728  empty/miss: 0  elapsed: 395.8s
Pipeline run summary

	Metric	Value
0	Start day	2026-03-15
1	End day	2026-04-01
2	Tiles requested	1,728
3	Tiles with data	1,728
4	Tiles empty or missing	0
5	Elapsed seconds	395.8

Build daily NSI and save to CSV

This stage converts the accumulated article rows into daily topic-level NSI data.

For each topic and day, the pipeline:

ranks articles by relevance score and timestamp
removes duplicate URLs and duplicate titles
keeps the top daily articles up to the configured daily limit
converts article tone into a daily News Sentiment Index: (positive articles − negative articles) / (positive articles + negative articles)
computes a 7-day moving average
saves the result as a per-topic CSV

In [7]:

def _build_nsi(accumulator, tid, daily_limit):
    """Build the daily NSI table for one topic from the accumulated article rows.

    For every day the rows are sorted by score (then by most recent
    timestamp), de-duplicated by URL and by title, and cut to ``daily_limit``.
    Articles without a parsable tone are then dropped and each day is
    aggregated.

    Parameters
    ----------
    accumulator : dict
        ``{topic_id: {day_key: [record, ...]}}`` as filled by the pipeline
        run. Records need the keys ``day_key``, ``use_ts``, ``score``,
        ``url_norm``, ``title_norm`` and ``V2Tone``.
    tid : str
        Topic id to build.
    daily_limit : int
        Maximum number of articles kept per day before tone filtering.

    Returns
    -------
    pandas.DataFrame
        One row per day, sorted by ``day_key``, with the columns
        ``day_key``, ``article_count``, ``positive_count``,
        ``negative_count``, ``mean_tone``, ``median_tone``, ``nsi``,
        ``nsi_7d``, ``mean_tone_7d`` and ``topic_id``. An empty DataFrame is
        returned when the topic has no usable articles.

    Notes
    -----
    ``nsi`` is ``(positive - negative) / (positive + negative)`` and is 0.0
    on days with no positive or negative article. The ``*_7d`` columns
    average over the trailing seven calendar days, including the current
    day, so a gap in the data does not stretch the window across more than
    a week.
    """
    day_frames = []
    for rows in accumulator.get(tid, {}).values():
        day_df = pd.DataFrame(rows)
        # url_norm and title_norm are already lowercased by _process_tile.
        day_df["use_dt_sort"] = pd.to_datetime(day_df["use_ts"], errors="coerce", utc=True)
        day_df = day_df.sort_values(["score", "use_dt_sort"], ascending=[False, False])
        day_df = day_df.drop_duplicates(subset=["url_norm"], keep="first")
        day_df = day_df.drop_duplicates(subset=["title_norm"], keep="first")
        day_frames.append(day_df.head(daily_limit))

    if not day_frames:
        return pd.DataFrame()

    df = pd.concat(day_frames, ignore_index=True)
    df["tone"] = df["V2Tone"].map(_tone_first)
    df = df[df["tone"].notna()].copy()
    if df.empty:
        # Nothing left to aggregate; same "no data" signal as above.
        return pd.DataFrame()

    daily = (
        df.groupby("day_key", as_index=False)
        .agg(
            article_count=("tone", "size"),
            positive_count=("tone", lambda s: int((s > 0).sum())),
            negative_count=("tone", lambda s: int((s < 0).sum())),
            mean_tone=("tone", "mean"),
            median_tone=("tone", "median"),
        )
        .sort_values("day_key")
        .reset_index(drop=True)
    )

    denom = daily["positive_count"] + daily["negative_count"]
    daily["nsi"] = 0.0
    valid = denom > 0
    daily.loc[valid, "nsi"] = (
        (daily.loc[valid, "positive_count"] - daily.loc[valid, "negative_count"]) / denom[valid]
    )

    # Roll over calendar time rather than over row count. rolling(7) averages
    # the last seven *rows*, which silently becomes more than a week whenever
    # a day is missing. With no gaps both forms give the same values.
    day_index = pd.DatetimeIndex(pd.to_datetime(daily["day_key"], format="%Y-%m-%d"))
    for source_col, target_col in (("nsi", "nsi_7d"), ("mean_tone", "mean_tone_7d")):
        by_day = pd.Series(daily[source_col].to_numpy(dtype=float), index=day_index)
        daily[target_col] = by_day.rolling("7D", min_periods=1).mean().to_numpy()

    daily["topic_id"] = tid
    return daily

In [8]:

# Build the daily NSI frame for every topic, write one CSV per topic that has
# data, and collect a one-line summary per topic for display.
nsi_frames = {}
save_rows = []

for tid in TOPICS:
    df = _build_nsi(accumulator, tid, DAILY_LIMIT)
    nsi_frames[tid] = df

    if not df.empty:
        path = Path(OUTPUT_DIR) / f"nsi_{tid}.csv"
        df.to_csv(path, index=False)

        save_rows.append({
            "Topic": tid,
            "Days": len(df),
            "Avg articles": round(df["article_count"].mean(), 1),
            "Avg NSI": round(df["nsi"].mean(), 4),
            "Latest NSI": round(df["nsi"].iloc[-1], 4),
            "Path": str(path),
        })
    else:
        save_rows.append({
            "Topic": tid,
            "Days": 0,
            "Avg articles": None,
            "Avg NSI": None,
            "Latest NSI": None,
            "Path": "no data",
        })

# pd.concat raises ValueError on an empty list, so a run in which no topic
# produced data must skip the concat (and the combined file) instead of
# crashing the cell.
non_empty_frames = [
    frame for frame in nsi_frames.values()
    if frame is not None and not frame.empty
]
combined_path = Path(OUTPUT_DIR) / "nsi_combined.csv"
if non_empty_frames:
    combined = pd.concat(non_empty_frames, ignore_index=True)
    combined.to_csv(combined_path, index=False)
    combined_location = str(combined_path)
else:
    combined = pd.DataFrame()
    combined_location = "no data"

save_summary = pd.DataFrame(save_rows)

print("Saved topic outputs")
display(save_summary)

combined_summary = pd.DataFrame([
    {"Metric": "Combined output path", "Value": combined_location},
    {"Metric": "Combined rows", "Value": f"{len(combined):,}"}
])

display(combined_summary)

Saved topic outputs

	Topic	Days	Avg articles	Avg NSI	Latest NSI	Path
0	oil	18	100.0	-0.8451	-0.8571	nsi_output/nsi_oil.csv
1	crypto	18	100.0	-0.0805	-0.0909	nsi_output/nsi_crypto.csv
2	cybersecurity	18	100.0	-0.7007	-0.7113	nsi_output/nsi_cybersecurity.csv
3	semiconductors	18	84.8	0.3533	0.5824	nsi_output/nsi_semiconductors.csv

	Metric	Value
0	Combined output path	nsi_output/nsi_combined.csv
1	Combined rows	72

Results

Each topic gets a chart showing:

daily NSI (bars)
7-day moving average of NSI (line)

A compact summary table across topics is also displayed at the end.

In [9]:

# Colour tokens shared by every chart in this notebook.
PALETTE = dict(
    bar="#c8cdd6",
    line="#2563eb",
    bg="#ffffff",
    grid="#eef0f7",
    text="#0f172a",
    text_light="#64748b",
    border="#e2e8f0",
)

# Global matplotlib defaults, applied in themed groups.

# Typography
plt.rcParams.update({
    "font.family": "sans-serif",
    "font.sans-serif": ["DejaVu Sans", "Liberation Sans", "Arial"],
    "font.size": 11,
})

# Canvas, frame and grid
plt.rcParams.update({
    "axes.facecolor": PALETTE["bg"],
    "figure.facecolor": PALETTE["bg"],
    "axes.edgecolor": PALETTE["border"],
    "axes.linewidth": 0.6,
    "axes.grid": True,
    "grid.color": PALETTE["grid"],
    "grid.linewidth": 0.5,
})

# Ticks and spines
plt.rcParams.update({
    "xtick.color": PALETTE["text_light"],
    "ytick.color": PALETTE["text_light"],
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
    "axes.spines.top": False,
    "axes.spines.right": False,
})

In [10]:

import matplotlib.dates as mdates

def _plot_topic_nsi(tid, df):
    """Draw one topic's daily NSI as bars with its 7-day moving average as a line.

    Parameters
    ----------
    tid : str
        Topic id; used (capitalised) in the chart title.
    df : pandas.DataFrame
        Daily NSI frame with the columns ``day_key`` (parseable dates),
        ``nsi`` and ``nsi_7d``.

    The figure is shown and then closed. It is built through an explicit
    ``fig``/``ax`` pair rather than pyplot's implicit "current figure", and
    closing it keeps 300-dpi figures from piling up in memory when this is
    called in a loop on a backend that does not close figures itself.
    """
    dates = pd.to_datetime(df["day_key"])

    fig, ax = plt.subplots(figsize=(8, 4.5), dpi=300)

    ax.bar(
        dates,
        df["nsi"],
        color=PALETTE["bar"],
        width=0.8,
        alpha=0.85,
        label="Daily NSI",
    )
    ax.plot(
        dates,
        df["nsi_7d"],
        color=PALETTE["line"],
        linewidth=1.8,
        label="7-day MA",
    )
    # Zero line separates net-positive from net-negative days.
    ax.axhline(0, color=PALETTE["border"], linewidth=0.8)

    ax.set_ylabel("NSI")
    ax.set_title(f"{tid.capitalize()} news sentiment index")

    ax.yaxis.set_major_formatter(mticker.FormatStrFormatter("%.2f"))
    ax.xaxis.set_major_locator(mdates.AutoDateLocator(minticks=4, maxticks=6))
    ax.xaxis.set_major_formatter(mdates.DateFormatter("%b %d"))

    plt.setp(ax.get_xticklabels(), rotation=20, ha="right")
    ax.legend(fontsize=9)
    ax.tick_params(length=0)
    fig.tight_layout()
    plt.show()
    plt.close(fig)


import warnings
# Silence matplotlib UserWarnings for the charts below.
warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib")

# One snapshot table plus one chart per topic; topics without data are
# reported and skipped.
for tid in TOPICS:
    df = nsi_frames.get(tid)

    has_data = df is not None and not df.empty
    if not has_data:
        print(f"{tid}: no data to plot")
        continue

    snapshot_pairs = [
        ("Topic", tid),
        ("Days", f"{len(df):,}"),
        ("Latest NSI", f"{df['nsi'].iloc[-1]:+.3f}"),
        ("Average NSI", f"{df['nsi'].mean():+.3f}"),
        ("Average article count", f"{df['article_count'].mean():.1f}"),
    ]
    topic_snapshot = pd.DataFrame(
        [{"Metric": metric, "Value": value} for metric, value in snapshot_pairs]
    )

    display(topic_snapshot)
    _plot_topic_nsi(tid, df)

	Metric	Value
0	Topic	oil
1	Days	18
2	Latest NSI	-0.857
3	Average NSI	-0.845
4	Average article count	100.0

No description has been provided for this image

	Metric	Value
0	Topic	crypto
1	Days	18
2	Latest NSI	-0.091
3	Average NSI	-0.080
4	Average article count	100.0

	Metric	Value
0	Topic	cybersecurity
1	Days	18
2	Latest NSI	-0.711
3	Average NSI	-0.701
4	Average article count	100.0

	Metric	Value
0	Topic	semiconductors
1	Days	18
2	Latest NSI	+0.582
3	Average NSI	+0.353
4	Average article count	84.8

In [11]:

# Compact cross-topic summary: one row per topic that produced data.
rows = []
for tid in TOPICS:
    df = nsi_frames.get(tid)
    if df is not None and not df.empty:
        nsi_series = df["nsi"]
        rows.append({
            "Topic": tid,
            "Days": len(df),
            "Avg articles": round(df["article_count"].mean(), 1),
            "Avg NSI": round(nsi_series.mean(), 4),
            "Latest NSI": round(nsi_series.iloc[-1], 4),
            "Avg tone": round(df["mean_tone"].mean(), 3),
        })

summary = pd.DataFrame(rows)

print("Topic summary")
display(summary)

Topic summary

	Topic	Days	Avg articles	Avg NSI	Latest NSI	Avg tone
0	oil	18	100.0	-0.8451	-0.8571	-3.469
1	crypto	18	100.0	-0.0805	-0.0909	-0.755
2	cybersecurity	18	100.0	-0.7007	-0.7113	-3.169
3	semiconductors	18	84.8	0.3533	0.5824	0.721