Ctrl K

News Sentiment Index

This notebook builds a daily News Sentiment Index (NSI) from GDELT GKG v2 news coverage for a small set of themes: oil, crypto, cybersecurity, and semiconductors.

The pipeline downloads 15-minute GDELT tiles, filters articles by topic relevance, extracts article-level tone, and aggregates the result into a daily topic-level sentiment series. Each topic is saved as a local CSV file, and the notebook also produces a combined summary table and topic charts.

Topics covered

  • oil
  • crypto
  • cybersecurity
  • semiconductors

Outputs

  • per-topic CSV files saved locally
  • one combined CSV file across all topics
  • daily NSI and 7-day moving average plots

Configuration

The pipeline processes all 15-minute tiles between the start and end dates, scores article relevance by topic, and keeps a limited number of top articles per topic per day before computing the daily NSI.

In [1]:
# Date range to process, given as UTC calendar days in YYYY-MM-DD form.
START_DAY_UTC   = "2026-03-15"
END_DAY_UTC     = "2026-04-01"   # inclusive
# Topic ids to build; each one is looked up in the compiled topic configs.
TOPICS          = ["oil", "crypto", "cybersecurity", "semiconductors"]
# Size of the thread pool that downloads and parses tiles.
MAX_WORKERS     = 48
DAILY_LIMIT     = 100            # max articles per topic per day
# Titles with fewer whitespace-separated words than this are discarded.
MIN_TITLE_WORDS = 4
OUTPUT_DIR      = "nsi_output"   # local directory for CSV files
PROGRESS_EVERY  = 96             # print progress every N tiles (~1 day)

Imports

In [2]:
import io
import re
import html
import time
import zipfile
import threading
from pathlib import Path
from datetime import datetime, timedelta, timezone
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from urllib3.util import Retry
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker

# Create the output directory up front. parents=True lets OUTPUT_DIR be a
# nested path (e.g. "results/nsi") without failing on a missing parent.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

Topic configurations

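Each topic is defined with four layers of matching logic plus a score threshold:

  • anchor terms that identify whether an article is relevant at all
  • positive terms that contribute to a topic relevance score
  • positive phrases that receive additional weight when they appear exactly
  • negative terms that subtract from the score to filter out look-alike uses (for example "olive" or "cooking oil" for the oil topic)
  • a minimum score that an article must reach to be kept for the topic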

These configurations are used to decide whether an article belongs to a topic and how strongly it should be ranked relative to other candidate articles from the same day.

In [3]:
# Per-topic matching configuration, keyed by topic id.
#   anchors          - terms of which at least one must occur for an article to be considered
#   positive_terms   - term -> weight added to the relevance score when the term occurs
#   positive_phrases - phrase -> weight added (with an extra multiplier in _score) when the phrase occurs
#   negative_terms   - term -> negative weight added when the term occurs
#   min_score        - articles scoring below this value are dropped for the topic
# Matching is done by substring on lower-cased text (see _score), so short
# terms also match inside longer words.
TOPIC_CONFIGS = {
    "oil": {
        "anchors": [
            "oil", "crude", "brent", "wti", "opec", "opec+", "eia", "iea",
            "spr", "strategic petroleum reserve", "refinery", "refineries",
            "diesel", "gasoline", "distillate", "inventory", "inventories"
        ],
        "positive_terms": {
            "oil": 2.0, "crude": 2.2, "brent": 2.0, "wti": 2.0,
            "opec": 2.4, "opec+": 2.6, "eia": 2.4, "iea": 2.3,
            "inventory": 1.8, "inventories": 1.8, "refinery": 1.5,
            "refineries": 1.5, "spr": 2.0, "sanctions": 1.5,
            "exports": 1.2, "production": 1.6, "output": 1.4,
            "demand": 1.2, "supply": 1.2, "outage": 1.8,
            "shutdown": 1.7, "attack": 1.5, "pipeline": 1.4, "terminal": 1.3
        },
        "positive_phrases": {
            "oil prices": 1.5, "crude oil": 2.0, "eia inventories": 3.5,
            "opec+": 3.0, "strategic petroleum reserve": 3.0,
            "oil demand": 2.2, "oil supply": 2.2
        },
        "negative_terms": {
            "olive": -5.0, "essential oil": -6.0, "fish oil": -6.0,
            "cooking oil": -6.0, "beauty": -3.0, "cosmetic": -3.0
        },
        "min_score": 3.0
    },
    "crypto": {
        "anchors": [
            "crypto", "cryptocurrency", "cryptocurrencies", "bitcoin", "btc",
            "ethereum", "ether", "eth", "stablecoin", "stablecoins", "token",
            "tokens", "blockchain", "binance", "coinbase", "sec crypto",
            "bitcoin etf", "crypto etf"
        ],
        "positive_terms": {
            "crypto": 2.0, "cryptocurrency": 2.2, "cryptocurrencies": 2.2,
            "bitcoin": 2.8, "btc": 2.4, "ethereum": 2.6, "ether": 2.2,
            "eth": 2.0, "stablecoin": 2.4, "stablecoins": 2.4,
            "token": 1.4, "tokens": 1.4, "blockchain": 1.6,
            "binance": 1.8, "coinbase": 1.8, "etf": 1.2,
            "exchange": 1.1, "sec": 1.0, "regulation": 1.2,
            "hack": 1.2, "custody": 1.1
        },
        "positive_phrases": {
            "bitcoin etf": 3.2, "crypto etf": 2.8, "crypto exchange": 2.0,
            "digital asset": 2.0, "digital assets": 2.0,
            "crypto regulation": 2.6, "bitcoin treasury": 2.0
        },
        "negative_terms": {
            "cryptography": -5.0, "encryption": -2.5,
            "decrypt": -3.0, "cipher": -3.0
        },
        "min_score": 3.0
    },
    "cybersecurity": {
        "anchors": [
            "cyber", "cybersecurity", "cyberattack", "cyberattacks",
            "ransomware", "malware", "data breach", "breach", "breaches",
            "hack", "hacked", "hacking", "ddos", "exploit", "vulnerability",
            "zero-day", "zero day", "phishing", "infosec"
        ],
        "positive_terms": {
            "cyber": 2.0, "cybersecurity": 2.4, "cyberattack": 2.8,
            "cyberattacks": 2.8, "ransomware": 3.0, "malware": 2.5,
            "breach": 2.4, "breaches": 2.4, "hack": 2.0,
            "hacked": 2.0, "hacking": 2.0, "ddos": 2.2,
            "exploit": 1.8, "vulnerability": 1.8, "zero-day": 2.2,
            "phishing": 1.8, "cisa": 1.4, "nsa": 0.8,
            "microsoft": 0.6, "firewall": 1.0
        },
        "positive_phrases": {
            "data breach": 3.2, "critical vulnerability": 2.8,
            "zero day": 3.0, "cyber attack": 3.0, "cyber attacks": 3.0,
            "ransomware attack": 3.2, "security breach": 2.8
        },
        "negative_terms": {
            "cybertruck": -6.0, "video game": -2.5, "gaming": -2.0
        },
        "min_score": 3.0
    },
    "semiconductors": {
        "anchors": [
            "semiconductor", "semiconductors", "chip", "chips", "chipmaker",
            "chipmakers", "foundry", "foundries", "fab", "fabs", "wafer",
            "wafers", "tsmc", "intel", "nvidia", "amd", "asml", "hbm", "dram",
            "memory chip", "memory chips", "lithography"
        ],
        "positive_terms": {
            "semiconductor": 2.8, "semiconductors": 2.8, "chip": 1.8,
            "chips": 1.8, "chipmaker": 2.0, "chipmakers": 2.0,
            "foundry": 2.2, "foundries": 2.2, "fab": 1.8, "fabs": 1.8,
            "wafer": 1.5, "wafers": 1.5, "tsmc": 2.2, "intel": 1.5,
            "nvidia": 1.6, "amd": 1.4, "asml": 1.8, "hbm": 1.8,
            "dram": 1.8, "memory": 1.0, "lithography": 2.0,
            "export controls": 1.8, "capacity": 1.2
        },
        "positive_phrases": {
            "semiconductor industry": 3.0, "memory chip": 2.6,
            "memory chips": 2.6, "chip export": 2.2, "chip exports": 2.2,
            "chip demand": 2.2, "chip supply": 2.2
        },
        "negative_terms": {
            "potato chips": -7.0, "casino chips": -7.0,
            "tortilla chips": -7.0, "fish and chips": -7.0
        },
        "min_score": 3.0
    }
}
In [4]:
# Base URL under which each tile is fetched as <YYYYMMDDHHMMSS>.gkg.csv.zip.
# NOTE(review): plain http:// - consider https:// if the host serves it; confirm before changing.
GDELT_V2_BASE_URL = "http://data.gdeltproject.org/gdeltv2"
# Zero-based positions of the columns read from each tab-separated GKG file ...
GKG_USECOLS   = [0, 1, 3, 4, 7, 8, 15, 26]
# ... and the names given to those columns, in the same order.
GKG_COL_NAMES = ["GKGRECORDID", "DATE", "SourceCommonName", "DocumentIdentifier",
                  "Themes", "V2Themes", "V2Tone", "Extras"]
# Per-thread storage; _session() caches one requests.Session per worker thread here.
THREAD_LOCAL  = threading.local()

def _compile_topics(configs):
    out = {}
    for tid, cfg in configs.items():
        anchors = [re.escape(str(x).lower()) for x in cfg["anchors"]]
        out[tid] = {
            "anchor_re": re.compile(
                r"(?:" + "|".join(sorted(anchors, key=len, reverse=True)) + r")",
                flags=re.IGNORECASE
            ),
            "phrase_items":   [(str(p).lower(), float(w)) for p, w in cfg["positive_phrases"].items()],
            "positive_items": [(str(k).lower(), float(v)) for k, v in cfg["positive_terms"].items()],
            "negative_items": [(str(k).lower(), float(v)) for k, v in cfg["negative_terms"].items()],
            "min_score":      float(cfg["min_score"]),
        }
    return out

# Compile the topic configs once; _score and _process_tile look topics up in this mapping.
PREPARED = _compile_topics(TOPIC_CONFIGS)

def _utc_midnight(day_str):
    d = datetime.strptime(day_str, "%Y-%m-%d")
    return d.replace(tzinfo=timezone.utc)

def _iter_tiles(start_utc, end_exclusive_utc):
    cur = start_utc.replace(minute=(start_utc.minute // 15) * 15, second=0, microsecond=0)
    while cur < end_exclusive_utc:
        yield cur
        cur += timedelta(minutes=15)

def _session():
    """Return the calling thread's HTTP session, creating it on first use.

    The session is cached on ``THREAD_LOCAL`` so each worker thread gets its
    own instance. New sessions retry GET requests up to 3 times with backoff
    on the status codes 429, 500, 502, 503 and 504, for both URL schemes.
    """
    cached = getattr(THREAD_LOCAL, "session", None)
    if not cached:
        retry_policy = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["GET"],
        )
        cached = requests.Session()
        for scheme in ("http://", "https://"):
            # One adapter per scheme, both sharing the same retry policy.
            cached.mount(scheme, HTTPAdapter(max_retries=retry_policy))
        THREAD_LOCAL.session = cached
    return cached

def _norm_domain(val):
    s = str(val or "").strip().lower()
    if "://" in s:
        try:
            s = urlparse(s).netloc
        except Exception:
            pass
    if ":" in s:
        s = s.split(":", 1)[0]
    return s.lstrip("www.").strip(".")

def _extract_tag(extras, tag):
    m = re.search(fr"<{tag}>(.*?)</{tag}>", str(extras or ""), re.DOTALL)
    return m.group(1).strip() if m else ""

def _parse_ts(raw):
    digits = re.sub(r"\D", "", str(raw or ""))
    if len(digits) >= 14:
        try:
            return pd.to_datetime(digits[:14], format="%Y%m%d%H%M%S", utc=True)
        except Exception:
            pass
    return pd.NaT

def _tone_first(val):
    try:
        return float(str(val or "").split(",", 1)[0])
    except Exception:
        return float("nan")

def _score(text, topic_id):
    """Compute the relevance score of ``text`` for one topic.

    The text is lower-cased and must match the topic's anchor regex. The score
    is the sum of the weights of all positive terms found as substrings, plus
    1.2x the weights of all phrases found, plus the (negative) weights of all
    negative terms found, plus 0.3 when the text contains any digit.

    Returns the score as a float, or ``None`` when no anchor matches or the
    score is below the topic's ``min_score``.
    """
    topic = PREPARED[topic_id]
    haystack = str(text or "").lower()
    if topic["anchor_re"].search(haystack) is None:
        return None

    # The three partial sums are kept separate and combined in a fixed order
    # so the floating-point result does not depend on accumulation order.
    term_total = 0
    for term, weight in topic["positive_items"]:
        if term in haystack:
            term_total += weight

    phrase_total = 0
    for phrase, weight in topic["phrase_items"]:
        if phrase in haystack:
            phrase_total += weight * 1.2

    penalty_total = 0
    for term, weight in topic["negative_items"]:
        if term in haystack:
            penalty_total += weight

    score = term_total
    score += phrase_total
    score += penalty_total

    has_digit = False
    for ch in haystack:
        if ch.isdigit():
            has_digit = True
            break
    if has_digit:
        score += 0.3

    if score >= topic["min_score"]:
        return float(score)
    return None
In [5]:
def _process_tile(ts_utc, start_utc, end_excl_utc, topics):
    """Download one 15-minute GKG tile and return its qualifying rows per topic.

    Parameters
    ----------
    ts_utc : datetime
        Tile timestamp; formatted as ``YYYYMMDDHHMMSS`` to build the file URL.
    start_utc, end_excl_utc : datetime
        Articles are kept only when their timestamp lies in
        ``[start_utc, end_excl_utc)``.
    topics : iterable of str
        Topic ids to evaluate; each must be a key of ``PREPARED``.

    Returns
    -------
    dict
        Mapping of topic id -> list of row dicts with the keys ``day_key``,
        ``use_ts``, ``source``, ``url_norm``, ``title_norm``, ``title``,
        ``V2Tone`` and ``score``. Topics without qualifying rows are omitted.
        An empty dict is returned when the download does not answer with
        status 200, when downloading or parsing raises, or when no row
        survives the filters.

    NOTE(review): download/parse failures and genuinely empty tiles both come
    back as ``{}``, so the caller cannot tell them apart.
    """
    url = f"{GDELT_V2_BASE_URL}/{ts_utc.strftime('%Y%m%d%H%M%S')}.gkg.csv.zip"
    try:
        resp = _session().get(url, timeout=20)
        if resp.status_code != 200:
            return {}
        # The archive is read from memory; only its first member is parsed.
        with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
            with zf.open(zf.namelist()[0]) as f:
                raw = pd.read_csv(f, sep="\t", header=None, names=GKG_COL_NAMES,
                                  usecols=GKG_USECOLS, low_memory=False, encoding="latin-1")
    except Exception:
        # Best effort: any network, zip or CSV error makes the tile count as empty.
        return {}

    if raw is None or raw.empty:
        return {}

    df = raw.copy()
    # Parse the DATE column (text before any ".") as a UTC timestamp; rows
    # whose value does not parse are dropped.
    df["gdelt_ts"] = pd.to_datetime(
        df["DATE"].fillna("").astype(str).str.split(".").str[0],
        format="%Y%m%d%H%M%S", errors="coerce", utc=True
    )
    df = df.dropna(subset=["gdelt_ts"])
    if df.empty:
        return {}

    df["Extras"] = df["Extras"].fillna("").astype(str)
    def _extract_both(extras):
        """Return (HTML-unescaped page title, raw publication timestamp) from Extras."""
        title = html.unescape(_extract_tag(extras, "PAGE_TITLE")).strip()
        pub   = _extract_tag(extras, "PAGE_PRECISEPUBTIMESTAMP")
        return title, pub
    extracted        = df["Extras"].map(_extract_both)
    df["title"]      = extracted.map(lambda x: x[0])
    df["pub_raw"]    = extracted.map(lambda x: x[1])

    # Keep only rows that have a title of at least MIN_TITLE_WORDS words.
    df = df[df["title"].str.len() > 0].copy()
    df = df[df["title"].str.split().str.len() >= MIN_TITLE_WORDS].copy()
    if df.empty:
        return {}

    # Prefer the page's own publication timestamp; fall back to the DATE
    # column when it is missing or unparseable.
    df["pub_ts"]  = df["pub_raw"].map(_parse_ts)
    df["use_ts"]  = df["pub_ts"].where(df["pub_ts"].notna(), df["gdelt_ts"])
    df["use_ts"]  = pd.to_datetime(df["use_ts"], errors="coerce", utc=True)
    df = df[df["use_ts"].notna()].copy()
    # Restrict to the requested period (end bound exclusive).
    mask = (df["use_ts"] >= start_utc) & (df["use_ts"] < end_excl_utc)
    df = df[mask].copy()
    if df.empty:
        return {}

    # day_key is the calendar day of use_ts, which may differ from the tile's own day.
    df["day_key"] = df["use_ts"].dt.strftime("%Y-%m-%d")
    df["source"]  = df["SourceCommonName"].fillna("").astype(str).map(_norm_domain)
    df["url_norm"]   = df["DocumentIdentifier"].fillna("").astype(str).str.strip().str.lower()
    df["title_norm"] = df["title"].str.strip().str.lower()
    df["V2Tone"]  = df["V2Tone"].fillna("").astype(str)
    # Text used for topic matching: title plus both theme columns, lower-cased.
    df["combined"] = (
        df["title_norm"] + " " +
        df["Themes"].fillna("").astype(str).str.lower() + " " +
        df["V2Themes"].fillna("").astype(str).str.lower()
    )

    result = {}
    for tid in topics:
        # Cheap vectorised anchor pre-filter before the per-row scoring.
        anchor_mask = df["combined"].str.contains(PREPARED[tid]["anchor_re"], na=False)
        sub = df[anchor_mask].copy()
        if sub.empty:
            continue
        # _score returns None below the topic threshold; those rows are dropped.
        sub["score"] = sub["combined"].map(lambda x: _score(x, tid))
        sub = sub[sub["score"].notna()].copy()
        if sub.empty:
            continue
        result[tid] = sub[[
            "day_key", "use_ts", "source",
            "url_norm", "title_norm", "title",
            "V2Tone", "score"
        ]].to_dict("records")
    return result

Run pipeline

This section runs the full tile collection process across the requested period. Progress is tracked while tiles are processed in parallel, and all qualifying article rows are accumulated by topic and day.

The output of this section is an in-memory accumulator that is later transformed into daily NSI frames.

In [6]:
# If you are running this in a personal environment, consider switching the
# machine's power settings to performance mode; this may make the threaded
# data collection run faster.

# Processing window: START_DAY_UTC 00:00 up to (but excluding) the day after END_DAY_UTC.
start_utc = _utc_midnight(START_DAY_UTC)
end_utc = _utc_midnight(END_DAY_UTC) + timedelta(days=1)
tiles = list(_iter_tiles(start_utc, end_utc))
# accumulator[topic_id][day_key] -> list of article row dicts
accumulator = {tid: {} for tid in TOPICS}

print(f"Period:   {START_DAY_UTC} to {END_DAY_UTC}")
print(f"Tiles:    {len(tiles):,}")
print(f"Workers:  {MAX_WORKERS}")

t0 = time.time()
completed = 0
ok = 0
miss = 0
# NOTE(review): `errors` is never updated in the code shown here; failed tiles
# are returned as empty results by _process_tile and counted under `miss`.
errors = 0

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
    # Submit every tile up front; the dict maps each future to its tile timestamp.
    futures = {ex.submit(_process_tile, ts, start_utc, end_utc, TOPICS): ts for ts in tiles}
    for future in as_completed(futures):
        completed += 1
        topic_rows = future.result()

        if topic_rows:
            ok += 1
            # Group rows by the article's own day, not by the tile's timestamp.
            for tid, rows in topic_rows.items():
                if not rows:
                    continue
                for row in rows:
                    dk = row["day_key"]
                    accumulator[tid].setdefault(dk, []).append(row)
        else:
            miss += 1

        # Progress line every PROGRESS_EVERY tiles and once more at the end.
        if completed % PROGRESS_EVERY == 0 or completed == len(tiles):
            elapsed = time.time() - t0
            pct = completed / len(tiles) * 100
            print(
                f"Completed: {completed:>5,} | {len(tiles):,} "
                f"({pct:5.1f}%)  ok: {ok:,}  empty/miss: {miss:,}  elapsed: {elapsed:,.1f}s"
            )

# Small table summarising the run for display below the progress log.
run_summary = pd.DataFrame([
    {"Metric": "Start day", "Value": START_DAY_UTC},
    {"Metric": "End day", "Value": END_DAY_UTC},
    {"Metric": "Tiles requested", "Value": f"{len(tiles):,}"},
    {"Metric": "Tiles with data", "Value": f"{ok:,}"},
    {"Metric": "Tiles empty or missing", "Value": f"{miss:,}"},
    {"Metric": "Elapsed seconds", "Value": f"{time.time() - t0:,.1f}"},
])

print("Pipeline run summary")
display(run_summary)
Period:   2026-03-15 to 2026-04-01
Tiles:    1,728
Workers:  48
Completed:    96 | 1,728 (  5.6%)  ok: 96  empty/miss: 0  elapsed: 17.4s
Completed:   192 | 1,728 ( 11.1%)  ok: 192  empty/miss: 0  elapsed: 41.6s
Completed:   288 | 1,728 ( 16.7%)  ok: 288  empty/miss: 0  elapsed: 67.7s
Completed:   384 | 1,728 ( 22.2%)  ok: 384  empty/miss: 0  elapsed: 93.5s
Completed:   480 | 1,728 ( 27.8%)  ok: 480  empty/miss: 0  elapsed: 119.5s
Completed:   576 | 1,728 ( 33.3%)  ok: 576  empty/miss: 0  elapsed: 142.3s
Completed:   672 | 1,728 ( 38.9%)  ok: 672  empty/miss: 0  elapsed: 157.1s
Completed:   768 | 1,728 ( 44.4%)  ok: 768  empty/miss: 0  elapsed: 171.0s
Completed:   864 | 1,728 ( 50.0%)  ok: 864  empty/miss: 0  elapsed: 195.1s
Completed:   960 | 1,728 ( 55.6%)  ok: 960  empty/miss: 0  elapsed: 220.5s
Completed: 1,056 | 1,728 ( 61.1%)  ok: 1,056  empty/miss: 0  elapsed: 245.7s
Completed: 1,152 | 1,728 ( 66.7%)  ok: 1,152  empty/miss: 0  elapsed: 271.7s
Completed: 1,248 | 1,728 ( 72.2%)  ok: 1,248  empty/miss: 0  elapsed: 295.2s
Completed: 1,344 | 1,728 ( 77.8%)  ok: 1,344  empty/miss: 0  elapsed: 309.8s
Completed: 1,440 | 1,728 ( 83.3%)  ok: 1,440  empty/miss: 0  elapsed: 324.2s
Completed: 1,536 | 1,728 ( 88.9%)  ok: 1,536  empty/miss: 0  elapsed: 347.9s
Completed: 1,632 | 1,728 ( 94.4%)  ok: 1,632  empty/miss: 0  elapsed: 373.4s
Completed: 1,728 | 1,728 (100.0%)  ok: 1,728  empty/miss: 0  elapsed: 395.8s
Pipeline run summary
Metric Value
0 Start day 2026-03-15
1 End day 2026-04-01
2 Tiles requested 1,728
3 Tiles with data 1,728
4 Tiles empty or missing 0
5 Elapsed seconds 395.8

Build daily NSI and save to CSV

This stage converts the accumulated article rows into daily topic-level NSI data.

For each topic and day, the pipeline:

  • ranks articles by relevance score and timestamp
  • removes duplicate URLs and duplicate titles
  • keeps the top daily articles up to the configured daily limit
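  • converts article tone into a daily News Sentiment Index, NSI = (positive − negative) / (positive + negative), where positive and negative are the counts of articles with tone above and below zero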
  • computes a 7-day moving average
  • saves the result as a per-topic CSV
In [7]:
def _build_nsi(accumulator, tid, daily_limit):
    """Build the daily NSI table for one topic from the accumulated article rows.

    For every day the rows are reduced to articles with a parseable tone,
    ranked by score (descending) and timestamp (newest first), de-duplicated
    by URL and then by title, and cut to ``daily_limit`` rows. The kept rows
    are aggregated per day and

        nsi = (positive_count - negative_count) / (positive_count + negative_count)

    with ``nsi = 0.0`` when no article has a non-zero tone.

    Parameters
    ----------
    accumulator : dict
        ``accumulator[topic_id][day_key]`` -> list of row dicts carrying at
        least ``day_key``, ``use_ts``, ``url_norm``, ``title_norm``,
        ``V2Tone`` and ``score``.
    tid : str
        Topic id to build; an unknown topic yields an empty frame.
    daily_limit : int
        Maximum number of articles kept per day.

    Returns
    -------
    pandas.DataFrame
        One row per day, sorted by ``day_key``, with the columns ``day_key``,
        ``article_count``, ``positive_count``, ``negative_count``,
        ``mean_tone``, ``median_tone``, ``nsi``, ``nsi_7d``, ``mean_tone_7d``
        and ``topic_id``; an empty frame when there is nothing to aggregate.
    """
    day_frames = []
    for rows in accumulator.get(tid, {}).values():
        if not rows:
            continue
        df = pd.DataFrame(rows)
        # Parse tone before applying the daily limit so that articles without
        # a usable tone do not take up slots of the top-N selection.
        df["tone"] = df["V2Tone"].map(_tone_first)
        df = df[df["tone"].notna()].copy()
        if df.empty:
            continue
        # url_norm and title_norm already lowercased in _process_tile
        df["use_dt_sort"] = pd.to_datetime(df["use_ts"], errors="coerce", utc=True)
        # Rows arrive in thread-completion order, which varies between runs.
        # Break score/timestamp ties on url and title so the de-duplication
        # and the top-N cut pick the same articles on every run.
        df = df.sort_values(
            ["score", "use_dt_sort", "url_norm", "title_norm"],
            ascending=[False, False, True, True],
            kind="mergesort",
        )
        df = df.drop_duplicates(subset=["url_norm"], keep="first")
        df = df.drop_duplicates(subset=["title_norm"], keep="first")
        day_frames.append(df.head(daily_limit))

    if not day_frames:
        return pd.DataFrame()

    df = pd.concat(day_frames, ignore_index=True)

    daily = (
        df.groupby("day_key", as_index=False)
        .agg(
            article_count=("tone", "size"),
            positive_count=("tone", lambda s: int((s > 0).sum())),
            negative_count=("tone", lambda s: int((s < 0).sum())),
            mean_tone=("tone", "mean"),
            median_tone=("tone", "median"),
        )
        .sort_values("day_key")
        .reset_index(drop=True)
    )

    denom = daily["positive_count"] + daily["negative_count"]
    daily["nsi"] = 0.0
    valid = denom > 0
    daily.loc[valid, "nsi"] = (
        (daily.loc[valid, "positive_count"] - daily.loc[valid, "negative_count"]) / denom[valid]
    )

    # Use a 7-calendar-day window instead of a 7-row window: both agree when
    # every day is present, but a row window would silently reach further
    # back in time whenever a day has no qualifying articles.
    day_index = pd.DatetimeIndex(pd.to_datetime(daily["day_key"], format="%Y-%m-%d"))

    def _calendar_7d_mean(column):
        series = pd.Series(daily[column].to_numpy(), index=day_index)
        return series.rolling("7D", min_periods=1).mean().to_numpy()

    daily["nsi_7d"]       = _calendar_7d_mean("nsi")
    daily["mean_tone_7d"] = _calendar_7d_mean("mean_tone")
    daily["topic_id"]     = tid
    return daily
In [8]:
# Build the daily NSI frame for every topic, write one CSV per topic and one
# combined CSV, and collect a small summary table for display.
nsi_frames = {}
save_rows = []

for tid in TOPICS:
    df = _build_nsi(accumulator, tid, DAILY_LIMIT)
    nsi_frames[tid] = df

    if not df.empty:
        path = Path(OUTPUT_DIR) / f"nsi_{tid}.csv"
        df.to_csv(path, index=False)

        save_rows.append({
            "Topic": tid,
            "Days": len(df),
            "Avg articles": round(df["article_count"].mean(), 1),
            "Avg NSI": round(df["nsi"].mean(), 4),
            "Latest NSI": round(df["nsi"].iloc[-1], 4),
            "Path": str(path),
        })
    else:
        save_rows.append({
            "Topic": tid,
            "Days": 0,
            "Avg articles": None,
            "Avg NSI": None,
            "Latest NSI": None,
            "Path": "no data",
        })

non_empty_frames = [
    frame for frame in nsi_frames.values()
    if frame is not None and not frame.empty
]
combined_path = Path(OUTPUT_DIR) / "nsi_combined.csv"
if non_empty_frames:
    combined = pd.concat(non_empty_frames, ignore_index=True)
    combined.to_csv(combined_path, index=False)
    combined_path_label = str(combined_path)
else:
    # pd.concat raises ValueError on an empty list; when no topic produced
    # data, keep an empty frame and skip writing the combined file.
    combined = pd.DataFrame()
    combined_path_label = "no data"

save_summary = pd.DataFrame(save_rows)

print("Saved topic outputs")
display(save_summary)

combined_summary = pd.DataFrame([
    {"Metric": "Combined output path", "Value": combined_path_label},
    {"Metric": "Combined rows", "Value": f"{len(combined):,}"}
])

display(combined_summary)
Saved topic outputs
Topic Days Avg articles Avg NSI Latest NSI Path
0 oil 18 100.0 -0.8451 -0.8571 nsi_output/nsi_oil.csv
1 crypto 18 100.0 -0.0805 -0.0909 nsi_output/nsi_crypto.csv
2 cybersecurity 18 100.0 -0.7007 -0.7113 nsi_output/nsi_cybersecurity.csv
3 semiconductors 18 84.8 0.3533 0.5824 nsi_output/nsi_semiconductors.csv
Metric Value
0 Combined output path nsi_output/nsi_combined.csv
1 Combined rows 72

Results

For each topic, this section shows a short snapshot table followed by a chart with:

  • daily NSI (bars)
  • 7-day moving average of NSI (line)

A compact summary table across topics is also displayed at the end.

In [9]:
# Colours shared by the charts below: "bar" and "line" colour the daily bars
# and the moving-average line, "border" the zero line and axes edges; the
# remaining entries feed the matplotlib defaults set underneath.
PALETTE = {
    "bar": "#c8cdd6",
    "line": "#2563eb",
    "bg": "#ffffff",
    "grid": "#eef0f7",
    "text": "#0f172a",
    "text_light": "#64748b",
    "border": "#e2e8f0",
}

# Global matplotlib style for every figure created after this cell:
# light grid, muted tick colours, no top/right spines.
plt.rcParams.update({
    "font.family": "sans-serif",
    "font.sans-serif": ["DejaVu Sans", "Liberation Sans", "Arial"],
    "font.size": 11,
    "axes.facecolor": PALETTE["bg"],
    "figure.facecolor": PALETTE["bg"],
    "axes.edgecolor": PALETTE["border"],
    "axes.linewidth": 0.6,
    "axes.grid": True,
    "grid.color": PALETTE["grid"],
    "grid.linewidth": 0.5,
    "xtick.color": PALETTE["text_light"],
    "ytick.color": PALETTE["text_light"],
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
    "axes.spines.top": False,
    "axes.spines.right": False,
})
In [10]:
import matplotlib.dates as mdates

def _plot_topic_nsi(tid, df):
    """Draw one topic's daily NSI as bars with the 7-day average as a line.

    Parameters
    ----------
    tid : str
        Topic id, used (capitalised) in the chart title.
    df : pandas.DataFrame
        Daily frame with the columns ``day_key``, ``nsi`` and ``nsi_7d``.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    day_axis = pd.to_datetime(df["day_key"])

    fig, ax = plt.subplots(figsize=(8, 4.5), dpi=300)

    # Daily values as bars, smoothed series on top, zero reference line last.
    ax.bar(day_axis, df["nsi"], color=PALETTE["bar"], width=0.8, alpha=0.85, label="Daily NSI")
    ax.plot(day_axis, df["nsi_7d"], color=PALETTE["line"], linewidth=1.8, label="7-day MA")
    ax.axhline(0, color=PALETTE["border"], linewidth=0.8)

    ax.set_ylabel("NSI")
    ax.set_title(f"{tid.capitalize()} news sentiment index")

    # Two-decimal y ticks; a handful of "Mon DD" date ticks on the x axis.
    ax.yaxis.set_major_formatter(mticker.FormatStrFormatter("%.2f"))
    ax.xaxis.set_major_locator(mdates.AutoDateLocator(minticks=4, maxticks=6))
    ax.xaxis.set_major_formatter(mdates.DateFormatter("%b %d"))

    plt.xticks(rotation=20, ha="right")
    ax.legend(fontsize=9)
    ax.tick_params(length=0)
    fig.tight_layout()
    plt.show()


import warnings
# Silence matplotlib UserWarnings so they do not clutter the chart output.
warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib")

# For every topic: show a short snapshot table, then the NSI chart.
for tid in TOPICS:
    df = nsi_frames.get(tid)

    if df is None or df.empty:
        print(f"{tid}: no data to plot")
        continue

    # Headline figures for the topic, formatted as strings for display.
    topic_snapshot = pd.DataFrame([
        {"Metric": "Topic", "Value": tid},
        {"Metric": "Days", "Value": f"{len(df):,}"},
        {"Metric": "Latest NSI", "Value": f"{df['nsi'].iloc[-1]:+.3f}"},
        {"Metric": "Average NSI", "Value": f"{df['nsi'].mean():+.3f}"},
        {"Metric": "Average article count", "Value": f"{df['article_count'].mean():.1f}"},
    ])

    display(topic_snapshot)
    _plot_topic_nsi(tid, df)
Metric Value
0 Topic oil
1 Days 18
2 Latest NSI -0.857
3 Average NSI -0.845
4 Average article count 100.0
No description has been provided for this image
Metric Value
0 Topic crypto
1 Days 18
2 Latest NSI -0.091
3 Average NSI -0.080
4 Average article count 100.0
No description has been provided for this image
Metric Value
0 Topic cybersecurity
1 Days 18
2 Latest NSI -0.711
3 Average NSI -0.701
4 Average article count 100.0
No description has been provided for this image
Metric Value
0 Topic semiconductors
1 Days 18
2 Latest NSI +0.582
3 Average NSI +0.353
4 Average article count 84.8
No description has been provided for this image
In [11]:
# One summary row per topic that produced data; topics without data are skipped.
rows = []
for tid in TOPICS:
    df = nsi_frames.get(tid)
    if df is not None and not df.empty:
        record = {"Topic": tid, "Days": len(df)}
        record["Avg articles"] = round(df["article_count"].mean(), 1)
        record["Avg NSI"] = round(df["nsi"].mean(), 4)
        record["Latest NSI"] = round(df["nsi"].iloc[-1], 4)
        record["Avg tone"] = round(df["mean_tone"].mean(), 3)
        rows.append(record)

summary = pd.DataFrame(rows)

print("Topic summary")
display(summary)
Topic summary
Topic Days Avg articles Avg NSI Latest NSI Avg tone
0 oil 18 100.0 -0.8451 -0.8571 -3.469
1 crypto 18 100.0 -0.0805 -0.0909 -0.755
2 cybersecurity 18 100.0 -0.7007 -0.7113 -3.169
3 semiconductors 18 84.8 0.3533 0.5824 0.721