News Sentiment Index
This notebook builds a daily News Sentiment Index (NSI) from GDELT GKG v2 news coverage for a small set of themes: oil, crypto, cybersecurity, and semiconductors.
The pipeline downloads 15-minute GDELT tiles, filters articles by topic relevance, extracts article-level tone, and aggregates the result into a daily topic-level sentiment series. Each topic is saved as a local CSV file, and the notebook also produces a combined summary table and topic charts.
Topics covered
- oil
- crypto
- cybersecurity
- semiconductors
Outputs
- per-topic CSV files saved locally
- one combined CSV file across all topics
- daily NSI and 7-day moving average plots
Configuration
The pipeline processes all 15-minute tiles between the start and end dates, scores article relevance by topic, and keeps a limited number of top articles per topic per day before computing the daily NSI.
# Date range to process, given as UTC calendar days in YYYY-MM-DD form.
START_DAY_UTC = "2026-03-15"
END_DAY_UTC = "2026-04-01" # inclusive
# Topic ids to build an index for; each one is looked up in the compiled
# topic table, so it must be a key of TOPIC_CONFIGS.
TOPICS = ["oil", "crypto", "cybersecurity", "semiconductors"]
# Size of the thread pool that downloads and parses tiles in parallel.
MAX_WORKERS = 48
DAILY_LIMIT = 100 # max articles per topic per day
# Articles whose page title has fewer whitespace-separated words are dropped.
MIN_TITLE_WORDS = 4
OUTPUT_DIR = "nsi_output" # local directory for CSV files
PROGRESS_EVERY = 96 # print progress every N tiles (~1 day)
Imports
import io
import re
import html
import time
import zipfile
import threading
from pathlib import Path
from datetime import datetime, timedelta, timezone
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from urllib3.util import Retry
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
# Create the output directory up front so the later CSV writes cannot fail on a
# missing path. parents=True lets OUTPUT_DIR be a nested path such as
# "results/nsi"; exist_ok=True makes re-running the cell harmless.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
Topic configurations
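Each topic is defined with several layers of matching logic:
- anchor terms that identify whether an article is relevant at all
- positive terms that contribute to a topic relevance score
- positive phrases that receive additional weight when they appear exactly
- negative terms that subtract from the score to push out look-alike subjects (for example "cooking oil" or "potato chips")
- a minimum score that an article must reach to be kept for the topic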
These configurations are used to decide whether an article belongs to a topic and how strongly it should be ranked relative to other candidate articles from the same day.
# Per-topic matching rules, keyed by topic id.
#
# "anchors"          - an article is considered only if at least one anchor
#                      occurs in its text (compiled into one regex per topic).
# "positive_terms"   - weight added to the score for each term that occurs.
# "positive_phrases" - weight added (multiplied by 1.2 by the scorer) for each
#                      phrase that occurs.
# "negative_terms"   - negative weight added for each term that occurs.
# "min_score"        - articles scoring below this are discarded.
#
# NOTE: all matching is plain case-insensitive substring matching with no word
# boundaries, so a short entry such as "oil", "eth", "sec" or "intel" also
# matches inside longer words.
TOPIC_CONFIGS = {
    "oil": {
        "anchors": [
            "oil", "crude", "brent", "wti", "opec", "opec+", "eia", "iea",
            "spr", "strategic petroleum reserve", "refinery", "refineries",
            "diesel", "gasoline", "distillate", "inventory", "inventories"
        ],
        "positive_terms": {
            "oil": 2.0, "crude": 2.2, "brent": 2.0, "wti": 2.0,
            "opec": 2.4, "opec+": 2.6, "eia": 2.4, "iea": 2.3,
            "inventory": 1.8, "inventories": 1.8, "refinery": 1.5,
            "refineries": 1.5, "spr": 2.0, "sanctions": 1.5,
            "exports": 1.2, "production": 1.6, "output": 1.4,
            "demand": 1.2, "supply": 1.2, "outage": 1.8,
            "shutdown": 1.7, "attack": 1.5, "pipeline": 1.4, "terminal": 1.3
        },
        "positive_phrases": {
            "oil prices": 1.5, "crude oil": 2.0, "eia inventories": 3.5,
            "opec+": 3.0, "strategic petroleum reserve": 3.0,
            "oil demand": 2.2, "oil supply": 2.2
        },
        # Penalise edible / cosmetic uses of the word "oil".
        "negative_terms": {
            "olive": -5.0, "essential oil": -6.0, "fish oil": -6.0,
            "cooking oil": -6.0, "beauty": -3.0, "cosmetic": -3.0
        },
        "min_score": 3.0
    },
    "crypto": {
        "anchors": [
            "crypto", "cryptocurrency", "cryptocurrencies", "bitcoin", "btc",
            "ethereum", "ether", "eth", "stablecoin", "stablecoins", "token",
            "tokens", "blockchain", "binance", "coinbase", "sec crypto",
            "bitcoin etf", "crypto etf"
        ],
        "positive_terms": {
            "crypto": 2.0, "cryptocurrency": 2.2, "cryptocurrencies": 2.2,
            "bitcoin": 2.8, "btc": 2.4, "ethereum": 2.6, "ether": 2.2,
            "eth": 2.0, "stablecoin": 2.4, "stablecoins": 2.4,
            "token": 1.4, "tokens": 1.4, "blockchain": 1.6,
            "binance": 1.8, "coinbase": 1.8, "etf": 1.2,
            "exchange": 1.1, "sec": 1.0, "regulation": 1.2,
            "hack": 1.2, "custody": 1.1
        },
        "positive_phrases": {
            "bitcoin etf": 3.2, "crypto etf": 2.8, "crypto exchange": 2.0,
            "digital asset": 2.0, "digital assets": 2.0,
            "crypto regulation": 2.6, "bitcoin treasury": 2.0
        },
        # Penalise cryptography (as opposed to cryptocurrency) vocabulary.
        "negative_terms": {
            "cryptography": -5.0, "encryption": -2.5,
            "decrypt": -3.0, "cipher": -3.0
        },
        "min_score": 3.0
    },
    "cybersecurity": {
        "anchors": [
            "cyber", "cybersecurity", "cyberattack", "cyberattacks",
            "ransomware", "malware", "data breach", "breach", "breaches",
            "hack", "hacked", "hacking", "ddos", "exploit", "vulnerability",
            "zero-day", "zero day", "phishing", "infosec"
        ],
        "positive_terms": {
            "cyber": 2.0, "cybersecurity": 2.4, "cyberattack": 2.8,
            "cyberattacks": 2.8, "ransomware": 3.0, "malware": 2.5,
            "breach": 2.4, "breaches": 2.4, "hack": 2.0,
            "hacked": 2.0, "hacking": 2.0, "ddos": 2.2,
            "exploit": 1.8, "vulnerability": 1.8, "zero-day": 2.2,
            "phishing": 1.8, "cisa": 1.4, "nsa": 0.8,
            "microsoft": 0.6, "firewall": 1.0
        },
        "positive_phrases": {
            "data breach": 3.2, "critical vulnerability": 2.8,
            "zero day": 3.0, "cyber attack": 3.0, "cyber attacks": 3.0,
            "ransomware attack": 3.2, "security breach": 2.8
        },
        # Penalise the vehicle name and gaming coverage.
        "negative_terms": {
            "cybertruck": -6.0, "video game": -2.5, "gaming": -2.0
        },
        "min_score": 3.0
    },
    "semiconductors": {
        "anchors": [
            "semiconductor", "semiconductors", "chip", "chips", "chipmaker",
            "chipmakers", "foundry", "foundries", "fab", "fabs", "wafer",
            "wafers", "tsmc", "intel", "nvidia", "amd", "asml", "hbm", "dram",
            "memory chip", "memory chips", "lithography"
        ],
        "positive_terms": {
            "semiconductor": 2.8, "semiconductors": 2.8, "chip": 1.8,
            "chips": 1.8, "chipmaker": 2.0, "chipmakers": 2.0,
            "foundry": 2.2, "foundries": 2.2, "fab": 1.8, "fabs": 1.8,
            "wafer": 1.5, "wafers": 1.5, "tsmc": 2.2, "intel": 1.5,
            "nvidia": 1.6, "amd": 1.4, "asml": 1.8, "hbm": 1.8,
            "dram": 1.8, "memory": 1.0, "lithography": 2.0,
            "export controls": 1.8, "capacity": 1.2
        },
        "positive_phrases": {
            "semiconductor industry": 3.0, "memory chip": 2.6,
            "memory chips": 2.6, "chip export": 2.2, "chip exports": 2.2,
            "chip demand": 2.2, "chip supply": 2.2
        },
        # Penalise food and casino senses of "chips".
        "negative_terms": {
            "potato chips": -7.0, "casino chips": -7.0,
            "tortilla chips": -7.0, "fish and chips": -7.0
        },
        "min_score": 3.0
    }
}
# Base URL under which each 15-minute tile is fetched as
# <YYYYMMDDHHMMSS>.gkg.csv.zip.
GDELT_V2_BASE_URL = "http://data.gdeltproject.org/gdeltv2"
# Zero-based positions of the tab-separated columns read from each tile, and
# the names given to them (same order, one name per position).
GKG_USECOLS = [0, 1, 3, 4, 7, 8, 15, 26]
GKG_COL_NAMES = ["GKGRECORDID", "DATE", "SourceCommonName", "DocumentIdentifier",
                 "Themes", "V2Themes", "V2Tone", "Extras"]
# Thread-local storage; _session() keeps one requests.Session per thread here.
THREAD_LOCAL = threading.local()
def _compile_topics(configs):
out = {}
for tid, cfg in configs.items():
anchors = [re.escape(str(x).lower()) for x in cfg["anchors"]]
out[tid] = {
"anchor_re": re.compile(
r"(?:" + "|".join(sorted(anchors, key=len, reverse=True)) + r")",
flags=re.IGNORECASE
),
"phrase_items": [(str(p).lower(), float(w)) for p, w in cfg["positive_phrases"].items()],
"positive_items": [(str(k).lower(), float(v)) for k, v in cfg["positive_terms"].items()],
"negative_items": [(str(k).lower(), float(v)) for k, v in cfg["negative_terms"].items()],
"min_score": float(cfg["min_score"]),
}
return out
# Compile the topic configs once; _score and _process_tile read the compiled
# regexes and weight lists from this module-level table.
PREPARED = _compile_topics(TOPIC_CONFIGS)
def _utc_midnight(day_str):
d = datetime.strptime(day_str, "%Y-%m-%d")
return d.replace(tzinfo=timezone.utc)
def _iter_tiles(start_utc, end_exclusive_utc):
cur = start_utc.replace(minute=(start_utc.minute // 15) * 15, second=0, microsecond=0)
while cur < end_exclusive_utc:
yield cur
cur += timedelta(minutes=15)
def _session():
    """Return the calling thread's ``requests.Session``, creating it on first use.

    A new session gets an ``HTTPAdapter`` mounted for both ``http://`` and
    ``https://`` that retries GET requests up to three times (backoff factor
    1) on status 429, 500, 502, 503 and 504. The session is cached on
    ``THREAD_LOCAL`` so later calls from the same thread reuse it.
    """
    session = getattr(THREAD_LOCAL, "session", None)
    if not session:
        session = requests.Session()
        retry_policy = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["GET"],
        )
        for scheme in ("http://", "https://"):
            session.mount(scheme, HTTPAdapter(max_retries=retry_policy))
        THREAD_LOCAL.session = session
    return session
def _norm_domain(val):
s = str(val or "").strip().lower()
if "://" in s:
try:
s = urlparse(s).netloc
except Exception:
pass
if ":" in s:
s = s.split(":", 1)[0]
return s.lstrip("www.").strip(".")
def _extract_tag(extras, tag):
m = re.search(fr"<{tag}>(.*?)</{tag}>", str(extras or ""), re.DOTALL)
return m.group(1).strip() if m else ""
def _parse_ts(raw):
digits = re.sub(r"\D", "", str(raw or ""))
if len(digits) >= 14:
try:
return pd.to_datetime(digits[:14], format="%Y%m%d%H%M%S", utc=True)
except Exception:
pass
return pd.NaT
def _tone_first(val):
try:
return float(str(val or "").split(",", 1)[0])
except Exception:
return float("nan")
def _score(text, topic_id):
    """Score *text* for relevance to *topic_id*.

    Returns ``None`` when no anchor of the topic occurs in the lower-cased
    text, or when the final score is below the topic's ``min_score``.
    Otherwise returns the score as a float: the sum of the weights of all
    positive terms present, 1.2 times the weights of all phrases present, the
    (negative) weights of all negative terms present, plus 0.3 if the text
    contains any digit. All checks are substring tests.
    """
    rules = PREPARED[topic_id]
    lowered = str(text or "").lower()
    if rules["anchor_re"].search(lowered) is None:
        return None

    term_total = sum(weight for term, weight in rules["positive_items"] if term in lowered)
    phrase_total = sum(weight * 1.2 for phrase, weight in rules["phrase_items"] if phrase in lowered)
    penalty_total = sum(weight for term, weight in rules["negative_items"] if term in lowered)
    total = term_total + phrase_total + penalty_total

    # Small bonus for text that contains at least one digit.
    if any(ch.isdigit() for ch in lowered):
        total += 0.3

    passes = total >= rules["min_score"]
    return float(total) if passes else None
def _process_tile(ts_utc, start_utc, end_excl_utc, topics):
    """Download one 15-minute GKG tile and return its scored articles per topic.

    Args:
        ts_utc: Tile timestamp; formatted as ``YYYYMMDDHHMMSS`` to build the URL.
        start_utc: Inclusive lower bound for an article's effective timestamp.
        end_excl_utc: Exclusive upper bound for an article's effective timestamp.
        topics: Iterable of topic ids (keys of ``PREPARED``).

    Returns:
        Dict mapping topic id to a list of record dicts with the keys
        ``day_key``, ``use_ts``, ``source``, ``url_norm``, ``title_norm``,
        ``title``, ``V2Tone`` and ``score``. Topics with no qualifying article
        are absent. An empty dict is returned when the download gives a
        non-200 status, when downloading/unzipping/parsing raises, or when no
        row survives the filters.
    """
    url = f"{GDELT_V2_BASE_URL}/{ts_utc.strftime('%Y%m%d%H%M%S')}.gkg.csv.zip"
    try:
        resp = _session().get(url, timeout=20)
        if resp.status_code != 200:
            return {}
        # The archive is read from memory; only its first member is parsed.
        with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
            with zf.open(zf.namelist()[0]) as f:
                raw = pd.read_csv(f, sep="\t", header=None, names=GKG_COL_NAMES,
                                  usecols=GKG_USECOLS, low_memory=False, encoding="latin-1")
    except Exception:
        # NOTE(review): every failure here (network, bad zip, parse error) is
        # reported to the caller as an empty tile, indistinguishable from a
        # tile that simply had no matching articles.
        return {}
    if raw is None or raw.empty:
        return {}
    df = raw.copy()
    # DATE is parsed as YYYYMMDDHHMMSS; anything after a "." is discarded first.
    df["gdelt_ts"] = pd.to_datetime(
        df["DATE"].fillna("").astype(str).str.split(".").str[0],
        format="%Y%m%d%H%M%S", errors="coerce", utc=True
    )
    df = df.dropna(subset=["gdelt_ts"])
    if df.empty:
        return {}
    df["Extras"] = df["Extras"].fillna("").astype(str)

    def _extract_both(extras):
        # Pull the page title (HTML-unescaped) and the raw publication
        # timestamp out of the Extras field.
        title = html.unescape(_extract_tag(extras, "PAGE_TITLE")).strip()
        pub = _extract_tag(extras, "PAGE_PRECISEPUBTIMESTAMP")
        return title, pub

    extracted = df["Extras"].map(_extract_both)
    df["title"] = extracted.map(lambda x: x[0])
    df["pub_raw"] = extracted.map(lambda x: x[1])
    # Keep only rows with a title of at least MIN_TITLE_WORDS words.
    df = df[df["title"].str.len() > 0].copy()
    df = df[df["title"].str.split().str.len() >= MIN_TITLE_WORDS].copy()
    if df.empty:
        return {}
    # Effective timestamp: the publication timestamp when it parses,
    # otherwise the tile's own DATE value.
    df["pub_ts"] = df["pub_raw"].map(_parse_ts)
    df["use_ts"] = df["pub_ts"].where(df["pub_ts"].notna(), df["gdelt_ts"])
    df["use_ts"] = pd.to_datetime(df["use_ts"], errors="coerce", utc=True)
    df = df[df["use_ts"].notna()].copy()
    # Restrict to the requested window [start_utc, end_excl_utc).
    mask = (df["use_ts"] >= start_utc) & (df["use_ts"] < end_excl_utc)
    df = df[mask].copy()
    if df.empty:
        return {}
    df["day_key"] = df["use_ts"].dt.strftime("%Y-%m-%d")
    df["source"] = df["SourceCommonName"].fillna("").astype(str).map(_norm_domain)
    df["url_norm"] = df["DocumentIdentifier"].fillna("").astype(str).str.strip().str.lower()
    df["title_norm"] = df["title"].str.strip().str.lower()
    df["V2Tone"] = df["V2Tone"].fillna("").astype(str)
    # Text that topics are matched against: lower-cased title plus both theme
    # columns.
    df["combined"] = (
        df["title_norm"] + " " +
        df["Themes"].fillna("").astype(str).str.lower() + " " +
        df["V2Themes"].fillna("").astype(str).str.lower()
    )
    result = {}
    for tid in topics:
        # Cheap vectorised anchor pre-filter before the per-row scoring.
        anchor_mask = df["combined"].str.contains(PREPARED[tid]["anchor_re"], na=False)
        sub = df[anchor_mask].copy()
        if sub.empty:
            continue
        sub["score"] = sub["combined"].map(lambda x: _score(x, tid))
        # _score returns None for rows below the topic's minimum score.
        sub = sub[sub["score"].notna()].copy()
        if sub.empty:
            continue
        result[tid] = sub[[
            "day_key", "use_ts", "source",
            "url_norm", "title_norm", "title",
            "V2Tone", "score"
        ]].to_dict("records")
    return result
Run pipeline
This section runs the full tile collection process across the requested period. Progress is tracked while tiles are processed in parallel, and all qualifying article rows are accumulated by topic and day.
The output of this section is an in-memory accumulator that is later transformed into daily NSI frames.
# If you are running this in a personal environment, consider switching the
# machine's power settings to performance mode; that may make the threaded
# data collection faster.
start_utc = _utc_midnight(START_DAY_UTC)
# END_DAY_UTC is inclusive, so the exclusive upper bound is the next midnight.
end_utc = _utc_midnight(END_DAY_UTC) + timedelta(days=1)
tiles = list(_iter_tiles(start_utc, end_utc))
# accumulator[topic_id][day_key] -> list of article record dicts.
accumulator = {tid: {} for tid in TOPICS}
print(f"Period: {START_DAY_UTC} to {END_DAY_UTC}")
print(f"Tiles: {len(tiles):,}")
print(f"Workers: {MAX_WORKERS}")
t0 = time.time()
completed = 0
ok = 0      # tiles that produced at least one qualifying article
miss = 0    # tiles that returned nothing
errors = 0  # tiles whose worker raised
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
    futures = {ex.submit(_process_tile, ts, start_utc, end_utc, TOPICS): ts for ts in tiles}
    for future in as_completed(futures):
        completed += 1
        try:
            topic_rows = future.result()
        except Exception as exc:
            # One bad tile must not abort a multi-minute run and discard
            # everything collected so far: count it, report it, move on.
            errors += 1
            print(f"Tile {futures[future]:%Y-%m-%d %H:%M} failed: {exc!r}")
        else:
            if topic_rows:
                ok += 1
                for tid, rows in topic_rows.items():
                    for row in rows:
                        accumulator[tid].setdefault(row["day_key"], []).append(row)
            else:
                miss += 1
        if completed % PROGRESS_EVERY == 0 or completed == len(tiles):
            elapsed = time.time() - t0
            pct = completed / len(tiles) * 100
            print(
                f"Completed: {completed:>5,} | {len(tiles):,} "
                f"({pct:5.1f}%) ok: {ok:,} empty/miss: {miss:,} elapsed: {elapsed:,.1f}s"
            )
run_summary = pd.DataFrame([
    {"Metric": "Start day", "Value": START_DAY_UTC},
    {"Metric": "End day", "Value": END_DAY_UTC},
    {"Metric": "Tiles requested", "Value": f"{len(tiles):,}"},
    {"Metric": "Tiles with data", "Value": f"{ok:,}"},
    {"Metric": "Tiles empty or missing", "Value": f"{miss:,}"},
    {"Metric": "Tiles failed", "Value": f"{errors:,}"},
    {"Metric": "Elapsed seconds", "Value": f"{time.time() - t0:,.1f}"},
])
print("Pipeline run summary")
display(run_summary)
Period: 2026-03-15 to 2026-04-01 Tiles: 1,728 Workers: 48 Completed: 96 | 1,728 ( 5.6%) ok: 96 empty/miss: 0 elapsed: 17.4s Completed: 192 | 1,728 ( 11.1%) ok: 192 empty/miss: 0 elapsed: 41.6s Completed: 288 | 1,728 ( 16.7%) ok: 288 empty/miss: 0 elapsed: 67.7s Completed: 384 | 1,728 ( 22.2%) ok: 384 empty/miss: 0 elapsed: 93.5s Completed: 480 | 1,728 ( 27.8%) ok: 480 empty/miss: 0 elapsed: 119.5s Completed: 576 | 1,728 ( 33.3%) ok: 576 empty/miss: 0 elapsed: 142.3s Completed: 672 | 1,728 ( 38.9%) ok: 672 empty/miss: 0 elapsed: 157.1s Completed: 768 | 1,728 ( 44.4%) ok: 768 empty/miss: 0 elapsed: 171.0s Completed: 864 | 1,728 ( 50.0%) ok: 864 empty/miss: 0 elapsed: 195.1s Completed: 960 | 1,728 ( 55.6%) ok: 960 empty/miss: 0 elapsed: 220.5s Completed: 1,056 | 1,728 ( 61.1%) ok: 1,056 empty/miss: 0 elapsed: 245.7s Completed: 1,152 | 1,728 ( 66.7%) ok: 1,152 empty/miss: 0 elapsed: 271.7s Completed: 1,248 | 1,728 ( 72.2%) ok: 1,248 empty/miss: 0 elapsed: 295.2s Completed: 1,344 | 1,728 ( 77.8%) ok: 1,344 empty/miss: 0 elapsed: 309.8s Completed: 1,440 | 1,728 ( 83.3%) ok: 1,440 empty/miss: 0 elapsed: 324.2s Completed: 1,536 | 1,728 ( 88.9%) ok: 1,536 empty/miss: 0 elapsed: 347.9s Completed: 1,632 | 1,728 ( 94.4%) ok: 1,632 empty/miss: 0 elapsed: 373.4s Completed: 1,728 | 1,728 (100.0%) ok: 1,728 empty/miss: 0 elapsed: 395.8s Pipeline run summary
| Metric | Value | |
|---|---|---|
| 0 | Start day | 2026-03-15 |
| 1 | End day | 2026-04-01 |
| 2 | Tiles requested | 1,728 |
| 3 | Tiles with data | 1,728 |
| 4 | Tiles empty or missing | 0 |
| 5 | Elapsed seconds | 395.8 |
Build daily NSI and save to CSV
This stage converts the accumulated article rows into daily topic-level NSI data.
For each topic and day, the pipeline:
- ranks articles by relevance score and timestamp
- removes duplicate URLs and duplicate titles
- keeps the top daily articles up to the configured daily limit
- converts article tone into the daily News Sentiment Index: (positive articles − negative articles) / (positive articles + negative articles)
- computes a 7-day moving average
- saves the result as a per-topic CSV
def _build_nsi(accumulator, tid, daily_limit):
    """Build the daily NSI table for one topic from the accumulated rows.

    For each day the rows are sorted by score and then timestamp (both
    descending), de-duplicated first by URL and then by title, and cut to the
    top *daily_limit* rows. Rows whose tone cannot be parsed are dropped
    after that cut, so a day can end up with fewer than *daily_limit* rows.

    Args:
        accumulator: ``{topic_id: {day_key: [record, ...]}}``.
        tid: Topic id to build the table for.
        daily_limit: Maximum number of articles kept per day.

    Returns:
        DataFrame sorted by ``day_key`` with the columns ``day_key``,
        ``article_count``, ``positive_count``, ``negative_count``,
        ``mean_tone``, ``median_tone``, ``nsi``, ``nsi_7d``, ``mean_tone_7d``
        and ``topic_id``; an empty DataFrame when the topic has no usable
        rows. ``nsi`` is ``(positive - negative) / (positive + negative)``,
        or 0.0 for a day with no positive or negative article.
    """
    day_frames = []
    for rows in accumulator[tid].values():
        if not rows:
            continue
        df = pd.DataFrame(rows)
        # url_norm and title_norm are already lower-cased upstream.
        df["use_dt_sort"] = pd.to_datetime(df["use_ts"], errors="coerce", utc=True)
        df = df.sort_values(["score", "use_dt_sort"], ascending=[False, False])
        df = df.drop_duplicates(subset=["url_norm"], keep="first")
        df = df.drop_duplicates(subset=["title_norm"], keep="first")
        day_frames.append(df.head(daily_limit))
    if not day_frames:
        return pd.DataFrame()

    df = pd.concat(day_frames, ignore_index=True)
    df["tone"] = df["V2Tone"].map(_tone_first)
    df = df[df["tone"].notna()].copy()
    if df.empty:
        # Every tone was unparseable: report "no data" like the case above.
        return pd.DataFrame()

    daily = (
        df.groupby("day_key", as_index=False)
        .agg(
            article_count=("tone", "size"),
            positive_count=("tone", lambda s: int((s > 0).sum())),
            negative_count=("tone", lambda s: int((s < 0).sum())),
            mean_tone=("tone", "mean"),
            median_tone=("tone", "median"),
        )
        .sort_values("day_key")
        .reset_index(drop=True)
    )

    denom = daily["positive_count"] + daily["negative_count"]
    daily["nsi"] = 0.0
    valid = denom > 0
    daily.loc[valid, "nsi"] = (
        (daily.loc[valid, "positive_count"] - daily.loc[valid, "negative_count"]) / denom[valid]
    )

    # Average over the last 7 calendar days, not the last 7 rows: a day with
    # no articles has no row, and a row-count window would then reach further
    # back than a week. With no missing days both windows select the same rows.
    day_index = pd.DatetimeIndex(pd.to_datetime(daily["day_key"], format="%Y-%m-%d"))
    for source_col, target_col in (("nsi", "nsi_7d"), ("mean_tone", "mean_tone_7d")):
        by_day = pd.Series(daily[source_col].to_numpy(), index=day_index)
        daily[target_col] = by_day.rolling("7D", min_periods=1).mean().to_numpy()

    daily["topic_id"] = tid
    return daily
# Build the daily NSI table for every topic, write one CSV per topic that has
# data, and collect one summary row per topic.
nsi_frames = {}
save_rows = []
for tid in TOPICS:
    df = _build_nsi(accumulator, tid, DAILY_LIMIT)
    nsi_frames[tid] = df
    if df.empty:
        save_rows.append({
            "Topic": tid,
            "Days": 0,
            "Avg articles": None,
            "Avg NSI": None,
            "Latest NSI": None,
            "Path": "no data",
        })
        continue
    path = Path(OUTPUT_DIR) / f"nsi_{tid}.csv"
    df.to_csv(path, index=False)
    save_rows.append({
        "Topic": tid,
        "Days": len(df),
        "Avg articles": round(df["article_count"].mean(), 1),
        "Avg NSI": round(df["nsi"].mean(), 4),
        "Latest NSI": round(df["nsi"].iloc[-1], 4),
        "Path": str(path),
    })

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when no topic produced any data.
non_empty_frames = [frame for frame in nsi_frames.values() if not frame.empty]
if non_empty_frames:
    combined = pd.concat(non_empty_frames, ignore_index=True)
else:
    combined = pd.DataFrame()
combined_path = Path(OUTPUT_DIR) / "nsi_combined.csv"
combined.to_csv(combined_path, index=False)

save_summary = pd.DataFrame(save_rows)
print("Saved topic outputs")
display(save_summary)
combined_summary = pd.DataFrame([
    {"Metric": "Combined output path", "Value": str(combined_path)},
    {"Metric": "Combined rows", "Value": f"{len(combined):,}"}
])
display(combined_summary)
Saved topic outputs
| Topic | Days | Avg articles | Avg NSI | Latest NSI | Path | |
|---|---|---|---|---|---|---|
| 0 | oil | 18 | 100.0 | -0.8451 | -0.8571 | nsi_output/nsi_oil.csv |
| 1 | crypto | 18 | 100.0 | -0.0805 | -0.0909 | nsi_output/nsi_crypto.csv |
| 2 | cybersecurity | 18 | 100.0 | -0.7007 | -0.7113 | nsi_output/nsi_cybersecurity.csv |
| 3 | semiconductors | 18 | 84.8 | 0.3533 | 0.5824 | nsi_output/nsi_semiconductors.csv |
| Metric | Value | |
|---|---|---|
| 0 | Combined output path | nsi_output/nsi_combined.csv |
| 1 | Combined rows | 72 |
Results
For each topic, a short snapshot table is displayed, followed by a chart showing:
- daily NSI
- 7-day moving average of NSI
A compact summary table across topics is also displayed at the end.
# Colours shared by the charts below.
PALETTE = {
    "bar": "#c8cdd6",         # daily NSI bars
    "line": "#2563eb",        # 7-day moving-average line
    "bg": "#ffffff",          # axes and figure background
    "grid": "#eef0f7",        # grid lines
    "text": "#0f172a",        # not referenced by the plotting code shown here
    "text_light": "#64748b",  # tick labels
    "border": "#e2e8f0",      # axes edge and the zero line
}
# Global matplotlib defaults applied to every figure created after this point.
plt.rcParams.update({
    "font.family": "sans-serif",
    "font.sans-serif": ["DejaVu Sans", "Liberation Sans", "Arial"],
    "font.size": 11,
    "axes.facecolor": PALETTE["bg"],
    "figure.facecolor": PALETTE["bg"],
    "axes.edgecolor": PALETTE["border"],
    "axes.linewidth": 0.6,
    "axes.grid": True,
    "grid.color": PALETTE["grid"],
    "grid.linewidth": 0.5,
    "xtick.color": PALETTE["text_light"],
    "ytick.color": PALETTE["text_light"],
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
    # Hide the top and right spines.
    "axes.spines.top": False,
    "axes.spines.right": False,
})
import matplotlib.dates as mdates
def _plot_topic_nsi(tid, df):
    """Show a chart of one topic's daily NSI with its 7-day moving average.

    Args:
        tid: Topic id; its capitalised form is used in the chart title.
        df: Daily NSI frame with the columns ``day_key``, ``nsi`` and
            ``nsi_7d``.
    """
    days = pd.to_datetime(df["day_key"])
    fig, ax = plt.subplots(figsize=(8, 4.5), dpi=300)

    # Bars show the daily value; the line shows the smoothed series.
    ax.bar(days, df["nsi"], color=PALETTE["bar"], width=0.8, alpha=0.85, label="Daily NSI")
    ax.plot(days, df["nsi_7d"], color=PALETTE["line"], linewidth=1.8, label="7-day MA")
    ax.axhline(0, color=PALETTE["border"], linewidth=0.8)

    ax.set_ylabel("NSI")
    ax.set_title(f"{tid.capitalize()} news sentiment index")

    ax.yaxis.set_major_formatter(mticker.FormatStrFormatter("%.2f"))
    ax.xaxis.set_major_locator(mdates.AutoDateLocator(minticks=4, maxticks=6))
    ax.xaxis.set_major_formatter(mdates.DateFormatter("%b %d"))
    plt.xticks(rotation=20, ha="right")

    ax.legend(fontsize=9)
    ax.tick_params(length=0)
    fig.tight_layout()
    plt.show()
import warnings
# Keep matplotlib UserWarnings out of the cell output.
warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib")
# For every topic with data: show a small snapshot table, then its chart.
for tid in TOPICS:
    df = nsi_frames.get(tid)
    if df is not None and not df.empty:
        topic_snapshot = pd.DataFrame(
            [
                ("Topic", tid),
                ("Days", f"{len(df):,}"),
                ("Latest NSI", f"{df['nsi'].iloc[-1]:+.3f}"),
                ("Average NSI", f"{df['nsi'].mean():+.3f}"),
                ("Average article count", f"{df['article_count'].mean():.1f}"),
            ],
            columns=["Metric", "Value"],
        )
        display(topic_snapshot)
        _plot_topic_nsi(tid, df)
    else:
        print(f"{tid}: no data to plot")
| Metric | Value | |
|---|---|---|
| 0 | Topic | oil |
| 1 | Days | 18 |
| 2 | Latest NSI | -0.857 |
| 3 | Average NSI | -0.845 |
| 4 | Average article count | 100.0 |
| Metric | Value | |
|---|---|---|
| 0 | Topic | crypto |
| 1 | Days | 18 |
| 2 | Latest NSI | -0.091 |
| 3 | Average NSI | -0.080 |
| 4 | Average article count | 100.0 |
| Metric | Value | |
|---|---|---|
| 0 | Topic | cybersecurity |
| 1 | Days | 18 |
| 2 | Latest NSI | -0.711 |
| 3 | Average NSI | -0.701 |
| 4 | Average article count | 100.0 |
| Metric | Value | |
|---|---|---|
| 0 | Topic | semiconductors |
| 1 | Days | 18 |
| 2 | Latest NSI | +0.582 |
| 3 | Average NSI | +0.353 |
| 4 | Average article count | 84.8 |
# One summary row per topic that has data; topics without data are left out.
rows = []
for tid in TOPICS:
    df = nsi_frames.get(tid)
    if df is not None and not df.empty:
        rows.append(
            {
                "Topic": tid,
                "Days": len(df),
                "Avg articles": round(df["article_count"].mean(), 1),
                "Avg NSI": round(df["nsi"].mean(), 4),
                "Latest NSI": round(df["nsi"].iloc[-1], 4),
                "Avg tone": round(df["mean_tone"].mean(), 3),
            }
        )
summary = pd.DataFrame(rows)
print("Topic summary")
display(summary)
Topic summary
| Topic | Days | Avg articles | Avg NSI | Latest NSI | Avg tone | |
|---|---|---|---|---|---|---|
| 0 | oil | 18 | 100.0 | -0.8451 | -0.8571 | -3.469 |
| 1 | crypto | 18 | 100.0 | -0.0805 | -0.0909 | -0.755 |
| 2 | cybersecurity | 18 | 100.0 | -0.7007 | -0.7113 | -3.169 |
| 3 | semiconductors | 18 | 84.8 | 0.3533 | 0.5824 | 0.721 |