Bootstrap Resampling
A pedagogical walkthrough of bootstrap resampling using real S&P 500 data.
The analysis proceeds in five steps:
- Historical data: SPY price history and monthly returns
- Return distribution: histogram of monthly returns and summary statistics
- Simulation: 1,000 bootstrap paths over 5 years
- Uncertainty band: 90% confidence band and how it widens over time
- Outcome distribution: terminal return histogram and percentile outcomes
| Parameter | Value |
|---|---|
| Ticker | SPY (S&P 500 ETF) |
| Data source | Tiingo daily adjusted prices |
| Simulation paths | 1,000 bootstrap paths |
| Horizon | 60 months (5 years) |
| Bootstrap seed | 42 (reproducible) |
Requires: TIINGO_API_KEY in /.env.
import os
import numpy as np
import pandas as pd
import requests
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as mticker
from pathlib import Path
from dotenv import load_dotenv
#pip install python-dotenv
from IPython.display import display, HTML
FL_BLUE = '#2563eb'
FL_SLATE = '#64748b'
FL_AMBER = '#f59e0b'
FL_GREEN = '#16a34a'
FL_RED = '#ef4444'
FL_BG = '#ffffff'
FL_GRID = '#e2e8f0'
FL_TEXT = '#0f172a'
FL_TEXT2 = '#334155'
FL_BORDER = '#e2e8f0'
matplotlib.rcParams.update({
'figure.facecolor': FL_BG,
'axes.facecolor': FL_BG,
'axes.edgecolor': FL_BORDER,
'axes.labelcolor': FL_TEXT2,
'axes.spines.top': False,
'axes.spines.right': False,
'axes.grid': True,
'grid.color': FL_GRID,
'grid.linewidth': 0.7,
'xtick.color': FL_TEXT2,
'ytick.color': FL_TEXT2,
'xtick.labelsize': 10,
'ytick.labelsize': 10,
'axes.labelsize': 11,
'axes.titlesize': 12,
'axes.titlecolor': FL_TEXT,
'axes.titlepad': 12,
'legend.frameon': False,
'legend.fontsize': 10,
'figure.dpi': 300,
'savefig.bbox': 'tight',
'font.family': 'sans-serif',
'font.sans-serif': ['Inter', 'Helvetica Neue', 'Arial', 'DejaVu Sans'],
})
ENV_PATH = Path.cwd() / '.env'
load_dotenv(ENV_PATH)
TIINGO_API_KEY = os.getenv('TIINGO_API_KEY')
if not TIINGO_API_KEY:
raise RuntimeError(f'TIINGO_API_KEY not found in {ENV_PATH}')
TICKER = 'SPY'
START_DATE = '1993-01-01'
N_SIMS = 1_000
HORIZON_MONTHS = 60
SEED = 42
Helper functions
Tiingo fetch, monthly resampling, bootstrap runner, formatting, and HTML renderers.
def fetch_tiingo(ticker: str, start_date: str) -> pd.DataFrame:
"""Fetch daily adjusted prices from Tiingo."""
resp = requests.get(
f'https://api.tiingo.com/tiingo/daily/{ticker}/prices',
headers={'Authorization': f'Token {TIINGO_API_KEY}', 'Content-Type': 'application/json'},
params={'startDate': start_date, 'resampleFreq': 'daily', 'format': 'json'},
timeout=60,
)
resp.raise_for_status()
raw = pd.DataFrame(resp.json())
if raw.empty:
raise ValueError(f'Tiingo returned no data for {ticker}')
close_col = next((c for c in ['adjClose', 'close', 'adjclose'] if c in raw.columns), None)
if close_col is None:
raise ValueError(f'No close column found. Columns: {list(raw.columns)}')
# Slice to YYYY-MM-DD to avoid tz suffix issues ('1993-01-29T00:00:00.000Z')
df = pd.DataFrame()
df['date'] = pd.to_datetime(raw['date'].str[:10])
df['close'] = pd.to_numeric(raw[close_col].values, errors='coerce')
df = df.set_index('date').sort_index().dropna(subset=['close'])
df['pct_ret'] = df['close'].pct_change()
return df
def daily_to_monthly(df: pd.DataFrame) -> pd.DataFrame:
monthly = df['close'].resample('ME').last().dropna()
monthly_ret = monthly.pct_change().dropna()
return pd.DataFrame({'close': monthly, 'pct_ret': monthly_ret})
def run_bootstrap(monthly_ret: pd.Series, last_price: float) -> tuple:
"""Return (future_idx, p5, p50, p95, sim_paths)."""
start = monthly_ret.index[-1] + pd.offsets.MonthEnd(1)
future_idx = pd.date_range(start=start, periods=HORIZON_MONTHS, freq='ME')
rng = np.random.default_rng(SEED)
sim_rets = rng.choice(monthly_ret.values, size=(HORIZON_MONTHS, N_SIMS), replace=True)
sim_paths = last_price * np.cumprod(1.0 + sim_rets, axis=0)
p5 = np.percentile(sim_paths, 5, axis=1)
p50 = np.percentile(sim_paths, 50, axis=1)
p95 = np.percentile(sim_paths, 95, axis=1)
return future_idx, p5, p50, p95, sim_paths
def fmt_pct(v, d=2, frac=False):
if v is None or (isinstance(v, float) and not np.isfinite(v)): return 'N/A'
return f'{v * 100 if frac else v:.{d}f}%'
def fmt_ratio(v):
if v is None or (isinstance(v, float) and not np.isfinite(v)): return 'N/A'
return f'{v:.2f}'
def render_cards(cards: list) -> str:
items = ''.join(
f'<div style="background:#f8fafc;border:1px solid #e2e8f0;border-radius:8px;'
f'padding:18px 16px;text-align:center;min-width:140px;flex:1 1 140px">'
f'<div style="font-size:20px;font-weight:600;color:#0f172a;'
f'letter-spacing:-0.02em;margin-bottom:6px">{c["value"]}</div>'
f'<div style="font-size:11.5px;color:#64748b;font-weight:500">{c["label"]}</div>'
f'</div>'
for c in cards
)
return f'<div style="display:flex;flex-wrap:wrap;gap:12px;margin:16px 0">{items}</div>'
Data fetch
Daily adjusted prices for SPY are fetched from Tiingo and resampled to month-end closes. The monthly return series is the sampling pool for all bootstrap simulations below.
print(f'Fetching {TICKER} from Tiingo ({START_DATE} --> today)...')
df_daily = fetch_tiingo(TICKER, START_DATE)
df_monthly = daily_to_monthly(df_daily)
monthly_ret = df_monthly['pct_ret'].dropna()
last_price = float(df_monthly['close'].iloc[-1])
start_str = df_daily.index[0].strftime('%Y-%m-%d')
end_str = df_daily.index[-1].strftime('%Y-%m-%d')
n_obs = len(monthly_ret)
print(f'Daily rows : {len(df_daily):,} ({start_str} → {end_str})')
print(f'Monthly rows : {n_obs}')
print(f'Last SPY close: ${last_price:,.2f}')
Fetching SPY from Tiingo (1993-01-01 --> today)... Daily rows : 8,386 (1993-01-29 → 2026-05-22) Monthly rows : 400 Last SPY close: $745.64
Historical data
The S&P 500 has decades of price history across booms, crashes, and recoveries. Rather than sampling prices directly, we transform them into monthly returns using the percentage changes for month to month market moves. This yields a pool of observed returns across different market regimes, which is the raw material for the bootstrap.
prices = df_monthly['close']
plt.figure(figsize=(8, 4.5))
plt.fill_between(prices.index, prices.values, alpha=0.10, color=FL_BLUE)
plt.plot(prices.index, prices.values, color=FL_BLUE, linewidth=1.8)
plt.gca().yaxis.set_major_formatter(
mticker.FuncFormatter(lambda v, _: f'${v:,.0f}')
)
plt.gca().xaxis.set_major_locator(mdates.YearLocator(4))
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
plt.title('S&P 500 price history (adjusted close)')
plt.tick_params(axis='both', which='both', length=0)
plt.tight_layout()
plt.show()
colors = [FL_GREEN if v >= 0 else FL_RED for v in monthly_ret.values]
plt.figure(figsize=(8, 4.5))
plt.bar(
monthly_ret.index,
monthly_ret.values * 100,
color=colors,
alpha=0.75,
width=20
)
plt.axhline(0, color=FL_GRID, linewidth=0.8)
plt.gca().yaxis.set_major_formatter(
mticker.FuncFormatter(lambda v, _: f'{v:.0f}%')
)
plt.gca().xaxis.set_major_locator(mdates.YearLocator(4))
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
plt.title('Monthly returns')
plt.tick_params(axis='both', which='both', length=0)
plt.tight_layout()
plt.show()
print(
f'From {start_str[:4]} to {end_str[:4]}, there are {n_obs} monthly return observations '
f'across different market regimes. This is the sampling pool for the bootstrap.'
)
From 1993 to 2026, there are 400 monthly return observations across different market regimes. This is the sampling pool for the bootstrap.
Return distribution
Stacking all monthly returns into a histogram reveals the shape of the data. Most returns cluster near zero. The tails represent rare but significant months. This distribution is the sampling pool for the bootstrap - we ask what would happen if future returns were random draws from this same pool.
mean_m = float(monthly_ret.mean())
std_m = float(monthly_ret.std(ddof=1))
best_m = float(monthly_ret.max())
worst_m = float(monthly_ret.min())
pct_positive = float((monthly_ret > 0).mean())
display(HTML(render_cards([
{'label': 'Mean monthly return', 'value': fmt_pct(mean_m * 100, 2)},
{'label': 'Std deviation', 'value': fmt_pct(std_m * 100, 2)},
{'label': 'Best month', 'value': fmt_pct(best_m * 100, 1)},
{'label': 'Worst month', 'value': fmt_pct(worst_m * 100, 1)},
{'label': 'Positive months', 'value': fmt_pct(pct_positive * 100, 0)},
{'label': 'Observations', 'value': str(n_obs)},
])))
plt.figure(figsize=(8, 4.5))
plt.hist(
monthly_ret.values * 100,
bins=40,
color=FL_BLUE,
alpha=0.75,
edgecolor='none'
)
plt.axvline(0, color=FL_GRID, linewidth=0.8)
plt.axvline(
mean_m * 100,
color=FL_AMBER,
linewidth=1.2,
linestyle='--',
label=f'Mean ({fmt_pct(mean_m * 100, 2)})'
)
plt.axvline(
worst_m * 100,
color=FL_RED,
linewidth=1.0,
linestyle=':',
label=f'Worst ({fmt_pct(worst_m * 100, 1)})'
)
plt.axvline(
best_m * 100,
color=FL_GREEN,
linewidth=1.0,
linestyle=':',
label=f'Best ({fmt_pct(best_m * 100, 1)})'
)
plt.gca().xaxis.set_major_formatter(
mticker.FuncFormatter(lambda v, _: f'{v:.0f}%')
)
plt.xlabel('Monthly return')
plt.ylabel('Frequency')
plt.title('Distribution of monthly returns for the bootstrap sampling pool')
plt.legend()
plt.tick_params(axis='both', which='both', length=0)
plt.tight_layout()
plt.show()
Simulation
To build one simulated future, we randomly draw 60 monthly returns from the historical pool and compound them from today's price. Sampling with replacement means any month can appear multiple times. Repeating this 1,000 times produces a cloud of equally plausible 5-year trajectories.
The spread in those paths is the signal: it measures how wide the range of outcomes actually is, without assuming a parametric distribution.
future_idx, p5, p50, p95, sim_paths = run_bootstrap(monthly_ret, last_price)
hist_plot = df_monthly.loc[df_monthly.index >= '2020-01-01', 'close']
plt.figure(figsize=(8, 4.5))
rng_plot = np.random.default_rng(0)
sample_idx = rng_plot.choice(N_SIMS, size=100, replace=False)
for i in sample_idx:
plt.plot(
future_idx,
sim_paths[:, i],
color=FL_BLUE,
alpha=0.05,
linewidth=0.8
)
plt.plot(
hist_plot.index,
hist_plot.values,
color=FL_TEXT,
linewidth=1.8,
label='History'
)
plt.plot(
future_idx,
p50,
color=FL_BLUE,
linewidth=2.0,
label='Median'
)
plt.plot(
future_idx,
p5,
color=FL_RED,
linewidth=1.0,
linestyle='--',
label='5th pct'
)
plt.plot(
future_idx,
p95,
color=FL_GREEN,
linewidth=1.0,
linestyle='--',
label='95th pct'
)
plt.gca().yaxis.set_major_formatter(
mticker.FuncFormatter(lambda v, _: f'${v:,.0f}')
)
plt.gca().xaxis.set_major_locator(mdates.YearLocator(1))
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
plt.title(f'History since 2020 and {N_SIMS} simulated 5 year paths')
plt.legend()
plt.tick_params(axis='both', which='both', length=0)
plt.tight_layout()
plt.show()
Uncertainty band
Computing percentiles across all simulations at each point in time condenses the cloud into a readable band. The band widens over time because uncertainty compounds - a mathematical inevitability, not a model limitation. The further out we look, the wider the honest range of plausible outcomes.
plt.figure(figsize=(8, 4.5))
plt.fill_between(
future_idx,
p5,
p95,
alpha=0.15,
color=FL_BLUE,
label='5th to 95th pct (90% band)'
)
plt.fill_between(
future_idx,
p5,
p50,
alpha=0.10,
color=FL_RED
)
plt.plot(
future_idx,
p50,
color=FL_BLUE,
linewidth=2.0,
label='Median'
)
plt.plot(
future_idx,
p5,
color=FL_RED,
linewidth=1.0,
linestyle='--',
label='5th pct'
)
plt.plot(
future_idx,
p95,
color=FL_GREEN,
linewidth=1.0,
linestyle='--',
label='95th pct'
)
plt.axhline(
last_price,
color=FL_GRID,
linewidth=0.9,
linestyle=':',
label=f'Start (${last_price:,.0f})'
)
plt.gca().yaxis.set_major_formatter(
mticker.FuncFormatter(lambda v, _: f'${v:,.0f}')
)
plt.gca().xaxis.set_major_locator(mdates.YearLocator(1))
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
plt.title('Median path and 90% confidence band')
plt.legend()
plt.tick_params(axis='both', which='both', length=0)
plt.tight_layout()
plt.show()
def band_width_at(months):
idx = min(months - 1, len(p5) - 1)
low = (p5[idx] / last_price - 1) * 100
high = (p95[idx] / last_price - 1) * 100
return low, high
band_df = pd.DataFrame([
{
'Horizon': '1 year',
'90% band': f'{band_width_at(12)[0]:.0f}% to +{band_width_at(12)[1]:.0f}%'
},
{
'Horizon': '3 years',
'90% band': f'{band_width_at(36)[0]:.0f}% to +{band_width_at(36)[1]:.0f}%'
},
{
'Horizon': '5 years',
'90% band': f'{band_width_at(60)[0]:.0f}% to +{band_width_at(60)[1]:.0f}%'
}
])
display(band_df)
print('The band widens over time because each additional month compounds the uncertainty already in the data.')
| Horizon | 90% band | |
|---|---|---|
| 0 | 1 year | -14% to +40% |
| 1 | 3 years | -10% to +105% |
| 2 | 5 years | -7% to +175% |
The band widens over time because each additional month compounds the uncertainty already in the data.
Outcome distribution
Taking each simulation's ending value and computing total return gives a distribution of 5-year outcomes. The percentile markers translate this into actionable numbers.
Bootstrap does not predict. It attaches probabilities to a range of plausible outcomes grounded in historical behavior, without assuming a parametric distribution. The shape of this histogram is the honest answer to "what might happen over 5 years?"
final_values = sim_paths[-1, :]
final_returns = final_values / last_price - 1.0
p5_ret = float(np.percentile(final_returns, 5))
p25_ret = float(np.percentile(final_returns, 25))
p50_ret = float(np.percentile(final_returns, 50))
p75_ret = float(np.percentile(final_returns, 75))
p95_ret = float(np.percentile(final_returns, 95))
p_loss = float((final_returns < 0).mean())
cagr_med = float((p50[-1] / last_price) ** (1 / 5) - 1)
display(HTML(render_cards([
{'label': '5th percentile', 'value': fmt_pct(p5_ret * 100, 1)},
{'label': '25th percentile', 'value': fmt_pct(p25_ret * 100, 1)},
{'label': 'Median (50th)', 'value': fmt_pct(p50_ret * 100, 1)},
{'label': '75th percentile', 'value': fmt_pct(p75_ret * 100, 1)},
{'label': '95th percentile', 'value': fmt_pct(p95_ret * 100, 1)},
{'label': 'Median CAGR', 'value': fmt_pct(cagr_med * 100, 1)},
{'label': 'Probability of loss','value': fmt_pct(p_loss * 100, 1)},
])))
plt.figure(figsize=(8, 4.5))
plt.hist(
final_returns * 100,
bins=50,
color=FL_BLUE,
alpha=0.75,
edgecolor='none'
)
plt.axvline(
0,
color=FL_RED,
linewidth=1.2,
linestyle='--',
label='Break-even (0%)'
)
plt.axvline(
p5_ret * 100,
color=FL_RED,
linewidth=0.9,
linestyle=':',
label=f'5th pct ({fmt_pct(p5_ret * 100, 1)})'
)
plt.axvline(
p50_ret * 100,
color=FL_AMBER,
linewidth=1.2,
linestyle='--',
label=f'Median ({fmt_pct(p50_ret * 100, 1)})'
)
plt.axvline(
p95_ret * 100,
color=FL_GREEN,
linewidth=0.9,
linestyle=':',
label=f'95th pct ({fmt_pct(p95_ret * 100, 1)})'
)
plt.gca().xaxis.set_major_formatter(
mticker.FuncFormatter(lambda v, _: f'{v:.0f}%')
)
plt.xlabel('5-year cumulative return')
plt.ylabel('Simulations')
plt.title(f'Distribution of 5 year outcomes across {N_SIMS} simulations')
plt.legend(fontsize=9)
plt.tick_params(axis='both', which='both', length=0)
plt.tight_layout()
plt.show()
outcome_df = pd.DataFrame([
{
'Metric': 'Starting price',
'Value': f'${last_price:,.2f}'
},
{
'Metric': 'Median 5 year return',
'Value': fmt_pct(p50_ret * 100, 1)
},
{
'Metric': 'Median CAGR',
'Value': fmt_pct(cagr_med * 100, 1)
},
{
'Metric': '5th percentile return',
'Value': fmt_pct(p5_ret * 100, 1)
},
{
'Metric': '95th percentile return',
'Value': fmt_pct(p95_ret * 100, 1)
},
{
'Metric': 'Probability of negative 5 year return',
'Value': fmt_pct(p_loss * 100, 1)
}
])
display(outcome_df)
| Metric | Value | |
|---|---|---|
| 0 | Starting price | $745.64 |
| 1 | Median 5 year return | 67.4% |
| 2 | Median CAGR | 10.8% |
| 3 | 5th percentile return | -7.2% |
| 4 | 95th percentile return | 175.3% |
| 5 | Probability of negative 5 year return | 6.8% |