Bootstrap Resampling

A pedagogical walkthrough of bootstrap resampling using real S&P 500 data.

The analysis proceeds in five steps:

Historical data: SPY price history and monthly returns
Return distribution: histogram of monthly returns and summary statistics
Simulation: 1,000 bootstrap paths over 5 years
Uncertainty band: 90% confidence band and how it widens over time
Outcome distribution: terminal return histogram and percentile outcomes

Parameter	Value
Ticker	SPY (S&P 500 ETF)
Data source	Tiingo daily adjusted prices
Simulation paths	1,000 bootstrap paths
Horizon	60 months (5 years)
Bootstrap seed	42 (reproducible)

Requires: TIINGO_API_KEY in /.env.

In [1]:

import os
import numpy as np
import pandas as pd
import requests
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as mticker

from pathlib import Path
from dotenv import load_dotenv
#pip install python-dotenv
from IPython.display import display, HTML

FL_BLUE   = '#2563eb'
FL_SLATE  = '#64748b'
FL_AMBER  = '#f59e0b'
FL_GREEN  = '#16a34a'
FL_RED    = '#ef4444'
FL_BG     = '#ffffff'
FL_GRID   = '#e2e8f0'
FL_TEXT   = '#0f172a'
FL_TEXT2  = '#334155'
FL_BORDER = '#e2e8f0'

matplotlib.rcParams.update({
    'figure.facecolor':  FL_BG,
    'axes.facecolor':    FL_BG,
    'axes.edgecolor':    FL_BORDER,
    'axes.labelcolor':   FL_TEXT2,
    'axes.spines.top':   False,
    'axes.spines.right': False,
    'axes.grid':         True,
    'grid.color':        FL_GRID,
    'grid.linewidth':    0.7,
    'xtick.color':       FL_TEXT2,
    'ytick.color':       FL_TEXT2,
    'xtick.labelsize':   10,
    'ytick.labelsize':   10,
    'axes.labelsize':    11,
    'axes.titlesize':    12,
    'axes.titlecolor':   FL_TEXT,
    'axes.titlepad':     12,
    'legend.frameon':    False,
    'legend.fontsize':   10,
    'figure.dpi':        300,
    'savefig.bbox':      'tight',
    'font.family':       'sans-serif',
    'font.sans-serif':   ['Inter', 'Helvetica Neue', 'Arial', 'DejaVu Sans'],
})

ENV_PATH = Path.cwd() / '.env'
load_dotenv(ENV_PATH)

TIINGO_API_KEY = os.getenv('TIINGO_API_KEY')
if not TIINGO_API_KEY:
    raise RuntimeError(f'TIINGO_API_KEY not found in {ENV_PATH}')

TICKER          = 'SPY'
START_DATE      = '1993-01-01'
N_SIMS          = 1_000
HORIZON_MONTHS  = 60
SEED            = 42

Helper functions

Tiingo fetch, monthly resampling, bootstrap runner, formatting, and HTML renderers.

In [2]:

def fetch_tiingo(ticker: str, start_date: str) -> pd.DataFrame:
    """Fetch daily adjusted prices from Tiingo."""
    resp = requests.get(
        f'https://api.tiingo.com/tiingo/daily/{ticker}/prices',
        headers={'Authorization': f'Token {TIINGO_API_KEY}', 'Content-Type': 'application/json'},
        params={'startDate': start_date, 'resampleFreq': 'daily', 'format': 'json'},
        timeout=60,
    )
    resp.raise_for_status()
    raw = pd.DataFrame(resp.json())
    if raw.empty:
        raise ValueError(f'Tiingo returned no data for {ticker}')

    close_col = next((c for c in ['adjClose', 'close', 'adjclose'] if c in raw.columns), None)
    if close_col is None:
        raise ValueError(f'No close column found. Columns: {list(raw.columns)}')

    # Slice to YYYY-MM-DD to avoid tz suffix issues ('1993-01-29T00:00:00.000Z')
    df = pd.DataFrame()
    df['date']  = pd.to_datetime(raw['date'].str[:10])
    df['close'] = pd.to_numeric(raw[close_col].values, errors='coerce')
    df = df.set_index('date').sort_index().dropna(subset=['close'])
    df['pct_ret'] = df['close'].pct_change()
    return df


def daily_to_monthly(df: pd.DataFrame) -> pd.DataFrame:
    monthly = df['close'].resample('ME').last().dropna()
    monthly_ret = monthly.pct_change().dropna()
    return pd.DataFrame({'close': monthly, 'pct_ret': monthly_ret})


def run_bootstrap(monthly_ret: pd.Series, last_price: float) -> tuple:
    """Return (future_idx, p5, p50, p95, sim_paths)."""
    start = monthly_ret.index[-1] + pd.offsets.MonthEnd(1)
    future_idx = pd.date_range(start=start, periods=HORIZON_MONTHS, freq='ME')
    rng = np.random.default_rng(SEED)
    sim_rets  = rng.choice(monthly_ret.values, size=(HORIZON_MONTHS, N_SIMS), replace=True)
    sim_paths = last_price * np.cumprod(1.0 + sim_rets, axis=0)
    p5  = np.percentile(sim_paths,  5, axis=1)
    p50 = np.percentile(sim_paths, 50, axis=1)
    p95 = np.percentile(sim_paths, 95, axis=1)
    return future_idx, p5, p50, p95, sim_paths


def fmt_pct(v, d=2, frac=False):
    if v is None or (isinstance(v, float) and not np.isfinite(v)): return 'N/A'
    return f'{v * 100 if frac else v:.{d}f}%'

def fmt_ratio(v):
    if v is None or (isinstance(v, float) and not np.isfinite(v)): return 'N/A'
    return f'{v:.2f}'


def render_cards(cards: list) -> str:
    items = ''.join(
        f'<div style="background:#f8fafc;border:1px solid #e2e8f0;border-radius:8px;'
        f'padding:18px 16px;text-align:center;min-width:140px;flex:1 1 140px">'
        f'<div style="font-size:20px;font-weight:600;color:#0f172a;'
        f'letter-spacing:-0.02em;margin-bottom:6px">{c["value"]}</div>'
        f'<div style="font-size:11.5px;color:#64748b;font-weight:500">{c["label"]}</div>'
        f'</div>'
        for c in cards
    )
    return f'<div style="display:flex;flex-wrap:wrap;gap:12px;margin:16px 0">{items}</div>'

Data fetch

Daily adjusted prices for SPY are fetched from Tiingo and resampled to month-end closes. The monthly return series is the sampling pool for all bootstrap simulations below.

In [5]:

print(f'Fetching {TICKER} from Tiingo ({START_DATE} --> today)...')
df_daily   = fetch_tiingo(TICKER, START_DATE)
df_monthly = daily_to_monthly(df_daily)
monthly_ret = df_monthly['pct_ret'].dropna()
last_price  = float(df_monthly['close'].iloc[-1])

start_str = df_daily.index[0].strftime('%Y-%m-%d')
end_str   = df_daily.index[-1].strftime('%Y-%m-%d')
n_obs     = len(monthly_ret)

print(f'Daily rows   : {len(df_daily):,}  ({start_str} → {end_str})')
print(f'Monthly rows : {n_obs}')
print(f'Last SPY close: ${last_price:,.2f}')

Fetching SPY from Tiingo (1993-01-01 --> today)...
Daily rows   : 8,386  (1993-01-29 → 2026-05-22)
Monthly rows : 400
Last SPY close: $745.64

Historical data

The S&P 500 has decades of price history across booms, crashes, and recoveries. Rather than sampling prices directly, we transform them into monthly returns using the percentage changes for month to month market moves. This yields a pool of observed returns across different market regimes, which is the raw material for the bootstrap.

In [6]:

prices = df_monthly['close']

plt.figure(figsize=(8, 4.5))
plt.fill_between(prices.index, prices.values, alpha=0.10, color=FL_BLUE)
plt.plot(prices.index, prices.values, color=FL_BLUE, linewidth=1.8)
plt.gca().yaxis.set_major_formatter(
    mticker.FuncFormatter(lambda v, _: f'${v:,.0f}')
)
plt.gca().xaxis.set_major_locator(mdates.YearLocator(4))
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
plt.title('S&P 500 price history (adjusted close)')
plt.tick_params(axis='both', which='both', length=0)
plt.tight_layout()
plt.show()

colors = [FL_GREEN if v >= 0 else FL_RED for v in monthly_ret.values]

plt.figure(figsize=(8, 4.5))
plt.bar(
    monthly_ret.index,
    monthly_ret.values * 100,
    color=colors,
    alpha=0.75,
    width=20
)
plt.axhline(0, color=FL_GRID, linewidth=0.8)
plt.gca().yaxis.set_major_formatter(
    mticker.FuncFormatter(lambda v, _: f'{v:.0f}%')
)
plt.gca().xaxis.set_major_locator(mdates.YearLocator(4))
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
plt.title('Monthly returns')
plt.tick_params(axis='both', which='both', length=0)
plt.tight_layout()
plt.show()

print(
    f'From {start_str[:4]} to {end_str[:4]}, there are {n_obs} monthly return observations '
    f'across different market regimes. This is the sampling pool for the bootstrap.'
)

No description has been provided for this image

From 1993 to 2026, there are 400 monthly return observations across different market regimes. This is the sampling pool for the bootstrap.

Return distribution

Stacking all monthly returns into a histogram reveals the shape of the data. Most returns cluster near zero. The tails represent rare but significant months. This distribution is the sampling pool for the bootstrap - we ask what would happen if future returns were random draws from this same pool.

In [7]:

mean_m       = float(monthly_ret.mean())
std_m        = float(monthly_ret.std(ddof=1))
best_m       = float(monthly_ret.max())
worst_m      = float(monthly_ret.min())
pct_positive = float((monthly_ret > 0).mean())

display(HTML(render_cards([
    {'label': 'Mean monthly return', 'value': fmt_pct(mean_m  * 100, 2)},
    {'label': 'Std deviation',       'value': fmt_pct(std_m   * 100, 2)},
    {'label': 'Best month',          'value': fmt_pct(best_m  * 100, 1)},
    {'label': 'Worst month',         'value': fmt_pct(worst_m * 100, 1)},
    {'label': 'Positive months',     'value': fmt_pct(pct_positive * 100, 0)},
    {'label': 'Observations',        'value': str(n_obs)},
])))

0.95%

Mean monthly return

4.27%

Std deviation

12.7%

Best month

-16.5%

Worst month

65%

Positive months

400

Observations

In [8]:

plt.figure(figsize=(8, 4.5))
plt.hist(
    monthly_ret.values * 100,
    bins=40,
    color=FL_BLUE,
    alpha=0.75,
    edgecolor='none'
)
plt.axvline(0, color=FL_GRID, linewidth=0.8)
plt.axvline(
    mean_m * 100,
    color=FL_AMBER,
    linewidth=1.2,
    linestyle='--',
    label=f'Mean ({fmt_pct(mean_m * 100, 2)})'
)
plt.axvline(
    worst_m * 100,
    color=FL_RED,
    linewidth=1.0,
    linestyle=':',
    label=f'Worst ({fmt_pct(worst_m * 100, 1)})'
)
plt.axvline(
    best_m * 100,
    color=FL_GREEN,
    linewidth=1.0,
    linestyle=':',
    label=f'Best ({fmt_pct(best_m * 100, 1)})'
)
plt.gca().xaxis.set_major_formatter(
    mticker.FuncFormatter(lambda v, _: f'{v:.0f}%')
)
plt.xlabel('Monthly return')
plt.ylabel('Frequency')
plt.title('Distribution of monthly returns for the bootstrap sampling pool')
plt.legend()
plt.tick_params(axis='both', which='both', length=0)
plt.tight_layout()
plt.show()

Simulation

To build one simulated future, we randomly draw 60 monthly returns from the historical pool and compound them from today's price. Sampling with replacement means any month can appear multiple times. Repeating this 1,000 times produces a cloud of equally plausible 5-year trajectories.

The spread in those paths is the signal: it measures how wide the range of outcomes actually is, without assuming a parametric distribution.

In [9]:

future_idx, p5, p50, p95, sim_paths = run_bootstrap(monthly_ret, last_price)

hist_plot = df_monthly.loc[df_monthly.index >= '2020-01-01', 'close']

plt.figure(figsize=(8, 4.5))

rng_plot = np.random.default_rng(0)
sample_idx = rng_plot.choice(N_SIMS, size=100, replace=False)

for i in sample_idx:
    plt.plot(
        future_idx,
        sim_paths[:, i],
        color=FL_BLUE,
        alpha=0.05,
        linewidth=0.8
    )

plt.plot(
    hist_plot.index,
    hist_plot.values,
    color=FL_TEXT,
    linewidth=1.8,
    label='History'
)
plt.plot(
    future_idx,
    p50,
    color=FL_BLUE,
    linewidth=2.0,
    label='Median'
)
plt.plot(
    future_idx,
    p5,
    color=FL_RED,
    linewidth=1.0,
    linestyle='--',
    label='5th pct'
)
plt.plot(
    future_idx,
    p95,
    color=FL_GREEN,
    linewidth=1.0,
    linestyle='--',
    label='95th pct'
)

plt.gca().yaxis.set_major_formatter(
    mticker.FuncFormatter(lambda v, _: f'${v:,.0f}')
)
plt.gca().xaxis.set_major_locator(mdates.YearLocator(1))
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
plt.title(f'History since 2020 and {N_SIMS} simulated 5 year paths')
plt.legend()
plt.tick_params(axis='both', which='both', length=0)
plt.tight_layout()
plt.show()

Uncertainty band

Computing percentiles across all simulations at each point in time condenses the cloud into a readable band. The band widens over time because uncertainty compounds - a mathematical inevitability, not a model limitation. The further out we look, the wider the honest range of plausible outcomes.

In [10]:

plt.figure(figsize=(8, 4.5))

plt.fill_between(
    future_idx,
    p5,
    p95,
    alpha=0.15,
    color=FL_BLUE,
    label='5th to 95th pct (90% band)'
)
plt.fill_between(
    future_idx,
    p5,
    p50,
    alpha=0.10,
    color=FL_RED
)
plt.plot(
    future_idx,
    p50,
    color=FL_BLUE,
    linewidth=2.0,
    label='Median'
)
plt.plot(
    future_idx,
    p5,
    color=FL_RED,
    linewidth=1.0,
    linestyle='--',
    label='5th pct'
)
plt.plot(
    future_idx,
    p95,
    color=FL_GREEN,
    linewidth=1.0,
    linestyle='--',
    label='95th pct'
)
plt.axhline(
    last_price,
    color=FL_GRID,
    linewidth=0.9,
    linestyle=':',
    label=f'Start (${last_price:,.0f})'
)

plt.gca().yaxis.set_major_formatter(
    mticker.FuncFormatter(lambda v, _: f'${v:,.0f}')
)
plt.gca().xaxis.set_major_locator(mdates.YearLocator(1))
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
plt.title('Median path and 90% confidence band')
plt.legend()
plt.tick_params(axis='both', which='both', length=0)
plt.tight_layout()
plt.show()

def band_width_at(months):
    idx = min(months - 1, len(p5) - 1)
    low = (p5[idx] / last_price - 1) * 100
    high = (p95[idx] / last_price - 1) * 100
    return low, high

band_df = pd.DataFrame([
    {
        'Horizon': '1 year',
        '90% band': f'{band_width_at(12)[0]:.0f}% to +{band_width_at(12)[1]:.0f}%'
    },
    {
        'Horizon': '3 years',
        '90% band': f'{band_width_at(36)[0]:.0f}% to +{band_width_at(36)[1]:.0f}%'
    },
    {
        'Horizon': '5 years',
        '90% band': f'{band_width_at(60)[0]:.0f}% to +{band_width_at(60)[1]:.0f}%'
    }
])

display(band_df)

print('The band widens over time because each additional month compounds the uncertainty already in the data.')

	Horizon	90% band
0	1 year	-14% to +40%
1	3 years	-10% to +105%
2	5 years	-7% to +175%

The band widens over time because each additional month compounds the uncertainty already in the data.

Outcome distribution

Taking each simulation's ending value and computing total return gives a distribution of 5-year outcomes. The percentile markers translate this into actionable numbers.

Bootstrap does not predict. It attaches probabilities to a range of plausible outcomes grounded in historical behavior, without assuming a parametric distribution. The shape of this histogram is the honest answer to "what might happen over 5 years?"

In [11]:

final_values  = sim_paths[-1, :]
final_returns = final_values / last_price - 1.0

p5_ret  = float(np.percentile(final_returns,  5))
p25_ret = float(np.percentile(final_returns, 25))
p50_ret = float(np.percentile(final_returns, 50))
p75_ret = float(np.percentile(final_returns, 75))
p95_ret = float(np.percentile(final_returns, 95))
p_loss  = float((final_returns < 0).mean())
cagr_med = float((p50[-1] / last_price) ** (1 / 5) - 1)

display(HTML(render_cards([
    {'label': '5th percentile',    'value': fmt_pct(p5_ret  * 100, 1)},
    {'label': '25th percentile',   'value': fmt_pct(p25_ret * 100, 1)},
    {'label': 'Median (50th)',     'value': fmt_pct(p50_ret * 100, 1)},
    {'label': '75th percentile',   'value': fmt_pct(p75_ret * 100, 1)},
    {'label': '95th percentile',   'value': fmt_pct(p95_ret * 100, 1)},
    {'label': 'Median CAGR',       'value': fmt_pct(cagr_med * 100, 1)},
    {'label': 'Probability of loss','value': fmt_pct(p_loss  * 100, 1)},
])))

-7.2%

5th percentile

34.6%

25th percentile

67.4%

Median (50th)

108.7%

75th percentile

175.3%

95th percentile

10.8%

Median CAGR

6.8%

Probability of loss

In [12]:

plt.figure(figsize=(8, 4.5))
plt.hist(
    final_returns * 100,
    bins=50,
    color=FL_BLUE,
    alpha=0.75,
    edgecolor='none'
)
plt.axvline(
    0,
    color=FL_RED,
    linewidth=1.2,
    linestyle='--',
    label='Break-even (0%)'
)
plt.axvline(
    p5_ret * 100,
    color=FL_RED,
    linewidth=0.9,
    linestyle=':',
    label=f'5th pct ({fmt_pct(p5_ret * 100, 1)})'
)
plt.axvline(
    p50_ret * 100,
    color=FL_AMBER,
    linewidth=1.2,
    linestyle='--',
    label=f'Median ({fmt_pct(p50_ret * 100, 1)})'
)
plt.axvline(
    p95_ret * 100,
    color=FL_GREEN,
    linewidth=0.9,
    linestyle=':',
    label=f'95th pct ({fmt_pct(p95_ret * 100, 1)})'
)

plt.gca().xaxis.set_major_formatter(
    mticker.FuncFormatter(lambda v, _: f'{v:.0f}%')
)
plt.xlabel('5-year cumulative return')
plt.ylabel('Simulations')
plt.title(f'Distribution of 5 year outcomes across {N_SIMS} simulations')
plt.legend(fontsize=9)
plt.tick_params(axis='both', which='both', length=0)
plt.tight_layout()
plt.show()

outcome_df = pd.DataFrame([
    {
        'Metric': 'Starting price',
        'Value': f'${last_price:,.2f}'
    },
    {
        'Metric': 'Median 5 year return',
        'Value': fmt_pct(p50_ret * 100, 1)
    },
    {
        'Metric': 'Median CAGR',
        'Value': fmt_pct(cagr_med * 100, 1)
    },
    {
        'Metric': '5th percentile return',
        'Value': fmt_pct(p5_ret * 100, 1)
    },
    {
        'Metric': '95th percentile return',
        'Value': fmt_pct(p95_ret * 100, 1)
    },
    {
        'Metric': 'Probability of negative 5 year return',
        'Value': fmt_pct(p_loss * 100, 1)
    }
])

display(outcome_df)

	Metric	Value
0	Starting price	$745.64
1	Median 5 year return	67.4%
2	Median CAGR	10.8%
3	5th percentile return	-7.2%
4	95th percentile return	175.3%
5	Probability of negative 5 year return	6.8%