XGBoost Regression Model Reference

This notebook is a practical reference for XGBoost regression.

It shows:

- how to load a small public regression dataset
- how boosted trees differ from Random Forest
- how to train XGBRegressor
- how to evaluate predictions
- how to inspect feature importance
- how to save outputs

Install packages

In [10]:

# Run once if any of these packages is missing from the kernel's environment
# (inside a notebook cell, use the `%pip` magic so the install targets this kernel):
# pip install pandas numpy scikit-learn matplotlib xgboost

Create folders

In [2]:

from pathlib import Path

# Folders used by the notebook, relative to the current working directory.
data_dir, output_dir = Path("data"), Path("outputs")

for folder in (data_dir, output_dir):
    # parents/exist_ok make this cell safe to run repeatedly.
    folder.mkdir(parents=True, exist_ok=True)

Load a small public dataset

In [3]:

import pandas as pd
from sklearn.datasets import load_diabetes

# Load the diabetes dataset as pandas objects; `.frame` holds the feature
# columns together with the `target` column.
dataset = load_diabetes(as_frame=True)
df = dataset.frame.copy()

# Keep a CSV copy of the data in the data folder (row index not written).
csv_path = data_dir / "diabetes_regression_sample.csv"
df.to_csv(csv_path, index=False)

print(df.shape)
df.head()

(442, 11)

Out[3]:

	age	sex	bmi	bp	s1	s2	s3	s4	s5	s6	target
0	0.038076	0.050680	0.061696	0.021872	-0.044223	-0.034821	-0.043401	-0.002592	0.019907	-0.017646	151.0
1	-0.001882	-0.044642	-0.051474	-0.026328	-0.008449	-0.019163	0.074412	-0.039493	-0.068332	-0.092204	75.0
2	0.085299	0.050680	0.044451	-0.005670	-0.045599	-0.034194	-0.032356	-0.002592	0.002861	-0.025930	141.0
3	-0.089063	-0.044642	-0.011595	-0.036656	0.012191	0.024991	-0.036038	0.034309	0.022688	-0.009362	206.0
4	0.005383	-0.044642	-0.036385	0.021872	0.003935	0.015596	0.008142	-0.002592	-0.031988	-0.046641	135.0

Prepare train and validation data

In [4]:

from sklearn.model_selection import train_test_split

# Separate the column being predicted from the predictor columns.
y = df["target"]
X = df.drop(columns="target")

# Hold out 25% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=42)

# One shape per line: training set first, then validation set.
print(X_train.shape, X_valid.shape, sep="\n")

(331, 10)
(111, 10)

Train an XGBoost regressor

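XGBoost builds trees sequentially: each new tree is fit to correct the errors left by the trees before it. A Random Forest, by contrast, grows its trees independently of one another on bootstrap samples and averages their predictions.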

In [5]:

from xgboost import XGBRegressor

# Hyperparameters of the baseline model, gathered in one place.
xgb_params = {
    "n_estimators": 300,            # number of boosted trees
    "max_depth": 3,                 # maximum depth of each tree
    "learning_rate": 0.03,          # shrinkage applied to each tree's contribution
    "subsample": 0.9,               # fraction of rows sampled per tree
    "colsample_bytree": 0.9,        # fraction of columns sampled per tree
    "reg_lambda": 1.0,              # L2 regularisation on leaf weights
    "random_state": 42,             # fixed seed for the row/column sampling
    "n_jobs": -1,
    "tree_method": "hist",
    "objective": "reg:squarederror",
}

xgb = XGBRegressor(**xgb_params)
xgb.fit(X_train, y_train)

# Predictions for the held-out rows; later cells score these.
pred = xgb.predict(X_valid)

pred[:5]

Out[5]:

array([152.07732, 201.66719, 159.84453, 278.6145 , 125.21937],
      dtype=float32)

Evaluate the model

In [6]:

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Report the hyperparameters the fitted model actually holds instead of
# retyping them, so this table cannot drift from the training cell.
xgb_settings = xgb.get_params()

metrics = pd.DataFrame([
    {
        "model": "XGBoost",
        "n_estimators": xgb_settings["n_estimators"],
        "max_depth": xgb_settings["max_depth"],
        "learning_rate": xgb_settings["learning_rate"],
        "mae": mean_absolute_error(y_valid, pred),
        # RMSE is the square root of the mean squared error, in target units.
        "rmse": np.sqrt(mean_squared_error(y_valid, pred)),
        "r2": r2_score(y_valid, pred)
    }
])

# Save the validation scores in the outputs folder created at the top of the notebook.
metrics.to_csv(output_dir / "xgboost_metrics.csv", index=False)

metrics

Out[6]:

	model	n_estimators	max_depth	learning_rate	mae	rmse	r2
0	XGBoost	300	3	0.03	43.667246	55.508984	0.442781

Compare learning rates

In [7]:

from sklearn.base import clone

rate_results = []

for rate in [0.01, 0.03, 0.05, 0.10]:
    # Start from an unfitted copy of the baseline estimator and change only the
    # learning rate, so every other hyperparameter (including the 300 trees)
    # stays identical to the baseline and the comparison isolates one setting.
    model = clone(xgb).set_params(learning_rate=rate)

    model.fit(X_train, y_train)
    model_pred = model.predict(X_valid)

    # Same three validation metrics as the baseline evaluation.
    rate_results.append({
        "learning_rate": rate,
        "mae": mean_absolute_error(y_valid, model_pred),
        "rmse": np.sqrt(mean_squared_error(y_valid, model_pred)),
        "r2": r2_score(y_valid, model_pred)
    })

# One row per learning rate, in the order tried.
learning_rate_comparison = pd.DataFrame(rate_results)

# Save the comparison in the outputs folder created at the top of the notebook.
learning_rate_comparison.to_csv(
    output_dir / "xgboost_learning_rate_comparison.csv", index=False
)

learning_rate_comparison

Out[7]:

	learning_rate	mae	rmse	r2
0	0.01	42.759235	53.331663	0.485637
1	0.03	43.667246	55.508984	0.442781
2	0.05	45.044640	57.439444	0.403350
3	0.10	45.636429	58.078716	0.389995

Inspect feature importance

In [8]:

# Pair each training column with the importance score of the fitted baseline
# model, highest first. The original row labels are kept, so the index shows
# each feature's column position in X_train.
feature_importance = pd.DataFrame({
    "feature": X_train.columns,
    "importance": xgb.feature_importances_
}).sort_values("importance", ascending=False)

# Save the ranking in the outputs folder created at the top of the notebook.
feature_importance.to_csv(output_dir / "xgboost_feature_importance.csv", index=False)

feature_importance

Out[8]:

	feature	importance
2	bmi	0.268786
8	s5	0.189291
3	bp	0.097130
7	s4	0.079540
6	s3	0.075476
9	s6	0.070914
4	s1	0.066521
0	age	0.051980
5	s2	0.050685
1	sex	0.049677

Feature importance in ascending order

In [9]:

# Same table with the least important feature first (sort_values is ascending by default).
feature_importance.sort_values(by="importance")

Out[9]:

	feature	importance
1	sex	0.049677
5	s2	0.050685
0	age	0.051980
4	s1	0.066521
9	s6	0.070914
6	s3	0.075476
7	s4	0.079540
3	bp	0.097130
8	s5	0.189291
2	bmi	0.268786

from pathlib import Path

data_dir = Path("data")
output_dir = Path("outputs")

# Create both folders; exist_ok makes the cell safe to re-run.
data_dir.mkdir(parents=True, exist_ok=True)
output_dir.mkdir(parents=True, exist_ok=True)

# print(data_dir.resolve())
# print(output_dir.resolve())

import pandas as pd
from sklearn.datasets import load_diabetes

# Load the dataset as pandas objects and keep a CSV copy in the data folder.
dataset = load_diabetes(as_frame=True)

df = dataset.frame.copy()
df.to_csv(data_dir / "diabetes_regression_sample.csv", index=False)

print(df.shape)
df.head()

|   | age | sex | bmi | bp | s1 | s2 | s3 | s4 | s5 | s6 | target |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.038076 | 0.050680 | 0.061696 | 0.021872 | -0.044223 | -0.034821 | -0.043401 | -0.002592 | 0.019907 | -0.017646 | 151.0 |
| 1 | -0.001882 | -0.044642 | -0.051474 | -0.026328 | -0.008449 | -0.019163 | 0.074412 | -0.039493 | -0.068332 | -0.092204 | 75.0 |
| 2 | 0.085299 | 0.050680 | 0.044451 | -0.005670 | -0.045599 | -0.034194 | -0.032356 | -0.002592 | 0.002861 | -0.025930 | 141.0 |
| 3 | -0.089063 | -0.044642 | -0.011595 | -0.036656 | 0.012191 | 0.024991 | -0.036038 | 0.034309 | 0.022688 | -0.009362 | 206.0 |
| 4 | 0.005383 | -0.044642 | -0.036385 | 0.021872 | 0.003935 | 0.015596 | 0.008142 | -0.002592 | -0.031988 | -0.046641 | 135.0 |

from sklearn.model_selection import train_test_split

# Predictors are every column except `target`.
X = df.drop(columns=["target"])
y = df["target"]

# 75/25 split with a fixed seed so the split is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42
)

print(X_train.shape)
print(X_valid.shape)

from xgboost import XGBRegressor

# Baseline gradient-boosted regressor.
xgb = XGBRegressor(
    n_estimators=300,
    max_depth=3,
    learning_rate=0.03,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1,
    tree_method="hist",
    objective="reg:squarederror"
)

xgb.fit(X_train, y_train)

# Predictions for the held-out rows.
pred = xgb.predict(X_valid)

pred[:5]

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Single-row summary of the baseline model's validation scores.
metrics = pd.DataFrame([
    {
        "model": "XGBoost",
        "n_estimators": 300,
        "max_depth": 3,
        "learning_rate": 0.03,
        "mae": mean_absolute_error(y_valid, pred),
        "rmse": np.sqrt(mean_squared_error(y_valid, pred)),
        "r2": r2_score(y_valid, pred)
    }
])

metrics

|   | model | n_estimators | max_depth | learning_rate | mae | rmse | r2 |
|---|---|---|---|---|---|---|---|
| 0 | XGBoost | 300 | 3 | 0.03 | 43.667246 | 55.508984 | 0.442781 |

rate_results = []

# Refit the same configuration once per learning rate and score each on the validation set.
for rate in [0.01, 0.03, 0.05, 0.10]:
    model = XGBRegressor(
        n_estimators=300,
        max_depth=3,
        learning_rate=rate,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1,
        tree_method="hist",
        objective="reg:squarederror"
    )

    model.fit(X_train, y_train)
    model_pred = model.predict(X_valid)

    rate_results.append({
        "learning_rate": rate,
        "mae": mean_absolute_error(y_valid, model_pred),
        "rmse": np.sqrt(mean_squared_error(y_valid, model_pred)),
        "r2": r2_score(y_valid, model_pred)
    })

learning_rate_comparison = pd.DataFrame(rate_results)
learning_rate_comparison

|   | learning_rate | mae | rmse | r2 |
|---|---|---|---|---|
| 0 | 0.01 | 42.759235 | 53.331663 | 0.485637 |
| 1 | 0.03 | 43.667246 | 55.508984 | 0.442781 |
| 2 | 0.05 | 45.044640 | 57.439444 | 0.403350 |
| 3 | 0.10 | 45.636429 | 58.078716 | 0.389995 |

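|   | feature | importance |
|---|---|---|
| 2 | bmi | 0.268786 |
| 8 | s5 | 0.189291 |
| 3 | bp | 0.097130 |
| 7 | s4 | 0.079540 |
| 6 | s3 | 0.075476 |
| 9 | s6 | 0.070914 |
| 4 | s1 | 0.066521 |
| 0 | age | 0.051980 |
| 5 | s2 | 0.050685 |
| 1 | sex | 0.049677 |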

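|   | feature | importance |
|---|---|---|
| 1 | sex | 0.049677 |
| 5 | s2 | 0.050685 |
| 0 | age | 0.051980 |
| 4 | s1 | 0.066521 |
| 9 | s6 | 0.070914 |
| 6 | s3 | 0.075476 |
| 7 | s4 | 0.079540 |
| 3 | bp | 0.097130 |
| 8 | s5 | 0.189291 |
| 2 | bmi | 0.268786 |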