Random Forest Regression Model Reference

This notebook is a practical reference for Random Forest regression.

It shows:

how to load a small public regression dataset
how to train a Random Forest baseline
what the main parameters mean
how to evaluate predictions
how to inspect feature importance
how to save outputs

Install packages

In [ ]:

pip install pandas numpy scikit-learn matplotlib

Create folders

In [1]:

from pathlib import Path

# Working folders, relative to the directory the notebook is run from.
data_dir = Path("data")
output_dir = Path("outputs")

# Create both up front. parents/exist_ok make the cell safe to re-run and
# tolerant of folders that already exist.
for folder in (data_dir, output_dir):
    folder.mkdir(parents=True, exist_ok=True)

Load a small public dataset

In [2]:

import pandas as pd
from sklearn.datasets import load_diabetes

# Load the diabetes regression dataset as pandas objects; `.frame` combines
# the feature columns with the `target` column.
dataset = load_diabetes(as_frame=True)
df = dataset.frame.copy()  # work on a copy, leaving the loaded frame untouched

# Keep a CSV snapshot of the exact data used in this notebook.
csv_path = data_dir / "diabetes_regression_sample.csv"
df.to_csv(csv_path, index=False)

print(df.shape)
df.head()

(442, 11)

Out[2]:

	age	sex	bmi	bp	s1	s2	s3	s4	s5	s6	target
0	0.038076	0.050680	0.061696	0.021872	-0.044223	-0.034821	-0.043401	-0.002592	0.019907	-0.017646	151.0
1	-0.001882	-0.044642	-0.051474	-0.026328	-0.008449	-0.019163	0.074412	-0.039493	-0.068332	-0.092204	75.0
2	0.085299	0.050680	0.044451	-0.005670	-0.045599	-0.034194	-0.032356	-0.002592	0.002861	-0.025930	141.0
3	-0.089063	-0.044642	-0.011595	-0.036656	0.012191	0.024991	-0.036038	0.034309	0.022688	-0.009362	206.0
4	0.005383	-0.044642	-0.036385	0.021872	0.003935	0.015596	0.008142	-0.002592	-0.031988	-0.046641	135.0

Prepare train and validation data

In [3]:

from sklearn.model_selection import train_test_split

# Separate the column being predicted from the predictor columns.
y = df["target"]
X = df.drop(columns=["target"])

# Hold out 25% of the rows for validation; the fixed seed keeps the split
# identical from run to run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.25, random_state=42
)

print(X_train.shape, X_valid.shape, sep="\n")

(331, 10)
(111, 10)

Train a Random Forest regressor

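Random Forest trains many decision trees independently, each on a bootstrap sample of the training rows, and averages their predictions. Averaging many slightly different trees reduces the variance of any single tree.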

In [4]:

from sklearn.ensemble import RandomForestRegressor

# Baseline forest: 300 trees capped at depth 5, seeded for reproducibility.
# `fit` returns the estimator itself, so construction and training are chained.
rf = RandomForestRegressor(
    random_state=42,
    n_jobs=-1,
    n_estimators=300,
    max_depth=5,
    min_samples_leaf=1,
).fit(X_train, y_train)

# Predictions on the held-out rows; these feed the evaluation cell.
pred = rf.predict(X_valid)

pred[:5]

Out[4]:

array([147.99949333, 176.73596358, 149.98966143, 245.84321372,
       118.44289185])

Evaluate the model

In [5]:

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# One-row summary of validation performance for the baseline forest.
# The hyperparameter columns are read back from the fitted estimator instead
# of being retyped, so the table cannot drift out of sync with the training
# cell if its settings are changed.
metrics = pd.DataFrame([
    {
        "model": "Random Forest",
        "n_estimators": rf.n_estimators,
        "max_depth": rf.max_depth,
        "min_samples_leaf": rf.min_samples_leaf,
        "mae": mean_absolute_error(y_valid, pred),
        # RMSE is the square root of MSE, i.e. in the same units as the target.
        "rmse": np.sqrt(mean_squared_error(y_valid, pred)),
        "r2": r2_score(y_valid, pred)
    }
])

metrics

Out[5]:

	model	n_estimators	max_depth	min_samples_leaf	mae	rmse	r2
0	Random Forest	300	5	1	42.191033	52.969084	0.492607

Inspect feature importance

Random Forest can report impurity-based feature importance. Treat it as a quick diagnostic, not as a complete causal explanation.

In [6]:

# Pair every training column with the forest's impurity-based importance and
# rank from most to least important. The original positional index is kept,
# so each row still shows the feature's column position in X_train.
feature_importance = (
    pd.DataFrame(
        {
            "feature": X_train.columns,
            "importance": rf.feature_importances_,
        }
    )
    .sort_values(by="importance", ascending=False)
)

feature_importance

Out[6]:

	feature	importance
2	bmi	0.403142
8	s5	0.250489
3	bp	0.103107
9	s6	0.052068
0	age	0.041128
6	s3	0.040162
5	s2	0.039256
4	s1	0.036932
7	s4	0.027597
1	sex	0.006120

Plot feature importance

In [7]:

# Sort ascending so the most important feature ends up at the top of the
# horizontal bar chart (barh draws the first row at the bottom).
importance_ascending = feature_importance.sort_values("importance", ascending=True)

# Draw the chart through pandas' plotting accessor, which uses the matplotlib
# package installed at the top of the notebook.
ax = importance_ascending.plot.barh(
    x="feature",
    y="importance",
    legend=False,
    figsize=(8, 5),
    title="Random Forest feature importance",
)
ax.set_xlabel("Impurity-based importance")
ax.set_ylabel("Feature")
ax.figure.tight_layout()

# Keep a copy of the figure alongside the other notebook outputs.
ax.figure.savefig(output_dir / "rf_feature_importance.png", dpi=150)

# Still display the sorted table as the cell's result.
importance_ascending

Out[7]:

	feature	importance
1	sex	0.006120
7	s4	0.027597
4	s1	0.036932
5	s2	0.039256
6	s3	0.040162
0	age	0.041128
9	s6	0.052068
3	bp	0.103107
8	s5	0.250489
2	bmi	0.403142

Compare depth settings

In [8]:

# Settings held constant across the sweep; only max_depth varies, so any
# change in the metrics is attributable to tree depth alone.
shared_settings = {
    "n_estimators": 300,
    "min_samples_leaf": 1,
    "random_state": 42,
    "n_jobs": -1,
}

depth_results = []

for depth in (2, 3, 5, 8, None):
    model = RandomForestRegressor(max_depth=depth, **shared_settings)
    model.fit(X_train, y_train)
    model_pred = model.predict(X_valid)

    squared_error = mean_squared_error(y_valid, model_pred)

    depth_results.append(
        dict(
            # str() so that the unlimited setting shows up as "None" in the table
            max_depth=str(depth),
            mae=mean_absolute_error(y_valid, model_pred),
            rmse=np.sqrt(squared_error),
            r2=r2_score(y_valid, model_pred),
        )
    )

depth_comparison = pd.DataFrame(depth_results)
depth_comparison

Out[8]:

	max_depth	mae	rmse	r2
0	2	43.480285	53.808550	0.476397
1	3	41.925885	52.445176	0.502595
2	5	42.191033	52.969084	0.492607
3	8	42.949263	53.643369	0.479607
4	None	43.096336	53.925084	0.474127

from pathlib import Path data_dir = Path("data") output_dir = Path("outputs") data_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True) # print(data_dir.resolve()) # print(output_dir.resolve())

import pandas as pd from sklearn.datasets import load_diabetes dataset = load_diabetes(as_frame=True) df = dataset.frame.copy() df.to_csv(data_dir / "diabetes_regression_sample.csv", index=False) print(df.shape) df.head()

age

sex

bmi

target

0.038076

0.050680

0.061696

0.021872

-0.044223

-0.034821

-0.043401

-0.002592

0.019907

-0.017646

151.0

-0.001882

-0.044642

-0.051474

-0.026328

-0.008449

-0.019163

0.074412

-0.039493

-0.068332

-0.092204

75.0

0.085299

0.050680

0.044451

-0.005670

-0.045599

-0.034194

-0.032356

-0.002592

0.002861

-0.025930

141.0

-0.089063

-0.044642

-0.011595

-0.036656

0.012191

0.024991

-0.036038

0.034309

0.022688

-0.009362

206.0

0.005383

-0.044642

-0.036385

0.021872

0.003935

0.015596

0.008142

-0.002592

-0.031988

-0.046641

135.0

from sklearn.model_selection import train_test_split X = df.drop(columns=["target"]) y = df["target"] X_train, X_valid, y_train, y_valid = train_test_split( X, y, test_size=0.25, random_state=42 ) print(X_train.shape) print(X_valid.shape)

from sklearn.ensemble import RandomForestRegressor rf = RandomForestRegressor( n_estimators=300, max_depth=5, min_samples_leaf=1, random_state=42, n_jobs=-1 ) rf.fit(X_train, y_train) pred = rf.predict(X_valid) pred[:5]

import numpy as np import pandas as pd from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score metrics = pd.DataFrame([ { "model": "Random Forest", "n_estimators": 300, "max_depth": 5, "min_samples_leaf": 1, "mae": mean_absolute_error(y_valid, pred), "rmse": np.sqrt(mean_squared_error(y_valid, pred)), "r2": r2_score(y_valid, pred) } ]) metrics

model

n_estimators

max_depth

min_samples_leaf

mae

rmse

Random Forest

300

42.191033

52.969084

0.492607

feature

importance

bmi

0.403142

0.250489

0.103107

0.052068

age

0.041128

0.040162

0.039256

0.036932

0.027597

sex

0.006120

feature

importance

sex

0.006120

0.027597

0.036932

0.039256

0.040162

age

0.041128

0.052068

0.103107

0.250489

bmi

0.403142

depth_results = [] for depth in [2, 3, 5, 8, None]: model = RandomForestRegressor( n_estimators=300, max_depth=depth, min_samples_leaf=1, random_state=42, n_jobs=-1 ) model.fit(X_train, y_train) model_pred = model.predict(X_valid) depth_results.append({ "max_depth": str(depth), "mae": mean_absolute_error(y_valid, model_pred), "rmse": np.sqrt(mean_squared_error(y_valid, model_pred)), "r2": r2_score(y_valid, model_pred) }) depth_comparison = pd.DataFrame(depth_results) depth_comparison

max_depth

mae

rmse

43.480285

53.808550

0.476397

41.925885

52.445176

0.502595

42.191033

52.969084

0.492607

42.949263

53.643369

0.479607

None

43.096336

53.925084

0.474127