Correlated Variables Notebook

Author

Prof. Tiffany Tang

Published

May 8, 2026

Show Code
import copy
import os
from os.path import join as oj
import pickle as pkl
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.inspection import permutation_importance
from plotnine import *

import sys
sys.path.append("..")
from python.simulate_data import generate_block_covariance_matrix
from python.utils import get_feature_importances, get_coefficients, fit_shap

np.random.seed(12345)

RESULTS_DIR = oj("..", "results")
# Create results directory if it doesn't exist
if not os.path.exists(RESULTS_DIR):
    os.makedirs(RESULTS_DIR)

# specify models to fit
models = {
    "Linear": LinearRegression(),
    "Ridge": RidgeCV(alphas=np.logspace(-3, 3, 100)),
    "Lasso": LassoCV(),
    "KernelRidge": KernelRidge(kernel="rbf", alpha=0.1),
    "RF": RandomForestRegressor(
        max_features=0.33, min_samples_leaf=5, random_state=42
    ),
    "GBM": GradientBoostingRegressor(
        n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42
    ),
    "MLP": MLPRegressor(
        hidden_layer_sizes=(1000, 500, 100), max_iter=1000, random_state=42
    )
}

# model-specific feature importances (if applicable)
fi_models = {
    "Linear": get_coefficients,
    "Ridge": get_coefficients,
    "Lasso": get_coefficients,
    "RF": get_feature_importances,
    "GBM": get_feature_importances
}

Simulation 1 (Uncorrelated Variables)

In this first simulation, we will generate a covariate matrix \(X\) with \(n=1000\) samples and \(p=100\) independent standard normal (i.e., \(N(0, 1)\)) features. The response variable \(y\) is then generated via:

\[ y = X_1 + X_2 + X_3 + \epsilon \]

where \(\epsilon \stackrel{iid}{\sim} N(0, \sigma^2)\).

Show Code
n = 1000
p = 100
sigma = 2

varnames = [f"Variable {i}" for i in range(1, p+1)]
X = np.random.normal(size=(n, p))
y = X[:, 0] + X[:, 1] + X[:, 2] + np.random.normal(size=n, scale=sigma)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=331
)

Using this data as input, we next fit a variety of models to the data and compare the feature importances estimated by SHAP, permutation feature importance, and model-specific feature importances.

What would we like to see?

  • the first three features have the highest feature importances
  • the first three features should have similar feature importances
Show Code
if os.path.exists(oj(RESULTS_DIR, "simulation_1.pkl")):
    # Read in cached results if they already exist
    results = pkl.load(open(oj(RESULTS_DIR, "simulation_1.pkl"), "rb"))
else:
    # Run the simulation if cached results do not exist
    train_shap_df = pd.DataFrame({"feature": varnames})
    test_shap_df = pd.DataFrame({"feature": varnames})
    train_perm_df = pd.DataFrame({"feature": varnames})
    test_perm_df = pd.DataFrame({"feature": varnames})
    model_fi_df = pd.DataFrame({"feature": varnames})
    for name, model in models.items():
        model = copy.deepcopy(model)

        # fit model
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        err = mean_squared_error(y_test, preds)
        print(f"{name} MSE: {err:.4f}")

        # estimate SHAP values
        train_shap_df[name] = fit_shap(model, X_train, X_train)
        test_shap_df[name] = fit_shap(model, X_train, X_test)

        # estimate permutation feature importances
        train_perm_df[name] = permutation_importance(
            model, X_train, y_train, n_repeats=30, random_state=0
        )['importances_mean']
        test_perm_df[name] = permutation_importance(
            model, X_test, y_test, n_repeats=30, random_state=0
        )['importances_mean']

        # estimate model-specific feature importances
        if name in fi_models.keys():
            model_fi_df[name] = fi_models[name](model)

    # save results
    results = {
        "train_shap_df": train_shap_df,
        "test_shap_df": test_shap_df,
        "train_perm_df": train_perm_df,
        "test_perm_df": test_perm_df,
        "model_fi_df": model_fi_df
    }
    pkl.dump(results, open(oj(RESULTS_DIR, "simulation_1.pkl"), "wb"))

What do we see?

Resulting feature importances agree with what we hoped to see:

  • the first three features have the highest feature importances
  • the first three features should have similar feature importances

Simulation 2 (Correlated Variables)

In this second simulation, we will generate a covariate matrix \(X\) with \(n=1000\) samples and \(p=100\) correlated normal (i.e., \(X \sim N(0, \Sigma)\)) features, where \(\Sigma\) is a block diagonal covariance matrix with the following structure:

Show Code
cov_mat = generate_block_covariance_matrix([1, 2, 20, 20, 57], [0, 0.9, 0.9, 0.9, 0])
plt.imshow(cov_mat, cmap="Blues")
plt.colorbar()
plt.title("Block Diagonal Covariance Matrix")
plt.xlabel("Feature Index")
plt.ylabel("Feature Index")
plt.show()

The response variable \(y\) is then generated via:

\[ y = X_1 + X_2 + X_4 + \epsilon \]

where \(\epsilon \stackrel{iid}{\sim} N(0, \sigma^2)\).

In this correlated simulation, there are several blocks of features:

  • Block 1 (signal, uncorrelated): contains feature 1, which is a signal feature and uncorrelated with all other features.
  • Block 2 (signal, correlated, small block): contains features 2-3, which are highly correlated with each other but uncorrelated with all other features, and only feature 2 is a signal feature.
  • Block 3 (signal, correlated, large block): contains features 4-23, which are highly correlated with each other but uncorrelated with all other features, and only feature 4 is a signal feature.
  • Block 4 (noise, correlated): contains features 24-43, which are highly correlated with each other but uncorrelated with all other features, and none of these features are signal features.
  • Block 5 (noise, uncorrelated): contains features 44-100, which are uncorrelated with all other features and none of these features are signal features.
Show Code
n = 1000
p = 100
sigma = 2

varnames = [f"Variable {i}" for i in range(1, p+1)]
X = np.random.multivariate_normal(mean=np.zeros(p), cov=cov_mat, size=n)
y = X[:, 0] + X[:, 1] + X[:, 3] + np.random.normal(size=n, scale=sigma)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=331
)

Using this data as input, we next fit a variety of models to the data and compare the feature importances estimated by SHAP, permutation feature importance, and model-specific feature importances.

What would we like to see?

  • Features 1, 2, and 4 have the highest feature importances
  • Features 1, 2, and 4 have similar feature importances
  • All other features have equally low feature importances
Show Code
if os.path.exists(oj(RESULTS_DIR, "simulation_2.pkl")):
    # Read in cached results if they already exist
    results = pkl.load(open(oj(RESULTS_DIR, "simulation_2.pkl"), "rb"))
else:
    # Run the simulation if cached results do not exist
    train_shap_df = pd.DataFrame({"feature": varnames})
    test_shap_df = pd.DataFrame({"feature": varnames})
    train_perm_df = pd.DataFrame({"feature": varnames})
    test_perm_df = pd.DataFrame({"feature": varnames})
    model_fi_df = pd.DataFrame({"feature": varnames})
    for name, model in models.items():
        model = copy.deepcopy(model)

        # fit model
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        err = mean_squared_error(y_test, preds)
        print(f"{name} MSE: {err:.4f}")

        # estimate SHAP values
        train_shap_df[name] = fit_shap(model, X_train, X_train)
        test_shap_df[name] = fit_shap(model, X_train, X_test)

        # estimate permutation feature importances
        train_perm_df[name] = permutation_importance(
            model, X_train, y_train, n_repeats=30, random_state=0
        )['importances_mean']
        test_perm_df[name] = permutation_importance(
            model, X_test, y_test, n_repeats=30, random_state=0
        )['importances_mean']

        # estimate model-specific feature importances
        if name in fi_models.keys():
            model_fi_df[name] = fi_models[name](model)

    # save results
    results = {
        "train_shap_df": train_shap_df,
        "test_shap_df": test_shap_df,
        "train_perm_df": train_perm_df,
        "test_perm_df": test_perm_df,
        "model_fi_df": model_fi_df
    }
    pkl.dump(results, open(oj(RESULTS_DIR, "simulation_2.pkl"), "wb"))

What do we see?

Correlated variables complicate the interpretation of feature importances. In this simulation, we see that:

  • Despite the “true” importance of features 1, 2, and 4 being equal, the importance of features 2 and 4 are sometimes artificially diluted (i.e., lower than the importance of feature 1) due to correlations with other features. In other words, the correlated features are “stealing” importance from the signal features.