Correlated Variables Notebook

Author

Prof. Tiffany Tang

Published

May 8, 2026

Show Code

import copy
import os
from os.path import join as oj
import pickle as pkl
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.inspection import permutation_importance
from plotnine import *

import sys
sys.path.append("..")
from python.simulate_data import generate_block_covariance_matrix
from python.utils import get_feature_importances, get_coefficients, fit_shap

np.random.seed(12345)

RESULTS_DIR = oj("..", "results")
# Create results directory if it doesn't exist
if not os.path.exists(RESULTS_DIR):
    os.makedirs(RESULTS_DIR)

# specify models to fit
models = {
    "Linear": LinearRegression(),
    "Ridge": RidgeCV(alphas=np.logspace(-3, 3, 100)),
    "Lasso": LassoCV(),
    "KernelRidge": KernelRidge(kernel="rbf", alpha=0.1),
    "RF": RandomForestRegressor(
        max_features=0.33, min_samples_leaf=5, random_state=42
    ),
    "GBM": GradientBoostingRegressor(
        n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42
    ),
    "MLP": MLPRegressor(
        hidden_layer_sizes=(1000, 500, 100), max_iter=1000, random_state=42
    )
}

# model-specific feature importances (if applicable)
fi_models = {
    "Linear": get_coefficients,
    "Ridge": get_coefficients,
    "Lasso": get_coefficients,
    "RF": get_feature_importances,
    "GBM": get_feature_importances
}

Simulation 1 (Uncorrelated Variables)

In this first simulation, we will generate a covariate matrix $X$ with $n=1000$ samples and $p=100$ independent standard normal (i.e., $N(0, 1)$) features. The response variable $y$ is then generated via:

\[ y = X_1 + X_2 + X_3 + \epsilon \]

where $\epsilon \stackrel{iid}{\sim} N(0, \sigma^2)$.

Show Code

n = 1000
p = 100
sigma = 2

varnames = [f"Variable {i}" for i in range(1, p+1)]
X = np.random.normal(size=(n, p))
y = X[:, 0] + X[:, 1] + X[:, 2] + np.random.normal(size=n, scale=sigma)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=331
)

Using this data as input, we next fit a variety of models to the data and compare the feature importances estimated by SHAP, permutation feature importance, and model-specific feature importances.

What would we like to see?

the first three features have the highest feature importances
the first three features should have similar feature importances

Show Code

if os.path.exists(oj(RESULTS_DIR, "simulation_1.pkl")):
    # Read in cached results if they already exist
    results = pkl.load(open(oj(RESULTS_DIR, "simulation_1.pkl"), "rb"))
else:
    # Run the simulation if cached results do not exist
    train_shap_df = pd.DataFrame({"feature": varnames})
    test_shap_df = pd.DataFrame({"feature": varnames})
    train_perm_df = pd.DataFrame({"feature": varnames})
    test_perm_df = pd.DataFrame({"feature": varnames})
    model_fi_df = pd.DataFrame({"feature": varnames})
    for name, model in models.items():
        model = copy.deepcopy(model)

        # fit model
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        err = mean_squared_error(y_test, preds)
        print(f"{name} MSE: {err:.4f}")

        # estimate SHAP values
        train_shap_df[name] = fit_shap(model, X_train, X_train)
        test_shap_df[name] = fit_shap(model, X_train, X_test)

        # estimate permutation feature importances
        train_perm_df[name] = permutation_importance(
            model, X_train, y_train, n_repeats=30, random_state=0
        )['importances_mean']
        test_perm_df[name] = permutation_importance(
            model, X_test, y_test, n_repeats=30, random_state=0
        )['importances_mean']

        # estimate model-specific feature importances
        if name in fi_models.keys():
            model_fi_df[name] = fi_models[name](model)

    # save results
    results = {
        "train_shap_df": train_shap_df,
        "test_shap_df": test_shap_df,
        "train_perm_df": train_perm_df,
        "test_perm_df": test_perm_df,
        "model_fi_df": model_fi_df
    }
    pkl.dump(results, open(oj(RESULTS_DIR, "simulation_1.pkl"), "wb"))

What do we see?

Resulting feature importances agree with what we hoped to see:

the first three features have the highest feature importances
the first three features should have similar feature importances

Simulation 2 (Correlated Variables)

In this second simulation, we will generate a covariate matrix $X$ with $n=1000$ samples and $p=100$ correlated normal (i.e., $X \sim N(0, \Sigma)$) features, where $\Sigma$ is a block diagonal covariance matrix with the following structure:

Show Code

cov_mat = generate_block_covariance_matrix([1, 2, 20, 20, 57], [0, 0.9, 0.9, 0.9, 0])
plt.imshow(cov_mat, cmap="Blues")
plt.colorbar()
plt.title("Block Diagonal Covariance Matrix")
plt.xlabel("Feature Index")
plt.ylabel("Feature Index")
plt.show()

The response variable $y$ is then generated via:

\[ y = X_1 + X_2 + X_4 + \epsilon \]

where $\epsilon \stackrel{iid}{\sim} N(0, \sigma^2)$.

In this correlated simulation, there are several blocks of features:

Block 1 (signal, uncorrelated): contains feature 1, which is a signal feature and uncorrelated with all other features.
Block 2 (signal, correlated, small block): contains features 2-3, which are highly correlated with each other but uncorrelated with all other features, and only feature 2 is a signal feature.
Block 3 (signal, correlated, large block): contains features 4-23, which are highly correlated with each other but uncorrelated with all other features, and only feature 4 is a signal feature.
Block 4 (noise, correlated): contains features 24-43, which are highly correlated with each other but uncorrelated with all other features, and none of these features are signal features.
Block 5 (noise, uncorrelated): contains features 44-100, which are uncorrelated with all other features and none of these features are signal features.

Show Code

n = 1000
p = 100
sigma = 2

varnames = [f"Variable {i}" for i in range(1, p+1)]
X = np.random.multivariate_normal(mean=np.zeros(p), cov=cov_mat, size=n)
y = X[:, 0] + X[:, 1] + X[:, 3] + np.random.normal(size=n, scale=sigma)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=331
)

Using this data as input, we next fit a variety of models to the data and compare the feature importances estimated by SHAP, permutation feature importance, and model-specific feature importances.

What would we like to see?

Features 1, 2, and 4 have the highest feature importances
Features 1, 2, and 4 have similar feature importances
All other features have equally low feature importances

Show Code

if os.path.exists(oj(RESULTS_DIR, "simulation_2.pkl")):
    # Read in cached results if they already exist
    results = pkl.load(open(oj(RESULTS_DIR, "simulation_2.pkl"), "rb"))
else:
    # Run the simulation if cached results do not exist
    train_shap_df = pd.DataFrame({"feature": varnames})
    test_shap_df = pd.DataFrame({"feature": varnames})
    train_perm_df = pd.DataFrame({"feature": varnames})
    test_perm_df = pd.DataFrame({"feature": varnames})
    model_fi_df = pd.DataFrame({"feature": varnames})
    for name, model in models.items():
        model = copy.deepcopy(model)

        # fit model
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        err = mean_squared_error(y_test, preds)
        print(f"{name} MSE: {err:.4f}")

        # estimate SHAP values
        train_shap_df[name] = fit_shap(model, X_train, X_train)
        test_shap_df[name] = fit_shap(model, X_train, X_test)

        # estimate permutation feature importances
        train_perm_df[name] = permutation_importance(
            model, X_train, y_train, n_repeats=30, random_state=0
        )['importances_mean']
        test_perm_df[name] = permutation_importance(
            model, X_test, y_test, n_repeats=30, random_state=0
        )['importances_mean']

        # estimate model-specific feature importances
        if name in fi_models.keys():
            model_fi_df[name] = fi_models[name](model)

    # save results
    results = {
        "train_shap_df": train_shap_df,
        "test_shap_df": test_shap_df,
        "train_perm_df": train_perm_df,
        "test_perm_df": test_perm_df,
        "model_fi_df": model_fi_df
    }
    pkl.dump(results, open(oj(RESULTS_DIR, "simulation_2.pkl"), "wb"))

What do we see?

Correlated variables complicate the interpretation of feature importances. In this simulation, we see that:

Despite the “true” importance of features 1, 2, and 4 being equal, the importance of features 2 and 4 are sometimes artificially diluted (i.e., lower than the importance of feature 1) due to correlations with other features. In other words, the correlated features are “stealing” importance from the signal features.

--- title: "Correlated Variables Notebook" author: "Prof. Tiffany Tang" date: today format: html: code-fold: true code-summary: "Show Code" code-tools: true theme: sandstone lightbox: true # theme: yeti embed-resources: true jupyter: python3 toc: true execute: warning: false message: false --- ```{=html} <style> .nav-pills .nav-link.active { color: black; } .tab-content .nav-pills { border: none; } .nav-pills>ul { border: none; } .nav-tabs .nav-item { padding-right: 0.1em; } </style> ``` ```{python} import copy import os from os.path import join as oj import pickle as pkl import numpy as np import pandas as pd import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV from sklearn.kernel_ridge import KernelRidge from sklearn.neural_network import MLPRegressor from sklearn.metrics import mean_squared_error from sklearn.inspection import permutation_importance from plotnine import * import sys sys.path.append("..") from python.simulate_data import generate_block_covariance_matrix from python.utils import get_feature_importances, get_coefficients, fit_shap np.random.seed(12345) RESULTS_DIR = oj("..", "results") # Create results directory if it doesn't exist if not os.path.exists(RESULTS_DIR): os.makedirs(RESULTS_DIR) # specify models to fit models = { "Linear": LinearRegression(), "Ridge": RidgeCV(alphas=np.logspace(-3, 3, 100)), "Lasso": LassoCV(), "KernelRidge": KernelRidge(kernel="rbf", alpha=0.1), "RF": RandomForestRegressor( max_features=0.33, min_samples_leaf=5, random_state=42 ), "GBM": GradientBoostingRegressor( n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42 ), "MLP": MLPRegressor( hidden_layer_sizes=(1000, 500, 100), max_iter=1000, random_state=42 ) } # model-specific feature importances (if applicable) fi_models = { "Linear": get_coefficients, "Ridge": get_coefficients, "Lasso": get_coefficients, "RF": get_feature_importances, "GBM": get_feature_importances } ``` # Simulation 1 (Uncorrelated Variables) In this first simulation, we will generate a covariate matrix $X$ with $n=1000$ samples and $p=100$ independent standard normal (i.e., $N(0, 1)$) features. The response variable $y$ is then generated via: $$ y = X_1 + X_2 + X_3 + \epsilon $$ where $\epsilon \stackrel{iid}{\sim} N(0, \sigma^2)$. ```{python} n = 1000 p = 100 sigma = 2 varnames = [f"Variable {i}" for i in range(1, p+1)] X = np.random.normal(size=(n, p)) y = X[:, 0] + X[:, 1] + X[:, 2] + np.random.normal(size=n, scale=sigma) X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=331 ) ``` Using this data as input, we next fit a variety of models to the data and compare the feature importances estimated by SHAP, permutation feature importance, and model-specific feature importances. **What would we like to see?** - the first three features have the highest feature importances - the first three features should have similar feature importances ```{python} if os.path.exists(oj(RESULTS_DIR, "simulation_1.pkl")): # Read in cached results if they already exist results = pkl.load(open(oj(RESULTS_DIR, "simulation_1.pkl"), "rb")) else: # Run the simulation if cached results do not exist train_shap_df = pd.DataFrame({"feature": varnames}) test_shap_df = pd.DataFrame({"feature": varnames}) train_perm_df = pd.DataFrame({"feature": varnames}) test_perm_df = pd.DataFrame({"feature": varnames}) model_fi_df = pd.DataFrame({"feature": varnames}) for name, model in models.items(): model = copy.deepcopy(model) # fit model model.fit(X_train, y_train) preds = model.predict(X_test) err = mean_squared_error(y_test, preds) print(f"{name} MSE: {err:.4f}") # estimate SHAP values train_shap_df[name] = fit_shap(model, X_train, X_train) test_shap_df[name] = fit_shap(model, X_train, X_test) # estimate permutation feature importances train_perm_df[name] = permutation_importance( model, X_train, y_train, n_repeats=30, random_state=0 )['importances_mean'] test_perm_df[name] = permutation_importance( model, X_test, y_test, n_repeats=30, random_state=0 )['importances_mean'] # estimate model-specific feature importances if name in fi_models.keys(): model_fi_df[name] = fi_models[name](model) # save results results = { "train_shap_df": train_shap_df, "test_shap_df": test_shap_df, "train_perm_df": train_perm_df, "test_perm_df": test_perm_df, "model_fi_df": model_fi_df } pkl.dump(results, open(oj(RESULTS_DIR, "simulation_1.pkl"), "wb")) ``` **What do we see?** Resulting feature importances agree with what we hoped to see: - the first three features have the highest feature importances - the first three features should have similar feature importances ```{python} #| panel: tabset #| output: asis var_types = { "Signal": ["Variable " + str(i) for i in range(1, 4)] } fi_dict = { "Model-specific": results["model_fi_df"], "SHAP (test)": results["test_shap_df"], "SHAP (train)": results["train_shap_df"], "SHAP": pd.concat([ results["train_shap_df"].assign(split_mode="train"), results["test_shap_df"].assign(split_mode="test") ], axis=0), "Permutation (test)": results["test_perm_df"], "Permutation (train)": results["train_perm_df"], "Permutation": pd.concat([ results["train_perm_df"].assign(split_mode="train"), results["test_perm_df"].assign(split_mode="test") ], axis=0) } for fi_name, fi_df in fi_dict.items(): print(f"\n\n## {fi_name}\n\n") if fi_name in ["SHAP", "Permutation"]: id_vars = ["feature", "split_mode"] fill_var = "split_mode" fill_name = "Data Split" else: id_vars = ["feature"] fill_var = "var_type" fill_name = "Feature Type" plt_df = pd.melt( fi_df, id_vars=id_vars, var_name="model", value_name="importance" ) plt_df["feature"] = pd.Categorical( plt_df["feature"], categories=varnames, ordered=True ) plt_df["model"] = pd.Categorical( plt_df["model"], categories=list(models.keys()), ordered=True ) plt_df["var_type"] = plt_df["feature"].apply( lambda x: "Signal" if x in var_types["Signal"] else "Noise" ) ggplt = ( ggplot(plt_df) + aes(x="feature", y="importance", fill=fill_var) + geom_bar(stat="identity", position="dodge") + facet_grid("model ~ .", scales="free_y") + labs( title=f"{fi_name} Feature Importances", x="Feature", y="Importance", fill=fill_name ) + theme_minimal() + theme( axis_text_x=element_text(rotation=90, size=5), figure_size=(8, 8) ) ) ggplt.show() ``` # Simulation 2 (Correlated Variables) In this second simulation, we will generate a covariate matrix $X$ with $n=1000$ samples and $p=100$ correlated normal (i.e., $X \sim N(0, \Sigma)$) features, where $\Sigma$ is a block diagonal covariance matrix with the following structure: ```{python} cov_mat = generate_block_covariance_matrix([1, 2, 20, 20, 57], [0, 0.9, 0.9, 0.9, 0]) plt.imshow(cov_mat, cmap="Blues") plt.colorbar() plt.title("Block Diagonal Covariance Matrix") plt.xlabel("Feature Index") plt.ylabel("Feature Index") plt.show() ``` The response variable $y$ is then generated via: $$ y = X_1 + X_2 + X_4 + \epsilon $$ where $\epsilon \stackrel{iid}{\sim} N(0, \sigma^2)$. In this correlated simulation, there are several blocks of features: - **Block 1 (signal, uncorrelated)**: contains feature 1, which is a signal feature and uncorrelated with all other features. - **Block 2 (signal, correlated, small block)**: contains features 2-3, which are highly correlated with each other but uncorrelated with all other features, and only feature 2 is a signal feature. - **Block 3 (signal, correlated, large block)**: contains features 4-23, which are highly correlated with each other but uncorrelated with all other features, and only feature 4 is a signal feature. - **Block 4 (noise, correlated)**: contains features 24-43, which are highly correlated with each other but uncorrelated with all other features, and none of these features are signal features. - **Block 5 (noise, uncorrelated)**: contains features 44-100, which are uncorrelated with all other features and none of these features are signal features. ```{python} n = 1000 p = 100 sigma = 2 varnames = [f"Variable {i}" for i in range(1, p+1)] X = np.random.multivariate_normal(mean=np.zeros(p), cov=cov_mat, size=n) y = X[:, 0] + X[:, 1] + X[:, 3] + np.random.normal(size=n, scale=sigma) X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=331 ) ``` Using this data as input, we next fit a variety of models to the data and compare the feature importances estimated by SHAP, permutation feature importance, and model-specific feature importances. **What would we like to see?** - Features 1, 2, and 4 have the highest feature importances - Features 1, 2, and 4 have similar feature importances - All other features have equally low feature importances ```{python} if os.path.exists(oj(RESULTS_DIR, "simulation_2.pkl")): # Read in cached results if they already exist results = pkl.load(open(oj(RESULTS_DIR, "simulation_2.pkl"), "rb")) else: # Run the simulation if cached results do not exist train_shap_df = pd.DataFrame({"feature": varnames}) test_shap_df = pd.DataFrame({"feature": varnames}) train_perm_df = pd.DataFrame({"feature": varnames}) test_perm_df = pd.DataFrame({"feature": varnames}) model_fi_df = pd.DataFrame({"feature": varnames}) for name, model in models.items(): model = copy.deepcopy(model) # fit model model.fit(X_train, y_train) preds = model.predict(X_test) err = mean_squared_error(y_test, preds) print(f"{name} MSE: {err:.4f}") # estimate SHAP values train_shap_df[name] = fit_shap(model, X_train, X_train) test_shap_df[name] = fit_shap(model, X_train, X_test) # estimate permutation feature importances train_perm_df[name] = permutation_importance( model, X_train, y_train, n_repeats=30, random_state=0 )['importances_mean'] test_perm_df[name] = permutation_importance( model, X_test, y_test, n_repeats=30, random_state=0 )['importances_mean'] # estimate model-specific feature importances if name in fi_models.keys(): model_fi_df[name] = fi_models[name](model) # save results results = { "train_shap_df": train_shap_df, "test_shap_df": test_shap_df, "train_perm_df": train_perm_df, "test_perm_df": test_perm_df, "model_fi_df": model_fi_df } pkl.dump(results, open(oj(RESULTS_DIR, "simulation_2.pkl"), "wb")) ``` **What do we see?** Correlated variables complicate the interpretation of feature importances. In this simulation, we see that: - Despite the "true" importance of features 1, 2, and 4 being equal, the importance of features 2 and 4 are sometimes artificially diluted (i.e., lower than the importance of feature 1) due to correlations with other features. In other words, the correlated features are "stealing" importance from the signal features. ```{python} #| panel: tabset #| output: asis var_types = { "Uncorrelated Signal": ["Variable 1"], "Correlated Signal (small)": ["Variable 2", "Variable 3"], "Correlated Signal (large)": ["Variable " + str(i) for i in range(4, 24)], "Correlated Noise": ["Variable " + str(i) for i in range(24, 44)], "Uncorrelated Noise": ["Variable " + str(i) for i in range(44, 101)] } fi_dict = { "Model-specific": results["model_fi_df"], "SHAP (test)": results["test_shap_df"], "SHAP (train)": results["train_shap_df"], "SHAP": pd.concat([ results["train_shap_df"].assign(split_mode="train"), results["test_shap_df"].assign(split_mode="test") ], axis=0), "Permutation (test)": results["test_perm_df"], "Permutation (train)": results["train_perm_df"], "Permutation": pd.concat([ results["train_perm_df"].assign(split_mode="train"), results["test_perm_df"].assign(split_mode="test") ], axis=0) } for fi_name, fi_df in fi_dict.items(): print(f"\n\n## {fi_name}\n\n") if fi_name in ["SHAP", "Permutation"]: id_vars = ["feature", "split_mode"] fill_var = "split_mode" fill_name = "Data Split" else: id_vars = ["feature"] fill_var = "var_type" fill_name = "Feature Type" plt_df = pd.melt( fi_df, id_vars=id_vars, var_name="model", value_name="importance" ) plt_df["feature"] = pd.Categorical( plt_df["feature"], categories=varnames, ordered=True ) plt_df["model"] = pd.Categorical( plt_df["model"], categories=list(models.keys()), ordered=True ) plt_df["var_type"] = plt_df["feature"].apply( lambda x: "Uncorrelated Signal" if x in var_types["Uncorrelated Signal"] else ("Correlated Signal (small)" if x in var_types["Correlated Signal (small)"] else ("Correlated Signal (large)" if x in var_types["Correlated Signal (large)"] else ("Correlated Noise" if x in var_types["Correlated Noise"] else ("Uncorrelated Noise" if x in var_types["Uncorrelated Noise"] else "Noise")))) ) ggplt = ( ggplot(plt_df) + aes(x="feature", y="importance", fill=fill_var) + geom_bar(stat="identity", position="dodge") + facet_grid("model ~ .", scales="free_y") + labs( title=f"{fi_name} Feature Importances", x="Feature", y="Importance", fill=fill_name ) + theme_minimal() + theme( axis_text_x=element_text(rotation=90, size=5), figure_size=(8, 8) ) ) ggplt.show() ```