Trees purportedly have the ability to estimate effects (nonlinear, quadratic, etc.) not explicitly stated in the design matrix. I’ve regurgitated this assertion in several interviews. I should probably see if it’s true.
Here are the libraries I’ll use, minus marginaleffects, which would have made this process a lot easier.
import numpy as np
import pandas as pd
import plotly.express as px
import marginaleffects as me
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')
Here’s a function to simulate random normal data with various means & standard deviations. I love goofy list/dict comprehensions, so I had to throw one in.
np.random.seed(0)
f = lambda mu, sig, n: mu + sig * np.random.normal(size=n)

def sim_data(
    n_samples: int = 10_000,
    n_features: int = 3,
    means: list = [-5, 0, 5],
    stdevs: list = [1, 1, 1]
) -> pd.DataFrame:
    # having some fun with python folks
    d = pd.DataFrame({
        f'X{i+1}': f(mu=m, sig=s, n=n)
        for i, (m, s, n)
        in enumerate(zip(means, stdevs, [n_samples]*len(means)))
    })
    return d
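A quick sanity check, my addition here: the simulated columns should land near the requested means & standard deviations.
# spot-check (not part of the original walkthrough): means should be ~[-5, 0, 5],
# stdevs ~[1, 1, 1]. Re-seed afterwards so the extra draw doesn't shift the data
# simulated below.
print(sim_data().describe().loc[['mean', 'std']].round(2))
np.random.seed(0)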
Linear
I’ll start simple with a linear effect: create an outcome variable from the simulated data & plot the raw data of the effect I’d like to measure.
d = sim_data()
features = ['X1','X2','X3']
d['Y'] = 10 + d['X1']*-2 + d['X2']*1 + d['X3']*2 + f(0, 1, d.shape[0])
px.scatter(d, x='X1', y='Y', title='Simulated X & derived Y').show()
Each model seems to recover the effect fairly well. I am holding the other variables, X2 & X3, at their average levels.
# damn I really need marginaleffects
mod = LinearRegression().fit(d[features], d['Y'])
gb = GradientBoostingRegressor(n_estimators=200).fit(d[features], d['Y'])
rf = RandomForestRegressor(n_estimators=200).fit(d[features], d['Y'])

newdata = d[features].assign(X2 = lambda df: df['X2'].mean(), X3 = lambda df: df['X3'].mean())
l_preds = mod.predict(newdata); g_preds = gb.predict(newdata); rf_preds = rf.predict(newdata)
newdata['Linear Regression'] = l_preds; newdata['Gradient Boosted Tree'] = g_preds; newdata['Random Forest'] = rf_preds

fig = px.scatter(
    newdata,
    x='X1',
    y=['Linear Regression','Gradient Boosted Tree','Random Forest'],
    title='Fitted Values @ X1'
)
fig.update_layout(yaxis_title="Fitted Value")
fig.show()
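Since I keep pining for marginaleffects, here is a hedged sketch of what it might buy you. I believe the Python port exposes avg_slopes for statsmodels formula fits, but treat the exact call as an assumption I haven’t verified, not gospel.
# sketch, assuming me.avg_slopes works on a statsmodels fit: it should report
# average marginal effects directly, skipping the manual newdata construction
# above. The slope on X1 ought to come back around -2.
fit = smf.ols("Y ~ X1 + X2 + X3", data = d).fit()
print(me.avg_slopes(fit))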
Quadratic
How about a quadratic effect? The curvature is there in the raw data, but I don’t think it is overwhelmingly obvious. This time I fit two linear regressions, one that models the true effect & one that does not, plus a gradient boosted tree.
d['Y'] = 10 + d['X1']*-10 + (d['X2']**2)*2 + d['X3']*10 + f(0, 1, d.shape[0])
px.scatter(d, x='X2', y='Y', title='Quadratic Effect: Simulated X & derived Y').show()
As expected, the linear regression that ignores the quadratic effect misses it entirely. The gradient boosted tree recovers the effect! Nice.
lr_unmodeled = smf.ols("Y ~ X1 + X2 + X3", data = d).fit()
lr_modeled = smf.ols("Y ~ X1 + I(X2**2) + X3", data = d).fit()
gb = GradientBoostingRegressor(n_estimators=100).fit(d[features], d['Y'])

newdata = d[features].assign(X1 = lambda df: df['X1'].mean(), X3 = lambda df: df['X3'].mean())
lr_un_preds = lr_unmodeled.predict(newdata); lr_m_preds = lr_modeled.predict(newdata); g_preds = gb.predict(newdata)
newdata['Linear Regression (not-modeled)'] = lr_un_preds; newdata['Linear Regression (modeled)'] = lr_m_preds; newdata['Gradient Boosted Tree'] = g_preds

fig = px.scatter(
    newdata,
    x='X2',
    y=['Linear Regression (not-modeled)','Linear Regression (modeled)','Gradient Boosted Tree'],
    title='Fitted Values @ X2'
)
fig.update_layout(yaxis_title="Fitted Value")
fig.show()
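To put a rough number on “recovers the effect”, a check of my own: compare each fitted curve against the true noiseless surface at these grid points. The not-modeled LR should be far off; the modeled LR & the tree should be close.
# my addition: mean absolute error vs the true partial relationship, with X1 & X3
# already held at their means inside newdata
true_curve = 10 + newdata['X1']*-10 + (newdata['X2']**2)*2 + newdata['X3']*10
for col in ['Linear Regression (not-modeled)','Linear Regression (modeled)','Gradient Boosted Tree']:
    print(col, round((newdata[col] - true_curve).abs().mean(), 2))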
Non-Linear
What about a nonlinear effect? Same process as before. Our effect of interest is better hidden this time.
d['Y'] = 10 + d['X1']*-20 + 2*np.exp(0.90 * d['X2']) + d['X3']*10 + f(0, 1, d.shape[0])
px.scatter(d, x='X2', y='Y', title='Non-Linear Effect: Simulated X & derived Y').show()
Again the gradient boosted machine recovers the effect. Nice again.
lr_unmodeled = smf.ols("Y ~ X1 + X2 + X3", data = d).fit()
lr_modeled = smf.ols("Y ~ X1 + I(2*np.exp(0.90 * X2)) + X3", data = d).fit()  # matches the true data-generating rate
gb = GradientBoostingRegressor(n_estimators=100).fit(d[features], d['Y'])

newdata = d[features].assign(X1 = lambda df: df['X1'].mean(), X3 = lambda df: df['X3'].mean())
lr_un_preds = lr_unmodeled.predict(newdata); lr_m_preds = lr_modeled.predict(newdata); g_preds = gb.predict(newdata)
newdata['Linear Regression (not-modeled)'] = lr_un_preds; newdata['Linear Regression (modeled)'] = lr_m_preds; newdata['Gradient Boosted Tree'] = g_preds

fig = px.scatter(
    newdata,
    x='X2',
    y=['Linear Regression (not-modeled)','Linear Regression (modeled)','Gradient Boosted Tree'],
    title='Fitted Values @ X2'
)
fig.update_layout(yaxis_title="Fitted Value")
fig.show()
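An aside: sklearn ships a partial dependence helper that gives a similar view of what the tree learned without hand-building newdata. A minimal sketch (it renders with matplotlib rather than plotly):
# my addition: average the tree's predictions over the data while sweeping X2;
# the curve should trace the same exponential shape as the plot above
from sklearn.inspection import PartialDependenceDisplay
import matplotlib.pyplot as plt

PartialDependenceDisplay.from_estimator(gb, d[features], features=['X2'])
plt.show()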
Interaction
What about an unstated interaction? I’m not sure I set this one up correctly, but the fitted values from the linear regression with the interaction modeled & from the gradient boosted tree track each other more closely than the not-modeled LR does.
d['X1_X3'] = d['X1']*d['X3']
d['Y'] = 10 + d['X1']*-10 + d['X2']*50 + d['X3']*10 + (d['X1_X3'])*5 + f(0, 1, d.shape[0])
px.scatter(d, x='X1_X3', y='Y', title='Interaction Effect: Simulated X & derived Y').show()
lr_unmodeled = smf.ols("Y ~ X1 + X2 + X3", data = d).fit()
lr_modeled = smf.ols("Y ~ X1 + X2 + X3 + X1:X3", data = d).fit()
gb = GradientBoostingRegressor(n_estimators=100).fit(d[features], d['Y'])

newdata = d[features].assign(X2 = lambda df: df['X2'].mean())
lr_un_preds = lr_unmodeled.predict(newdata); lr_m_preds = lr_modeled.predict(newdata); g_preds = gb.predict(newdata)
newdata['Linear Regression (not-modeled)'] = lr_un_preds; newdata['Linear Regression (modeled)'] = lr_m_preds; newdata['Gradient Boosted Tree'] = g_preds

fig = px.scatter(
    newdata.assign(X1_X3 = lambda df: df['X1'] * df['X3']),
    x='X1_X3',
    y=['Linear Regression (not-modeled)','Linear Regression (modeled)','Gradient Boosted Tree'],
    title='Fitted Values @ X1:X3'
)
fig.update_layout(yaxis_title="Fitted Value")
fig.show()
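One more check of my own: if the tree really absorbed the 5*X1*X3 term, a finite-difference estimate of the cross-partial of its prediction surface with respect to X1 & X3 should average out somewhere near 5. Don’t expect it exactly; piecewise-constant ensembles make for noisy derivatives.
# my addition: finite-difference cross-partial from the GBT surface. The truth is 5,
# so the averaged estimate should land in that ballpark.
h = 0.5
def shift(df, dx1=0.0, dx3=0.0):
    return df.assign(X1 = df['X1'] + dx1, X3 = df['X3'] + dx3)

base = d[features]
cross = (gb.predict(shift(base, h, h)) - gb.predict(shift(base, h, 0))
         - gb.predict(shift(base, 0, h)) + gb.predict(shift(base, 0, 0))) / h**2
print(round(cross.mean(), 2), '(truth: 5)')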