3. Regression
3.1. Generate data
\(y = 5.3 + 3 X_0 - 2.9 X_1 + \epsilon\), where \(\epsilon \sim \mathcal{N}(0, 1)\)
[1]:
import numpy as np
from numpy.random import normal
from collections import namedtuple
Data = namedtuple('Data', 'X y')
np.random.seed(37)
def get_data(N=10000):
    # two features: x0 ~ N(0, 1), x1 ~ N(3, 1)
    x0 = normal(0, 1, N)
    x1 = normal(3, 1, N)
    X = np.hstack([x0.reshape(-1, 1), x1.reshape(-1, 1)])
    # linear signal plus unit-variance Gaussian noise
    y = 5.3 + (3.0 * x0) - (2.9 * x1) + normal(0, 1, N)
    return Data(X, y)
# training data
T = get_data()
# validation data
V = get_data(N=1000)
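As a quick sanity check (an aside, using only the variables defined above), the shapes and empirical means should match the generating process:
# sanity check: shapes and empirical moments of the simulated data
print(T.X.shape, T.y.shape)   # (10000, 2) (10000,)
print(V.X.shape, V.y.shape)   # (1000, 2) (1000,)
print(T.X.mean(axis=0))       # close to [0, 3]
print(T.y.mean())             # close to 5.3 + 3*0 - 2.9*3 = -3.4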
3.2. Types of regression
3.2.1. Basic regression
[2]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(T.X, T.y)
[2]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
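Since the data were simulated from known coefficients, an illustrative check (not part of the original flow) is to compare the fitted parameters against the true values of 5.3, 3.0 and -2.9:
# true values: intercept 5.3, weights [3.0, -2.9]
print(lr.intercept_)  # ~5.3
print(lr.coef_)       # ~[3.0, -2.9]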
3.2.2. Ridge regression
[3]:
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=.5)
ridge.fit(T.X, T.y)
[3]:
Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
normalize=False, random_state=None, solver='auto', tol=0.001)
3.2.3. Lasso regression
[4]:
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=.5)
lasso.fit(T.X, T.y)
[4]:
Lasso(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=1000,
normalize=False, positive=False, precompute=False, random_state=None,
selection='cyclic', tol=0.0001, warm_start=False)
3.2.4. ElasticNet regression
[5]:
from sklearn.linear_model import ElasticNet
en = ElasticNet(alpha=.5)
en.fit(T.X, T.y)
[5]:
ElasticNet(alpha=0.5, copy_X=True, fit_intercept=True, l1_ratio=0.5,
max_iter=1000, normalize=False, positive=False, precompute=False,
random_state=None, selection='cyclic', tol=0.0001, warm_start=False)
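Ridge penalizes the squared (L2) norm of the weights, Lasso the absolute (L1) norm, and ElasticNet mixes both through l1_ratio. As an illustrative aside, printing the fitted parameters side by side shows the L1-driven shrinkage toward zero, which foreshadows the weaker Lasso and ElasticNet scores in section 3.3:
# illustrative: the L1-penalized models shrink the weights hardest
for m in [ridge, lasso, en]:
    print(type(m).__name__, m.intercept_, m.coef_)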
3.2.5. Bayesian ridge regression
[6]:
from sklearn.linear_model import BayesianRidge
brr = BayesianRidge()
brr.fit(T.X, T.y)
[6]:
BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,
fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=300,
normalize=False, tol=0.001, verbose=False)
3.2.6. Random forest regression
[7]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100, random_state=37)
rf.fit(T.X, T.y)
[7]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=37, verbose=0,
warm_start=False)
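Unlike the linear models, the forest exposes no coefficients, but its feature_importances_ attribute gives a rough view of how much each input contributes. An illustrative check:
# illustrative: both features should carry substantial importance
print(rf.feature_importances_)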
3.2.7. Gradient boosting regression
[8]:
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor(random_state=37)
gbr.fit(T.X, T.y)
[8]:
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
learning_rate=0.1, loss='ls', max_depth=3,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_iter_no_change=None, presort='auto',
random_state=37, subsample=1.0, tol=0.0001,
validation_fraction=0.1, verbose=0, warm_start=False)
3.2.8. AdaBoost regression
[9]:
from sklearn.ensemble import AdaBoostRegressor
abr = AdaBoostRegressor(random_state=37)
abr.fit(T.X, T.y)
[9]:
AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
n_estimators=50, random_state=37)
3.2.9. Bagging regression
[10]:
from sklearn.ensemble import BaggingRegressor
bagr = BaggingRegressor(random_state=37)
bagr.fit(T.X, T.y)
[10]:
BaggingRegressor(base_estimator=None, bootstrap=True, bootstrap_features=False,
max_features=1.0, max_samples=1.0, n_estimators=10,
n_jobs=None, oob_score=False, random_state=37, verbose=0,
warm_start=False)
3.2.10. Multi-layer perceptron regression
[11]:
from sklearn.neural_network import MLPRegressor
mlp = MLPRegressor(random_state=37)
mlp.fit(T.X, T.y)
[11]:
MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
beta_2=0.999, early_stopping=False, epsilon=1e-08,
hidden_layer_sizes=(100,), learning_rate='constant',
learning_rate_init=0.001, max_iter=200, momentum=0.9,
n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
random_state=37, shuffle=True, solver='adam', tol=0.0001,
validation_fraction=0.1, verbose=False, warm_start=False)
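MLPRegressor is sensitive to the scale of its inputs; a common variant (a sketch with the same hyperparameters assumed, not included in the comparison below) standardizes the features first:
# sketch: standardize the features before the MLP
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

mlp_scaled = make_pipeline(StandardScaler(), MLPRegressor(random_state=37))
mlp_scaled.fit(T.X, T.y)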
3.3. Performance
[12]:
models = [lr, ridge, lasso, en, brr, rf, gbr, abr, bagr, mlp]
3.3.1. Explained variance score
[13]:
from sklearn.metrics import explained_variance_score
def get_score(model, X, y_true):
    y_pred = model.predict(X)
    score = explained_variance_score(y_true, y_pred)
    return score, type(model).__name__

def print_scores(scores):
    for score, name in scores:
        print(f'{score:.5f} : {name}')
scores = sorted([get_score(model, V.X, V.y) for model in models], key=lambda tup: tup[0], reverse=True)
print_scores(scores)
0.94408 : BayesianRidge
0.94408 : LinearRegression
0.94408 : Ridge
0.94380 : MLPRegressor
0.94192 : GradientBoostingRegressor
0.93500 : RandomForestRegressor
0.93120 : BaggingRegressor
0.91760 : AdaBoostRegressor
0.91650 : Lasso
0.87551 : ElasticNet
3.3.2. Max error
[14]:
from sklearn.metrics import max_error
def get_score(model, X, y_true):
    y_pred = model.predict(X)
    score = max_error(y_true, y_pred)
    return score, type(model).__name__
scores = sorted([get_score(model, V.X, V.y) for model in models], key=lambda tup: tup[0], reverse=False)
print_scores(scores)
3.03046 : GradientBoostingRegressor
3.19835 : LinearRegression
3.19836 : BayesianRidge
3.19839 : Ridge
3.23994 : MLPRegressor
3.36062 : RandomForestRegressor
3.83427 : BaggingRegressor
4.38630 : AdaBoostRegressor
4.64630 : Lasso
5.57094 : ElasticNet
3.3.3. Mean absolute error
[15]:
from sklearn.metrics import mean_absolute_error
def get_score(model, X, y_true):
    y_pred = model.predict(X)
    score = mean_absolute_error(y_true, y_pred)
    return score, type(model).__name__
scores = sorted([get_score(model, V.X, V.y) for model in models], key=lambda tup: tup[0], reverse=False)
print_scores(scores)
0.80887 : LinearRegression
0.80887 : BayesianRidge
0.80887 : Ridge
0.81840 : MLPRegressor
0.82440 : GradientBoostingRegressor
0.87912 : RandomForestRegressor
0.90288 : BaggingRegressor
0.98418 : AdaBoostRegressor
0.99474 : Lasso
1.21903 : ElasticNet
3.3.4. Mean squared error
[16]:
from sklearn.metrics import mean_squared_error
def get_score(model, X, y_true):
    y_pred = model.predict(X)
    score = mean_squared_error(y_true, y_pred)
    return score, type(model).__name__
scores = sorted([get_score(model, V.X, V.y) for model in models], key=lambda tup: tup[0], reverse=False)
print_scores(scores)
1.04706 : LinearRegression
1.04706 : BayesianRidge
1.04706 : Ridge
1.06718 : MLPRegressor
1.08764 : GradientBoostingRegressor
1.21693 : RandomForestRegressor
1.28816 : BaggingRegressor
1.54414 : AdaBoostRegressor
1.56425 : Lasso
2.33287 : ElasticNet
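Since the noise added in 3.1 has unit variance, a held-out MSE of about 1 is the best one can expect, and the linear models come close to that floor. As an aside (not in the original notebook), the square root of MSE (RMSE) expresses the error back on the scale of y:
# aside: RMSE for the best model, on the original scale of y
rmse = np.sqrt(mean_squared_error(V.y, lr.predict(V.X)))
print(f'{rmse:.5f} : LinearRegression')  # ~1.02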
3.3.5. Median absolute error
[17]:
from sklearn.metrics import median_absolute_error
def get_score(model, X, y_true):
    y_pred = model.predict(X)
    score = median_absolute_error(y_true, y_pred)
    return score, type(model).__name__
scores = sorted([get_score(model, V.X, V.y) for model in models], key=lambda tup: tup[0], reverse=False)
print_scores(scores)
0.65824 : Ridge
0.65827 : BayesianRidge
0.65828 : LinearRegression
0.66430 : GradientBoostingRegressor
0.66623 : MLPRegressor
0.76039 : RandomForestRegressor
0.77582 : BaggingRegressor
0.82732 : AdaBoostRegressor
0.83284 : Lasso
1.01466 : ElasticNet
3.3.6. R-squared
The R-squared values below are nearly identical to the explained variance scores in 3.3.1; the two metrics coincide when the residuals have zero mean, which is approximately the case for all of these models.
[18]:
from sklearn.metrics import r2_score
def get_score(model, X, y_true):
    y_pred = model.predict(X)
    score = r2_score(y_true, y_pred)
    return score, type(model).__name__
scores = sorted([get_score(model, V.X, V.y) for model in models], key=lambda tup: tup[0], reverse=True)
print_scores(scores)
0.94408 : LinearRegression
0.94408 : BayesianRidge
0.94408 : Ridge
0.94300 : MLPRegressor
0.94191 : GradientBoostingRegressor
0.93500 : RandomForestRegressor
0.93120 : BaggingRegressor
0.91753 : AdaBoostRegressor
0.91645 : Lasso
0.87540 : ElasticNet