# 3. Regression

## 3.1. Generate data

$$y = 5.3 + 3 X_0 - 2.9 X_1$$

[1]:

import numpy as np
from numpy.random import normal
from collections import namedtuple

# Lightweight container pairing a feature matrix X with a target vector y.
Data = namedtuple('Data', 'X y')

np.random.seed(37)

def get_data(N=10000):
    """Sample N points from y = 5.3 + 3*x0 - 2.9*x1 + N(0, 1) noise.

    x0 ~ N(0, 1), x1 ~ N(3, 1).

    :param N: number of samples to draw.
    :return: Data(X, y) with X of shape (N, 2) and y of shape (N,).
    """
    x0 = normal(0, 1, N)
    x1 = normal(3, 1, N)

    X = np.hstack([x0.reshape(-1, 1), x1.reshape(-1, 1)])
    y = 5.3 + (3.0 * x0) - (2.9 * x1) + normal(0, 1, N)
    return Data(X, y)

# training data
T = get_data()

# validation data
V = get_data(N=1000)


## 3.2. Types of regression

### 3.2.1. Basic regression

[2]:

from sklearn.linear_model import LinearRegression

# Ordinary least squares; fit() returns the estimator, so chain it.
lr = LinearRegression().fit(T.X, T.y)

[2]:

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)


### 3.2.2. Ridge regression

[3]:

from sklearn.linear_model import Ridge

# L2-penalized linear regression; fit() returns the estimator, so chain it.
ridge = Ridge(alpha=.5).fit(T.X, T.y)

[3]:

Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
normalize=False, random_state=None, solver='auto', tol=0.001)


### 3.2.3. Lasso regression

[4]:

from sklearn.linear_model import Lasso

# L1-penalized linear regression; fit() returns the estimator, so chain it.
lasso = Lasso(alpha=.5).fit(T.X, T.y)

[4]:

Lasso(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=1000,
normalize=False, positive=False, precompute=False, random_state=None,
selection='cyclic', tol=0.0001, warm_start=False)


### 3.2.4. ElasticNet regression

[5]:

from sklearn.linear_model import ElasticNet

# Combined L1/L2 penalty; fit() returns the estimator, so chain it.
en = ElasticNet(alpha=.5).fit(T.X, T.y)

[5]:

ElasticNet(alpha=0.5, copy_X=True, fit_intercept=True, l1_ratio=0.5,
max_iter=1000, normalize=False, positive=False, precompute=False,
random_state=None, selection='cyclic', tol=0.0001, warm_start=False)


### 3.2.5. Bayesian ridge regression

[6]:

from sklearn.linear_model import BayesianRidge

# Bayesian ridge with default priors; fit() returns the estimator, so chain it.
brr = BayesianRidge().fit(T.X, T.y)

[6]:

BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,
fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=300,
normalize=False, tol=0.001, verbose=False)


### 3.2.6. Random forest regression

[7]:

from sklearn.ensemble import RandomForestRegressor

# Random forest of 100 trees, seeded for reproducibility; fit() chains.
rf = RandomForestRegressor(n_estimators=100, random_state=37).fit(T.X, T.y)

[7]:

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=37, verbose=0,
warm_start=False)


### 3.2.7. Gradient boosting regression

[8]:

from sklearn.ensemble import GradientBoostingRegressor

gbr.fit(T.X, T.y)

[8]:

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
learning_rate=0.1, loss='ls', max_depth=3,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_iter_no_change=None, presort='auto',
random_state=37, subsample=1.0, tol=0.0001,
validation_fraction=0.1, verbose=0, warm_start=False)


### 3.2.8. AdaBoost regression

[9]:

from sklearn.ensemble import AdaBoostRegressor

abr.fit(T.X, T.y)

[9]:

AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
n_estimators=50, random_state=37)


### 3.2.9. Bagging regression

[10]:

from sklearn.ensemble import BaggingRegressor

# Bagged ensemble of default base estimators; fit() returns the estimator.
bagr = BaggingRegressor(random_state=37).fit(T.X, T.y)

[10]:

BaggingRegressor(base_estimator=None, bootstrap=True, bootstrap_features=False,
max_features=1.0, max_samples=1.0, n_estimators=10,
n_jobs=None, oob_score=False, random_state=37, verbose=0,
warm_start=False)

### 3.2.10. Multi-layer perceptron regression

[11]:

from sklearn.neural_network import MLPRegressor

# Feed-forward neural-network regressor with default architecture; fit() chains.
mlp = MLPRegressor(random_state=37).fit(T.X, T.y)

[11]:

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
beta_2=0.999, early_stopping=False, epsilon=1e-08,
hidden_layer_sizes=(100,), learning_rate='constant',
learning_rate_init=0.001, max_iter=200, momentum=0.9,
n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
validation_fraction=0.1, verbose=False, warm_start=False)


## 3.3. Performance

[12]:

models = [lr, ridge, lasso, en, brr, rf, gbr, abr, bagr, mlp]


### 3.3.1. Explained variance score

[13]:

from sklearn.metrics import explained_variance_score

def get_score(model, X, y_true):
    """Return (explained variance score on (X, y_true), model class name)."""
    y_pred = model.predict(X)
    score = explained_variance_score(y_true, y_pred)
    return score, type(model).__name__

def print_scores(scores):
    """Print one 'score : name' line per (score, name) pair, score to 5 dp."""
    for score, name in scores:
        print(f'{score:.5f} : {name}')

# Higher is better for explained variance, so sort descending.
scores = sorted([get_score(model, V.X, V.y) for model in models], key=lambda tup: tup[0], reverse=True)
print_scores(scores)

0.94408 : BayesianRidge
0.94408 : LinearRegression
0.94408 : Ridge
0.94380 : MLPRegressor
0.93500 : RandomForestRegressor
0.93120 : BaggingRegressor
0.91650 : Lasso
0.87551 : ElasticNet


### 3.3.2. Max error

[14]:

from sklearn.metrics import max_error

def get_score(model, X, y_true):
    """Return (maximum residual error on (X, y_true), model class name)."""
    y_pred = model.predict(X)
    score = max_error(y_true, y_pred)
    return score, type(model).__name__

# Lower is better for max error, so sort ascending.
scores = sorted([get_score(model, V.X, V.y) for model in models], key=lambda tup: tup[0], reverse=False)
print_scores(scores)

3.03046 : GradientBoostingRegressor
3.19835 : LinearRegression
3.19836 : BayesianRidge
3.19839 : Ridge
3.23994 : MLPRegressor
3.36062 : RandomForestRegressor
3.83427 : BaggingRegressor
4.64630 : Lasso
5.57094 : ElasticNet


### 3.3.3. Mean absolute error

[15]:

from sklearn.metrics import mean_absolute_error

def get_score(model, X, y_true):
    """Return (mean absolute error on (X, y_true), model class name)."""
    y_pred = model.predict(X)
    score = mean_absolute_error(y_true, y_pred)
    return score, type(model).__name__

# Lower is better for MAE, so sort ascending.
scores = sorted([get_score(model, V.X, V.y) for model in models], key=lambda tup: tup[0], reverse=False)
print_scores(scores)

0.80887 : LinearRegression
0.80887 : BayesianRidge
0.80887 : Ridge
0.81840 : MLPRegressor
0.87912 : RandomForestRegressor
0.90288 : BaggingRegressor
0.99474 : Lasso
1.21903 : ElasticNet


### 3.3.4. Mean squared error

[16]:

from sklearn.metrics import mean_squared_error

def get_score(model, X, y_true):
    """Return (mean squared error on (X, y_true), model class name)."""
    y_pred = model.predict(X)
    score = mean_squared_error(y_true, y_pred)
    return score, type(model).__name__

# Lower is better for MSE, so sort ascending.
scores = sorted([get_score(model, V.X, V.y) for model in models], key=lambda tup: tup[0], reverse=False)
print_scores(scores)

1.04706 : LinearRegression
1.04706 : BayesianRidge
1.04706 : Ridge
1.06718 : MLPRegressor
1.21693 : RandomForestRegressor
1.28816 : BaggingRegressor
1.56425 : Lasso
2.33287 : ElasticNet


### 3.3.5. Median absolute error

[17]:

from sklearn.metrics import median_absolute_error

def get_score(model, X, y_true):
    """Return (median absolute error on (X, y_true), model class name)."""
    y_pred = model.predict(X)
    score = median_absolute_error(y_true, y_pred)
    return score, type(model).__name__

# Lower is better for median absolute error, so sort ascending.
scores = sorted([get_score(model, V.X, V.y) for model in models], key=lambda tup: tup[0], reverse=False)
print_scores(scores)

0.65824 : Ridge
0.65827 : BayesianRidge
0.65828 : LinearRegression
0.66623 : MLPRegressor
0.76039 : RandomForestRegressor
0.77582 : BaggingRegressor
0.83284 : Lasso
1.01466 : ElasticNet


### 3.3.6. r-squared

[18]:

from sklearn.metrics import r2_score

def get_score(model, X, y_true):
    """Return (R^2 score on (X, y_true), model class name)."""
    y_pred = model.predict(X)
    score = r2_score(y_true, y_pred)
    return score, type(model).__name__

# Higher is better for R^2, so sort descending.
scores = sorted([get_score(model, V.X, V.y) for model in models], key=lambda tup: tup[0], reverse=True)
print_scores(scores)

0.94408 : LinearRegression
0.94408 : BayesianRidge
0.94408 : Ridge
0.94300 : MLPRegressor