# 13. Pipelines

## 13.1. Regression pipeline

### 13.1.1. Get data

[1]:

import numpy as np
from random import randint
from sklearn.datasets import make_regression
from sklearn.model_selection import ShuffleSplit
from collections import namedtuple

# Lightweight container pairing a feature matrix with its targets.
Data = namedtuple('Data', 'X y')

def get_data(n_features=20, n_samples=2000, n_missing=100):
    """Build a synthetic regression dataset with missing values.

    Generates an (n_samples, n_features) regression problem, sets exactly
    `n_missing` distinct cells of X to NaN, then performs an 80/20
    shuffle split.

    :param n_features: Number of feature columns.
    :param n_samples: Number of rows.
    :param n_missing: Number of distinct cells to blank out with NaN.
    :return: (train, validation) pair of Data namedtuples.
    """
    def generate_coordinates(m, n):
        # Yield an endless stream of *distinct* random (row, col) pairs.
        # Bug fix: previously nothing was ever added to `seen`, so the
        # dedup check never fired and duplicate coordinates were yielded
        # (fewer than n_missing cells actually became NaN).
        seen = set()
        while True:
            x, y = randint(0, m - 1), randint(0, n - 1)
            while (x, y) in seen:
                x, y = randint(0, m - 1), randint(0, n - 1)
            seen.add((x, y))
            yield (x, y)

    def make_missing(X):
        # Blank out n_missing distinct cells of X in place.
        coords = generate_coordinates(n_samples, n_features)
        for _ in range(n_missing):
            i, j = next(coords)
            X[i][j] = np.nan

    def split(X, y):
        # Single 80/20 shuffle split; fixed seed for reproducibility.
        splitter = ShuffleSplit(n_splits=1, test_size=0.2, train_size=0.8, random_state=37)
        train_index, test_index = next(splitter.split(X, y))
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        return X_train, y_train, X_test, y_test

    X, y = make_regression(**{
        'n_samples': n_samples,
        'n_features': n_features,
        'n_informative': 10,
        'n_targets': 1,
        'bias': 5.3,
        'random_state': 37
    })

    make_missing(X)

    X_train, y_train, X_test, y_test = split(X, y)
    return Data(X_train, y_train), Data(X_test, y_test)

np.random.seed(37)

T, V = get_data()

print(f'X training shape={T.X.shape}, y training shape={T.y.shape}')
# Bug fix: the validation line previously printed T.y.shape (the
# training shape) instead of V.y.shape.
print(f'X validation shape={V.X.shape}, y validation shape={V.y.shape}')

print(f'X training missing data points {np.count_nonzero(np.isnan(T.X))}')
print(f'X validation missing data points {np.count_nonzero(np.isnan(V.X))}')

X training shape=(1600, 20), y training shape=(1600,)
X validation shape=(400, 20), y validation shape=(1600,)
X training missing data points 90
X validation missing data points 10


### 13.1.2. Pipeline

[2]:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import r2_score

# Three stages: fill in NaNs, standardize features, fit a Lasso regressor.
imputer = IterativeImputer(missing_values=np.nan, random_state=37)
scaler = StandardScaler()
lasso = Lasso()

# Assemble the stages in order; each step is a (name, estimator) pair.
steps = [('imputer', imputer), ('scaler', scaler), ('lasso', lasso)]
pipeline = Pipeline(steps)

# Fit on the training split, then score predictions on the held-out split.
pipeline.fit(T.X, T.y)
y_preds = pipeline.predict(V.X)

print(f'{r2_score(V.y, y_preds):.5f}: r-squared')

0.99790: r-squared


## 13.2. Classification pipeline

### 13.2.1. Get data

[3]:

import numpy as np
from random import randint
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedShuffleSplit
from collections import namedtuple

# Lightweight container pairing a feature matrix with its targets.
Data = namedtuple('Data', 'X y')

def get_data(n_features=20, n_samples=2000, n_missing=100):
    """Build a synthetic binary-classification dataset with missing values.

    Generates an (n_samples, n_features) two-class problem, sets exactly
    `n_missing` distinct cells of X to NaN, then performs a stratified
    80/20 shuffle split.

    :param n_features: Number of feature columns.
    :param n_samples: Number of rows.
    :param n_missing: Number of distinct cells to blank out with NaN.
    :return: (train, validation) pair of Data namedtuples.
    """
    def generate_coordinates(m, n):
        # Yield an endless stream of *distinct* random (row, col) pairs.
        # Bug fix: previously nothing was ever added to `seen`, so the
        # dedup check never fired and duplicate coordinates were yielded
        # (fewer than n_missing cells actually became NaN).
        seen = set()
        while True:
            x, y = randint(0, m - 1), randint(0, n - 1)
            while (x, y) in seen:
                x, y = randint(0, m - 1), randint(0, n - 1)
            seen.add((x, y))
            yield (x, y)

    def make_missing(X):
        # Blank out n_missing distinct cells of X in place.
        coords = generate_coordinates(n_samples, n_features)
        for _ in range(n_missing):
            i, j = next(coords)
            X[i][j] = np.nan

    def split(X, y):
        # Stratified 80/20 split so class balance is preserved;
        # fixed seed for reproducibility.
        splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=37)
        train_index, test_index = next(splitter.split(X, y))
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        return X_train, y_train, X_test, y_test

    X, y = make_classification(**{
        'n_samples': n_samples,
        'n_features': n_features,
        'n_informative': 2,
        'n_redundant': 2,
        'n_repeated': 0,
        'n_classes': 2,
        'n_clusters_per_class': 2,
        'random_state': 37
    })

    make_missing(X)

    X_train, y_train, X_test, y_test = split(X, y)
    return Data(X_train, y_train), Data(X_test, y_test)

np.random.seed(37)

T, V = get_data()

print(f'X training shape={T.X.shape}, y training shape={T.y.shape}')
# Bug fix: the validation line previously printed T.y.shape (the
# training shape) instead of V.y.shape.
print(f'X validation shape={V.X.shape}, y validation shape={V.y.shape}')

print(f'X training missing data points {np.count_nonzero(np.isnan(T.X))}')
print(f'X validation missing data points {np.count_nonzero(np.isnan(V.X))}')

X training shape=(1600, 20), y training shape=(1600,)
X validation shape=(400, 20), y validation shape=(1600,)
X training missing data points 81
X validation missing data points 19


### 13.2.2. Pipeline

[4]:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import roc_auc_score, average_precision_score

# Four stages: fill in NaNs, standardize, project to 3 principal
# components, then classify with a random forest.
imputer = IterativeImputer(missing_values=np.nan, random_state=37)
scaler = StandardScaler()
pca = PCA(n_components=3, random_state=37)
rf = RandomForestClassifier(n_estimators=100)

# Assemble the stages in order; each step is a (name, estimator) pair.
steps = [('imputer', imputer), ('scaler', scaler), ('pca', pca), ('rf', rf)]
pipeline = Pipeline(steps)

# Fit on the training split; score the positive-class probabilities
# on the held-out split.
pipeline.fit(T.X, T.y)
y_preds = pipeline.predict_proba(V.X)[:,1]

print(f'{roc_auc_score(V.y, y_preds):.5f}: ROC AUC')
print(f'{average_precision_score(V.y, y_preds):.5f}: PR AUC')

0.95290: ROC AUC
0.94265: PR AUC