13. Pipelines
13.1. Regression pipeline
13.1.1. Get data
[1]:
import numpy as np
from random import randint
from sklearn.datasets import make_regression
from sklearn.model_selection import ShuffleSplit
from collections import namedtuple
Data = namedtuple('Data', 'X y')
def get_data(n_features=20, n_samples=2000, n_missing=100, seed=None):
    """Build a synthetic regression dataset, punch holes in it, and split it.

    Generates a regression problem with :func:`make_regression`, sets
    ``n_missing`` distinct cells of ``X`` to ``np.nan``, then performs a
    single deterministic 80/20 shuffle split.

    Parameters
    ----------
    n_features : int
        Number of feature columns.
    n_samples : int
        Number of rows.
    n_missing : int
        Number of distinct cells to replace with ``np.nan``.
    seed : int or None
        Seed for the missing-cell placement. ``np.random.seed`` does NOT
        seed the stdlib ``random`` module, so without this parameter the
        NaN positions are irreproducible; ``None`` keeps the original
        (unseeded) behavior.

    Returns
    -------
    tuple(Data, Data)
        ``(training, validation)`` splits as ``Data(X, y)`` namedtuples.
    """
    from random import Random
    rng = Random(seed)  # dedicated RNG so placement can be made reproducible

    def generate_coordinates(m, n):
        # Endless stream of unique (row, col) pairs drawn from an m x n grid.
        seen = set()
        x, y = rng.randint(0, m - 1), rng.randint(0, n - 1)
        while True:
            seen.add((x, y))
            yield (x, y)
            x, y = rng.randint(0, m - 1), rng.randint(0, n - 1)
            while (x, y) in seen:
                x, y = rng.randint(0, m - 1), rng.randint(0, n - 1)

    def make_missing(X):
        # Set n_missing distinct cells of X to NaN, in place.
        coords = generate_coordinates(n_samples, n_features)
        for _ in range(n_missing):
            i, j = next(coords)
            X[i, j] = np.nan

    def split(X, y):
        # One deterministic 80/20 train/validation shuffle split.
        splitter = ShuffleSplit(n_splits=1, test_size=0.2, train_size=0.8, random_state=37)
        train_index, test_index = next(splitter.split(X, y))
        return X[train_index], y[train_index], X[test_index], y[test_index]

    X, y = make_regression(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=10,
        n_targets=1,
        bias=5.3,
        random_state=37,
    )
    make_missing(X)
    X_train, y_train, X_test, y_test = split(X, y)
    return Data(X_train, y_train), Data(X_test, y_test)
# Seeds numpy only; NaN placement inside get_data uses the stdlib random module.
np.random.seed(37)
T, V = get_data()
print(f'X training shape={T.X.shape}, y training shape={T.y.shape}')
# Fixed: validation y shape previously printed T.y.shape (training) by mistake.
print(f'X validation shape={V.X.shape}, y validation shape={V.y.shape}')
print(f'X training missing data points {np.count_nonzero(np.isnan(T.X))}')
print(f'X validation missing data points {np.count_nonzero(np.isnan(V.X))}')
X training shape=(1600, 20), y training shape=(1600,)
X validation shape=(400, 20), y validation shape=(1600,)
X training missing data points 90
X validation missing data points 10
13.1.2. Pipeline
[2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import r2_score
# Regression pipeline: impute missing values, standardize, then fit a Lasso.
imputer = IterativeImputer(missing_values=np.nan, random_state=37)
scaler = StandardScaler()
lasso = Lasso()

pipeline = Pipeline(steps=[
    ('imputer', imputer),
    ('scaler', scaler),
    ('lasso', lasso),
])

# Fit on the training split, score on the held-out validation split.
pipeline.fit(T.X, T.y)
y_preds = pipeline.predict(V.X)
print(f'{r2_score(V.y, y_preds):.5f}: r-squared')
0.99790: r-squared
13.2. Classification pipeline
13.2.1. Get data
[3]:
import numpy as np
from random import randint
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedShuffleSplit
from collections import namedtuple
Data = namedtuple('Data', 'X y')
def get_data(n_features=20, n_samples=2000, n_missing=100, seed=None):
    """Build a synthetic binary-classification dataset with missing values.

    Generates a classification problem with :func:`make_classification`,
    sets ``n_missing`` distinct cells of ``X`` to ``np.nan``, then performs
    a single stratified 80/20 shuffle split.

    Parameters
    ----------
    n_features : int
        Number of feature columns.
    n_samples : int
        Number of rows.
    n_missing : int
        Number of distinct cells to replace with ``np.nan``.
    seed : int or None
        Seed for the missing-cell placement. ``np.random.seed`` does NOT
        seed the stdlib ``random`` module, so without this parameter the
        NaN positions are irreproducible; ``None`` keeps the original
        (unseeded) behavior.

    Returns
    -------
    tuple(Data, Data)
        ``(training, validation)`` splits as ``Data(X, y)`` namedtuples.
    """
    from random import Random
    rng = Random(seed)  # dedicated RNG so placement can be made reproducible

    def generate_coordinates(m, n):
        # Endless stream of unique (row, col) pairs drawn from an m x n grid.
        seen = set()
        x, y = rng.randint(0, m - 1), rng.randint(0, n - 1)
        while True:
            seen.add((x, y))
            yield (x, y)
            x, y = rng.randint(0, m - 1), rng.randint(0, n - 1)
            while (x, y) in seen:
                x, y = rng.randint(0, m - 1), rng.randint(0, n - 1)

    def make_missing(X):
        # Set n_missing distinct cells of X to NaN, in place.
        coords = generate_coordinates(n_samples, n_features)
        for _ in range(n_missing):
            i, j = next(coords)
            X[i, j] = np.nan

    def split(X, y):
        # One stratified 80/20 split preserving the class balance.
        splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=37)
        train_index, test_index = next(splitter.split(X, y))
        return X[train_index], y[train_index], X[test_index], y[test_index]

    X, y = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=2,
        n_redundant=2,
        n_repeated=0,
        n_classes=2,
        n_clusters_per_class=2,
        random_state=37,
    )
    make_missing(X)
    X_train, y_train, X_test, y_test = split(X, y)
    return Data(X_train, y_train), Data(X_test, y_test)
# Seeds numpy only; NaN placement inside get_data uses the stdlib random module.
np.random.seed(37)
T, V = get_data()
print(f'X training shape={T.X.shape}, y training shape={T.y.shape}')
# Fixed: validation y shape previously printed T.y.shape (training) by mistake.
print(f'X validation shape={V.X.shape}, y validation shape={V.y.shape}')
print(f'X training missing data points {np.count_nonzero(np.isnan(T.X))}')
print(f'X validation missing data points {np.count_nonzero(np.isnan(V.X))}')
X training shape=(1600, 20), y training shape=(1600,)
X validation shape=(400, 20), y validation shape=(1600,)
X training missing data points 81
X validation missing data points 19
13.2.2. Pipeline
[4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import roc_auc_score, average_precision_score
# Classification pipeline: impute, standardize, reduce to 3 PCA components,
# then fit a random forest.
imputer = IterativeImputer(missing_values=np.nan, random_state=37)
scaler = StandardScaler()
pca = PCA(n_components=3, random_state=37)
rf = RandomForestClassifier(n_estimators=100)

pipeline = Pipeline(steps=[
    ('imputer', imputer),
    ('scaler', scaler),
    ('pca', pca),
    ('rf', rf),
])

# Fit on the training split; score with positive-class probabilities.
pipeline.fit(T.X, T.y)
y_preds = pipeline.predict_proba(V.X)[:, 1]
print(f'{roc_auc_score(V.y, y_preds):.5f}: ROC AUC')
print(f'{average_precision_score(V.y, y_preds):.5f}: PR AUC')
0.95290: ROC AUC
0.94265: PR AUC