13. Pipelines

13.1. Regression pipeline

13.1.1. Get data

[1]:
import numpy as np
from random import randint
from sklearn.datasets import make_regression
from sklearn.model_selection import ShuffleSplit
from collections import namedtuple

# Lightweight container pairing a feature matrix X with its target vector y.
Data = namedtuple('Data', ['X', 'y'])

def get_data(n_features=20, n_samples=2000, n_missing=100):
    """Create a synthetic regression dataset with randomly placed missing cells.

    Parameters
    ----------
    n_features : int
        Number of feature columns.
    n_samples : int
        Number of rows generated before splitting.
    n_missing : int
        Total number of matrix cells set to NaN (must not exceed
        n_samples * n_features).

    Returns
    -------
    (Data, Data)
        (training, validation) splits as Data(X, y) namedtuples, produced by
        a single deterministic 80/20 shuffle split (random_state=37).

    NOTE(review): NaN placement uses the stdlib ``random`` module, which is
    not seeded anywhere in this notebook, so the exact missing cells vary
    between runs (same as the original randint-based implementation).
    """
    def make_missing(X):
        # Sample n_missing DISTINCT flat indices in one shot. This replaces
        # the original rejection loop (re-drawing random (row, col) pairs
        # until an unseen one appears), which guarantees uniqueness only via
        # unbounded retries; random.sample is O(n_missing) and cannot stall.
        from random import sample
        for idx in sample(range(n_samples * n_features), n_missing):
            X[idx // n_features][idx % n_features] = np.nan

    def split(X, y):
        # One deterministic 80/20 shuffle split of rows.
        splitter = ShuffleSplit(n_splits=1, test_size=0.2, train_size=0.8, random_state=37)
        train_index, test_index = next(splitter.split(X, y))
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        return X_train, y_train, X_test, y_test

    X, y = make_regression(**{
        'n_samples': n_samples,
        'n_features': n_features,
        'n_informative': 10,
        'n_targets': 1,
        'bias': 5.3,
        'random_state': 37
    })

    make_missing(X)

    X_train, y_train, X_test, y_test = split(X, y)
    return Data(X_train, y_train), Data(X_test, y_test)

# Seed numpy for the sklearn data generators.
# NOTE(review): this does NOT seed the stdlib `random` module that get_data
# uses for NaN placement — seed that separately if full reproducibility of
# the missing-value pattern is required.
np.random.seed(37)

T, V = get_data()

print(f'X training shape={T.X.shape}, y training shape={T.y.shape}')
# Fixed: the validation y shape must come from V, not T (the original
# printed T.y.shape here, reporting (1600,) instead of (400,)).
print(f'X validation shape={V.X.shape}, y validation shape={V.y.shape}')

print(f'X training missing data points {np.count_nonzero(np.isnan(T.X))}')
print(f'X validation missing data points {np.count_nonzero(np.isnan(V.X))}')
X training shape=(1600, 20), y training shape=(1600,)
X validation shape=(400, 20), y validation shape=(1600,)
X training missing data points 90
X validation missing data points 10

13.1.2. Pipeline

[2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso
from sklearn.experimental import enable_iterative_imputer  # noqa: F401 — required to expose IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import r2_score

# Impute missing values, standardize the features, then fit a Lasso
# regressor — estimators are constructed inline in the steps list.
pipeline = Pipeline(steps=[
    ('imputer', IterativeImputer(missing_values=np.nan, random_state=37)),
    ('scaler', StandardScaler()),
    ('lasso', Lasso()),
])

pipeline.fit(T.X, T.y)
y_preds = pipeline.predict(V.X)

# Score the fitted pipeline on the held-out validation split.
print(f'{r2_score(V.y, y_preds):.5f}: r-squared')
0.99790: r-squared

13.2. Classification pipeline

13.2.1. Get data

[3]:
import numpy as np
from random import randint
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedShuffleSplit
from collections import namedtuple

# Lightweight container pairing a feature matrix X with its target vector y
# (re-declared so this cell is self-contained).
Data = namedtuple('Data', ['X', 'y'])

def get_data(n_features=20, n_samples=2000, n_missing=100):
    """Create a synthetic binary-classification dataset with missing cells.

    Parameters
    ----------
    n_features : int
        Number of feature columns.
    n_samples : int
        Number of rows generated before splitting.
    n_missing : int
        Total number of matrix cells set to NaN (must not exceed
        n_samples * n_features).

    Returns
    -------
    (Data, Data)
        (training, validation) splits as Data(X, y) namedtuples, produced by
        a single stratified 80/20 shuffle split (random_state=37) so class
        balance is preserved across splits.

    NOTE(review): NaN placement uses the stdlib ``random`` module, which is
    not seeded anywhere in this notebook, so the exact missing cells vary
    between runs (same as the original randint-based implementation).
    """
    def make_missing(X):
        # Sample n_missing DISTINCT flat indices in one shot. This replaces
        # the original rejection loop (re-drawing random (row, col) pairs
        # until an unseen one appears); random.sample guarantees uniqueness
        # in O(n_missing) without unbounded retries.
        from random import sample
        for idx in sample(range(n_samples * n_features), n_missing):
            X[idx // n_features][idx % n_features] = np.nan

    def split(X, y):
        # One deterministic, class-stratified 80/20 shuffle split of rows.
        splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=37)
        train_index, test_index = next(splitter.split(X, y))
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        return X_train, y_train, X_test, y_test

    X, y = make_classification(**{
        'n_samples': n_samples,
        'n_features': n_features,
        'n_informative': 2,
        'n_redundant': 2,
        'n_repeated': 0,
        'n_classes': 2,
        'n_clusters_per_class': 2,
        'random_state': 37
    })

    make_missing(X)

    X_train, y_train, X_test, y_test = split(X, y)
    return Data(X_train, y_train), Data(X_test, y_test)

# Seed numpy for the sklearn data generators.
# NOTE(review): this does NOT seed the stdlib `random` module that get_data
# uses for NaN placement — seed that separately if full reproducibility of
# the missing-value pattern is required.
np.random.seed(37)

T, V = get_data()

print(f'X training shape={T.X.shape}, y training shape={T.y.shape}')
# Fixed: the validation y shape must come from V, not T (the original
# printed T.y.shape here, reporting (1600,) instead of (400,)).
print(f'X validation shape={V.X.shape}, y validation shape={V.y.shape}')

print(f'X training missing data points {np.count_nonzero(np.isnan(T.X))}')
print(f'X validation missing data points {np.count_nonzero(np.isnan(V.X))}')
X training shape=(1600, 20), y training shape=(1600,)
X validation shape=(400, 20), y validation shape=(1600,)
X training missing data points 81
X validation missing data points 19

13.2.2. Pipeline

[4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer  # noqa: F401 — required to expose IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import roc_auc_score, average_precision_score

# Impute missing values, standardize, reduce to 3 principal components,
# then classify with a random forest.
imputer = IterativeImputer(missing_values=np.nan, random_state=37)
scaler = StandardScaler()
pca = PCA(n_components=3, random_state=37)
# Fixed: the forest previously had no random_state, unlike every other
# stochastic component in this notebook; seed it so results are
# reproducible without relying on the global numpy seed.
rf = RandomForestClassifier(n_estimators=100, random_state=37)

pipeline = Pipeline([
    ('imputer', imputer),
    ('scaler', scaler),
    ('pca', pca),
    ('rf', rf)
])

pipeline.fit(T.X, T.y)
# Probability of the positive class, for threshold-free metrics.
y_preds = pipeline.predict_proba(V.X)[:,1]

print(f'{roc_auc_score(V.y, y_preds):.5f}: ROC AUC')
print(f'{average_precision_score(V.y, y_preds):.5f}: PR AUC')
0.95290: ROC AUC
0.94265: PR AUC