14. Validation

14.1. Splitting data

[1]:
import numpy as np

# Toy feature matrix: six samples (rows) with two features each.
X = np.array([
    [1, 2],
    [3, 4],
    [5, 6],
    [7, 8],
    [9, 9],
    [7, 7]
])

# One binary class label per sample.
y = np.array([0, 0, 0, 1, 1, 1])

# One group id per sample; handed to the group-aware splitters as `groups`.
groups = np.array([0, 1, 2, 2, 1, 0])

14.1.1. Group k-fold

[2]:
from sklearn.model_selection import GroupKFold

# Three folds for the three distinct group ids in `groups`.
gkf = GroupKFold(n_splits=3)

# Print the sample indices of each train/test split.
for train_index, test_index in gkf.split(X, y, groups):
    print(f'TRAIN: {train_index}, TEST: {test_index}')
TRAIN: [0 1 4 5], TEST: [2 3]
TRAIN: [0 2 3 5], TEST: [1 4]
TRAIN: [1 2 3 4], TEST: [0 5]

14.1.2. Group shuffle split

[3]:
from sklearn.model_selection import GroupShuffleSplit

# Three randomized group-wise splits; random_state fixes the shuffling.
gss = GroupShuffleSplit(n_splits=3, random_state=37)

# Print the sample indices of each train/test split.
for train_index, test_index in gss.split(X, y, groups):
    print(f'TRAIN: {train_index}, TEST: {test_index}')
TRAIN: [0 1 4 5], TEST: [2 3]
TRAIN: [0 1 4 5], TEST: [2 3]
TRAIN: [0 2 3 5], TEST: [1 4]

14.1.3. Leave one group out

[4]:
from sklearn.model_selection import LeaveOneGroupOut

# One split per group: that group's samples form the test set.
logo = LeaveOneGroupOut()

# Print the sample indices of each train/test split.
for train_index, test_index in logo.split(X, y, groups):
    print(f'TRAIN: {train_index}, TEST: {test_index}')
TRAIN: [1 2 3 4], TEST: [0 5]
TRAIN: [0 2 3 5], TEST: [1 4]
TRAIN: [0 1 4 5], TEST: [2 3]

14.1.4. Leave p-groups out

[5]:
from sklearn.model_selection import LeavePGroupsOut

# Each split holds out the samples of two groups as the test set.
lpgo = LeavePGroupsOut(n_groups=2)

# Print the sample indices of each train/test split.
for train_index, test_index in lpgo.split(X, y, groups):
    print(f'TRAIN: {train_index}, TEST: {test_index}')
TRAIN: [2 3], TEST: [0 1 4 5]
TRAIN: [1 4], TEST: [0 2 3 5]
TRAIN: [0 5], TEST: [1 2 3 4]

14.1.5. Leave one out

[6]:
from sklearn.model_selection import LeaveOneOut

# One split per sample: a single sample is the test set each time.
loo = LeaveOneOut()

# Print the sample indices of each train/test split.
for train_index, test_index in loo.split(X, y):
    print(f'TRAIN: {train_index}, TEST: {test_index}')
TRAIN: [1 2 3 4 5], TEST: [0]
TRAIN: [0 2 3 4 5], TEST: [1]
TRAIN: [0 1 3 4 5], TEST: [2]
TRAIN: [0 1 2 4 5], TEST: [3]
TRAIN: [0 1 2 3 5], TEST: [4]
TRAIN: [0 1 2 3 4], TEST: [5]

14.1.6. Leave p-out

[7]:
from sklearn.model_selection import LeavePOut

# Each split holds out three samples as the test set.
lpo = LeavePOut(p=3)

# Print the sample indices of each train/test split.
for train_index, test_index in lpo.split(X, y):
    print(f'TRAIN: {train_index}, TEST: {test_index}')
TRAIN: [3 4 5], TEST: [0 1 2]
TRAIN: [2 4 5], TEST: [0 1 3]
TRAIN: [2 3 5], TEST: [0 1 4]
TRAIN: [2 3 4], TEST: [0 1 5]
TRAIN: [1 4 5], TEST: [0 2 3]
TRAIN: [1 3 5], TEST: [0 2 4]
TRAIN: [1 3 4], TEST: [0 2 5]
TRAIN: [1 2 5], TEST: [0 3 4]
TRAIN: [1 2 4], TEST: [0 3 5]
TRAIN: [1 2 3], TEST: [0 4 5]
TRAIN: [0 4 5], TEST: [1 2 3]
TRAIN: [0 3 5], TEST: [1 2 4]
TRAIN: [0 3 4], TEST: [1 2 5]
TRAIN: [0 2 5], TEST: [1 3 4]
TRAIN: [0 2 4], TEST: [1 3 5]
TRAIN: [0 2 3], TEST: [1 4 5]
TRAIN: [0 1 5], TEST: [2 3 4]
TRAIN: [0 1 4], TEST: [2 3 5]
TRAIN: [0 1 3], TEST: [2 4 5]
TRAIN: [0 1 2], TEST: [3 4 5]

14.1.7. K-fold

[8]:
from sklearn.model_selection import KFold

# Two folds, shuffled before splitting; random_state fixes the shuffling.
kf = KFold(n_splits=2, shuffle=True, random_state=37)

# Print the sample indices of each train/test split.
for train_index, test_index in kf.split(X, y):
    print(f'TRAIN: {train_index}, TEST: {test_index}')
TRAIN: [3 4 5], TEST: [0 1 2]
TRAIN: [0 1 2], TEST: [3 4 5]

14.1.8. Stratified k-fold

[9]:
from sklearn.model_selection import StratifiedKFold

# Two shuffled folds, stratified on the class labels passed as `y`.
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=37)

# Print the sample indices of each train/test split.
for train_index, test_index in skf.split(X, y):
    print(f'TRAIN: {train_index}, TEST: {test_index}')
TRAIN: [0 3 4], TEST: [1 2 5]
TRAIN: [1 2 5], TEST: [0 3 4]

14.1.9. Shuffle split

[10]:
from sklearn.model_selection import ShuffleSplit

# Two independent random splits; the test-set size is left at its default.
ss = ShuffleSplit(n_splits=2, random_state=37)

# Print the sample indices of each train/test split.
for train_index, test_index in ss.split(X, y):
    print(f'TRAIN: {train_index}, TEST: {test_index}')
TRAIN: [1 0 5 4 3], TEST: [2]
TRAIN: [1 5 4 3 2], TEST: [0]

14.1.10. Stratified shuffle split

[11]:
from sklearn.model_selection import StratifiedShuffleSplit

# Use the stratified splitter this section is about (the plain ShuffleSplit
# ignores the class labels). test_size is set explicitly because the test set
# must hold at least one sample per class; the default of 10% would give a
# single test sample for these six rows, which is fewer than the two classes.
sss = StratifiedShuffleSplit(n_splits=2, test_size=2, random_state=37)

# Print the sample indices of each train/test split.
for train_index, test_index in sss.split(X, y):
    print(f'TRAIN: {train_index}, TEST: {test_index}')
TRAIN: [1 0 5 4 3], TEST: [2]
TRAIN: [1 5 4 3 2], TEST: [0]

14.1.11. Predefined split

[12]:
from sklearn.model_selection import PredefinedSplit

# test_fold[i] is the id of the test fold that sample i belongs to.
test_fold = np.array([1, 1, 0, 1, 1, 0])

ps = PredefinedSplit(test_fold)

# Print the sample indices of each train/test split.
for train_index, test_index in ps.split(X, y):
    print(f'TRAIN: {train_index}, TEST: {test_index}')
TRAIN: [0 1 3 4], TEST: [2 5]
TRAIN: [2 5], TEST: [0 1 3 4]
[13]:
# Same idea with three fold ids (0, 1, 2) instead of two.
test_fold = np.array([0, 1, 2, 0, 1, 2])

ps = PredefinedSplit(test_fold)

# Print the sample indices of each train/test split.
for train_index, test_index in ps.split(X, y):
    print(f'TRAIN: {train_index}, TEST: {test_index}')
TRAIN: [1 2 4 5], TEST: [0 3]
TRAIN: [0 2 3 5], TEST: [1 4]
TRAIN: [0 1 3 4], TEST: [2 5]

14.1.12. Customized splitter

Here is a customized splitter that resembles the PredefinedSplit class. With a two-valued test_fold such as the first example above, the PredefinedSplit function get_n_splits() will return 2, because each fold takes a turn as the test set. However, sometimes you want only 1 split (e.g. a single training/validation split where the validation fold is used for hyperparameter tuning).

[14]:
class SimpleSplitter:
    """Splitter that yields exactly one train/test split from 0/1 flags.

    ``test_fold`` holds one flag per sample: 0 puts the sample in the
    training set, 1 puts it in the test set. Any other value leaves the
    sample out of both sets.
    """

    def __init__(self, test_fold):
        self.test_fold = test_fold

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of splits, which is always 1."""
        return 1

    def split(self, X=None, y=None, groups=None):
        """Yield the single ``(train_indices, test_indices)`` pair.

        The arguments are accepted for API compatibility and are not used.
        """
        train_positions = []
        test_positions = []

        for position, flag in enumerate(self.test_fold):
            if flag == 0:
                train_positions.append(position)
            elif flag == 1:
                test_positions.append(position)

        yield np.array(train_positions), np.array(test_positions)
[15]:
# Flags per sample: 0 = training, 1 = testing.
simple_splitter = SimpleSplitter([0, 0, 1, 0, 0, 1])

# Print the sample indices of the single train/test split.
for train_index, test_index in simple_splitter.split(X, y):
    print(f'TRAIN: {train_index}, TEST: {test_index}')
TRAIN: [0 1 3 4], TEST: [2 5]

Here is another splitter that produces one-versus-rest folds: the training fold stays constant while the testing fold changes from split to split. Note that this customized splitter takes the sample indices themselves, not 0/1 flags aligned with the samples.

[16]:
class OvrSplitter:
    """Splitter pairing one fixed training fold with several test folds.

    ``train_fold`` is a collection of sample indices used for training in
    every split. ``test_folds`` is a collection of index collections; each
    one is used as the test set of one split.
    """

    def __init__(self, train_fold, test_folds):
        self.train_fold = train_fold
        self.test_folds = test_folds

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of splits, i.e. the number of test folds."""
        # Read the instance attribute; the bare name `test_folds` is not
        # defined inside this method.
        return len(self.test_folds)

    def split(self, X=None, y=None, groups=None):
        """Yield ``(train_fold, test_fold)`` for every test fold, in order.

        The arguments are accepted for API compatibility and are not used.
        """
        for test_fold in self.test_folds:
            yield self.train_fold, test_fold
[17]:
# Train on samples 0 and 1; test first on samples 2-3, then on samples 4-5.
ovr_splitter = OvrSplitter([0, 1], [[2, 3], [4, 5]])

# Print the sample indices of each train/test split.
for train_index, test_index in ovr_splitter.split(X, y):
    print(f'TRAIN: {train_index}, TEST: {test_index}')
TRAIN: [0, 1], TEST: [2, 3]
TRAIN: [0, 1], TEST: [4, 5]

14.2. K-fold cross validation example

14.2.1. Data

[18]:
import numpy as np
from random import randint
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedShuffleSplit
from collections import namedtuple

def get_data(n_features=20, n_samples=2000, n_missing=100):
    """Build a synthetic binary-classification data set with missing values.

    The data comes from ``make_classification`` (2 informative and 2 redundant
    features, 2 clusters per class, ``random_state=37``). Afterwards
    ``n_missing`` distinct cells of the feature matrix are set to ``np.nan``.

    The cells are drawn from NumPy's global random state, so calling
    ``np.random.seed(...)`` beforehand makes their positions reproducible.

    Parameters
    ----------
    n_features : int
        Number of feature columns.
    n_samples : int
        Number of rows.
    n_missing : int
        Number of distinct cells to blank out. Values of 0 or less leave the
        matrix untouched.

    Returns
    -------
    tuple
        ``(X, y)``: the feature matrix containing NaNs and the class labels.

    Raises
    ------
    ValueError
        If ``n_missing`` exceeds the number of cells in the matrix.
    """
    n_cells = n_samples * n_features
    if n_missing > n_cells:
        # Distinct cells cannot be drawn beyond the size of the matrix.
        raise ValueError(
            f'n_missing={n_missing} exceeds the {n_cells} available cells'
        )

    X, y = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=2,
        n_redundant=2,
        n_repeated=0,
        n_classes=2,
        n_clusters_per_class=2,
        random_state=37,
    )

    if n_missing > 0:
        # Sample flat cell positions without replacement so that every
        # position is unique, then map them back to (row, column) pairs.
        flat_positions = np.random.choice(n_cells, size=n_missing, replace=False)
        rows, cols = np.unravel_index(flat_positions, X.shape)
        X[rows, cols] = np.nan

    return X, y

# Seed NumPy's global random state; this only affects code that draws from
# np.random.
np.random.seed(37)

# Replace the toy X and y from the splitting examples with the synthetic data.
X, y = get_data()

14.2.2. Pipeline

[19]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import roc_auc_score, average_precision_score

def get_rf_pipeline():
    """Return a fresh, unfitted random-forest pipeline.

    Steps, in order: iterative imputation of NaNs, standard scaling, PCA down
    to 3 components, and a random forest with 100 trees.
    """
    steps = [
        ('imputer', IterativeImputer(missing_values=np.nan, random_state=37)),
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=3, random_state=37)),
        ('rf', RandomForestClassifier(n_estimators=100)),
    ]

    return Pipeline(steps)

def get_lr_pipeline():
    """Return a fresh, unfitted logistic-regression pipeline.

    Steps, in order: iterative imputation of NaNs, standard scaling, and an
    L1-penalized logistic regression using the liblinear solver.
    """
    steps = [
        ('imputer', IterativeImputer(missing_values=np.nan, random_state=37)),
        ('scaler', StandardScaler()),
        ('lr', LogisticRegression(penalty='l1', solver='liblinear')),
    ]

    return Pipeline(steps)

14.2.3. Validation

[20]:
import pandas as pd

def do_validation(train_index, test_index, X, y):
    """Fit both pipelines on one split and score them on its test part.

    Parameters
    ----------
    train_index, test_index
        Index arrays selecting the training and testing rows of ``X`` and ``y``.
    X, y
        Feature matrix and labels, indexed with the arrays above.

    Returns
    -------
    tuple
        ``(rf_roc, lr_roc, rf_pr, lr_pr)``: ROC AUC and average precision of
        the random-forest and logistic-regression pipelines.
    """
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    # The random forest is built, fitted and scored before the logistic
    # regression, matching the order of the returned scores' model names.
    scores = {}
    for name, build_pipeline in (('rf', get_rf_pipeline), ('lr', get_lr_pipeline)):
        model = build_pipeline()
        model.fit(X_train, y_train)

        # Second column of predict_proba, used as the score for both metrics.
        positive_proba = model.predict_proba(X_test)[:, 1]

        roc = roc_auc_score(y_test, positive_proba)
        pr = average_precision_score(y_test, positive_proba)
        scores[name] = (roc, pr)

    rf_roc, rf_pr = scores['rf']
    lr_roc, lr_pr = scores['lr']

    return rf_roc, lr_roc, rf_pr, lr_pr

# Ten shuffled folds; random_state fixes the fold assignment.
kf = KFold(n_splits=10, shuffle=True, random_state=37)

# One row of scores per fold, in fold order.
results = [do_validation(train_index, test_index, X, y)
           for train_index, test_index in kf.split(X, y)]
# Column names follow the order of the tuple returned by do_validation.
df = pd.DataFrame(results, columns=['rf_roc', 'lr_roc', 'rf_pr', 'lr_pr'])
df
[20]:
rf_roc lr_roc rf_pr lr_pr
0 0.950821 0.947516 0.942587 0.908969
1 0.961735 0.963085 0.944504 0.939772
2 0.982785 0.976779 0.983521 0.976502
3 0.962316 0.971975 0.948128 0.966669
4 0.975946 0.956924 0.981051 0.950014
5 0.964668 0.971174 0.962890 0.976565
6 0.971955 0.969451 0.938301 0.944873
7 0.964643 0.955729 0.972513 0.944792
8 0.992299 0.990599 0.994892 0.990050
9 0.940694 0.940994 0.943322 0.927838
[21]:
# Average each score column across the folds.
df.mean()
[21]:
rf_roc    0.966786
lr_roc    0.964423
rf_pr     0.961171
lr_pr     0.952604
dtype: float64