14. Validation
14.1. Splitting data
[1]:
# Toy dataset shared by the splitting examples below:
# six samples with two features each, a binary label per sample (three of
# each class) and a group id per sample (groups 0, 1 and 2, two samples each).
import numpy as np
X = np.array([
[1, 2],
[3, 4],
[5, 6],
[7, 8],
[9, 9],
[7, 7]
])
y = np.array([0, 0, 0, 1, 1, 1])
groups = np.array([0, 1, 2, 2, 1, 0])
14.1.1. Group k-fold
[2]:
# GroupKFold with three folds on three groups: each split holds out the
# samples of one group as the test set, so no group is shared between the
# train and test indices of a split.
from sklearn.model_selection import GroupKFold
gkf = GroupKFold(n_splits=3)
for train_index, test_index in gkf.split(X, y, groups):
print(f'TRAIN: {train_index}, TEST: {test_index}')
TRAIN: [0 1 4 5], TEST: [2 3]
TRAIN: [0 2 3 5], TEST: [1 4]
TRAIN: [1 2 3 4], TEST: [0 5]
14.1.2. Group shuffle split
[3]:
# GroupShuffleSplit draws three randomized group-wise splits; as the recorded
# output shows, the same group can be held out in more than one split.
from sklearn.model_selection import GroupShuffleSplit
gss = GroupShuffleSplit(n_splits=3, random_state=37)
for train_index, test_index in gss.split(X, y, groups):
print(f'TRAIN: {train_index}, TEST: {test_index}')
TRAIN: [0 1 4 5], TEST: [2 3]
TRAIN: [0 1 4 5], TEST: [2 3]
TRAIN: [0 2 3 5], TEST: [1 4]
14.1.3. Leave one group out
[4]:
# LeaveOneGroupOut produces one split per group, with that group's samples
# as the test set and all remaining samples as the training set.
from sklearn.model_selection import LeaveOneGroupOut
logo = LeaveOneGroupOut()
for train_index, test_index in logo.split(X, y, groups):
print(f'TRAIN: {train_index}, TEST: {test_index}')
TRAIN: [1 2 3 4], TEST: [0 5]
TRAIN: [0 2 3 5], TEST: [1 4]
TRAIN: [0 1 4 5], TEST: [2 3]
14.1.4. Leave p-groups out
[5]:
# LeavePGroupsOut with n_groups=2: each split holds out two of the three
# groups for testing and trains on the single remaining group.
from sklearn.model_selection import LeavePGroupsOut
lpgo = LeavePGroupsOut(n_groups=2)
for train_index, test_index in lpgo.split(X, y, groups):
print(f'TRAIN: {train_index}, TEST: {test_index}')
TRAIN: [2 3], TEST: [0 1 4 5]
TRAIN: [1 4], TEST: [0 2 3 5]
TRAIN: [0 5], TEST: [1 2 3 4]
14.1.5. Leave one out
[6]:
# LeaveOneOut: one split per sample, with that single sample as the test set.
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
for train_index, test_index in loo.split(X, y):
print(f'TRAIN: {train_index}, TEST: {test_index}')
TRAIN: [1 2 3 4 5], TEST: [0]
TRAIN: [0 2 3 4 5], TEST: [1]
TRAIN: [0 1 3 4 5], TEST: [2]
TRAIN: [0 1 2 4 5], TEST: [3]
TRAIN: [0 1 2 3 5], TEST: [4]
TRAIN: [0 1 2 3 4], TEST: [5]
14.1.6. Leave p-out
[7]:
# LeavePOut with p=3: one split for every way of choosing three test samples
# out of the six, which gives the 20 splits listed in the output.
from sklearn.model_selection import LeavePOut
lpo = LeavePOut(p=3)
for train_index, test_index in lpo.split(X, y):
print(f'TRAIN: {train_index}, TEST: {test_index}')
TRAIN: [3 4 5], TEST: [0 1 2]
TRAIN: [2 4 5], TEST: [0 1 3]
TRAIN: [2 3 5], TEST: [0 1 4]
TRAIN: [2 3 4], TEST: [0 1 5]
TRAIN: [1 4 5], TEST: [0 2 3]
TRAIN: [1 3 5], TEST: [0 2 4]
TRAIN: [1 3 4], TEST: [0 2 5]
TRAIN: [1 2 5], TEST: [0 3 4]
TRAIN: [1 2 4], TEST: [0 3 5]
TRAIN: [1 2 3], TEST: [0 4 5]
TRAIN: [0 4 5], TEST: [1 2 3]
TRAIN: [0 3 5], TEST: [1 2 4]
TRAIN: [0 3 4], TEST: [1 2 5]
TRAIN: [0 2 5], TEST: [1 3 4]
TRAIN: [0 2 4], TEST: [1 3 5]
TRAIN: [0 2 3], TEST: [1 4 5]
TRAIN: [0 1 5], TEST: [2 3 4]
TRAIN: [0 1 4], TEST: [2 3 5]
TRAIN: [0 1 3], TEST: [2 4 5]
TRAIN: [0 1 2], TEST: [3 4 5]
14.1.7. K-fold
[8]:
# KFold with two shuffled folds: every sample is a test sample exactly once.
# The labels play no role in the assignment; in the recorded output each test
# fold happens to contain samples of a single class only.
from sklearn.model_selection import KFold
kf = KFold(n_splits=2, shuffle=True, random_state=37)
for train_index, test_index in kf.split(X, y):
print(f'TRAIN: {train_index}, TEST: {test_index}')
TRAIN: [3 4 5], TEST: [0 1 2]
TRAIN: [0 1 2], TEST: [3 4 5]
14.1.8. Stratified k-fold
[9]:
# StratifiedKFold uses y when assigning folds: in the recorded output both
# the train and the test indices of each split contain samples of both classes.
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=37)
for train_index, test_index in skf.split(X, y):
print(f'TRAIN: {train_index}, TEST: {test_index}')
TRAIN: [0 3 4], TEST: [1 2 5]
TRAIN: [1 2 5], TEST: [0 3 4]
14.1.9. Shuffle split
[10]:
# ShuffleSplit draws two independent random train/test partitions. No
# test_size is given, and on these six samples each split holds out one sample.
from sklearn.model_selection import ShuffleSplit
ss = ShuffleSplit(n_splits=2, random_state=37)
for train_index, test_index in ss.split(X, y):
print(f'TRAIN: {train_index}, TEST: {test_index}')
TRAIN: [1 0 5 4 3], TEST: [2]
TRAIN: [1 5 4 3 2], TEST: [0]
14.1.10. Stratified shuffle split
[11]:
from sklearn.model_selection import StratifiedShuffleSplit

# Stratified counterpart of ShuffleSplit: random train/test partitions that
# take the class labels in y into account.
# test_size=2 is set explicitly because the default test fraction amounts to
# a single test sample on six rows, which is fewer than the two classes in y
# and is rejected by StratifiedShuffleSplit.
# NOTE(review): the TRAIN/TEST lines recorded below this cell were produced
# by a plain ShuffleSplit call; re-run the cell to refresh them.
sss = StratifiedShuffleSplit(n_splits=2, test_size=2, random_state=37)
for train_index, test_index in sss.split(X, y):
    print(f'TRAIN: {train_index}, TEST: {test_index}')
TRAIN: [1 0 5 4 3], TEST: [2]
TRAIN: [1 5 4 3 2], TEST: [0]
14.1.11. Predefined split
[12]:
# PredefinedSplit: test_fold[i] names the fold in which sample i is a test
# sample. Two distinct fold values (0 and 1) give two splits.
from sklearn.model_selection import PredefinedSplit
test_fold = np.array([1, 1, 0, 1, 1, 0])
ps = PredefinedSplit(test_fold)
for train_index, test_index in ps.split(X, y):
print(f'TRAIN: {train_index}, TEST: {test_index}')
TRAIN: [0 1 3 4], TEST: [2 5]
TRAIN: [2 5], TEST: [0 1 3 4]
[13]:
# Same splitter with three distinct fold values (0, 1, 2), giving three splits.
test_fold = np.array([0, 1, 2, 0, 1, 2])
ps = PredefinedSplit(test_fold)
for train_index, test_index in ps.split(X, y):
print(f'TRAIN: {train_index}, TEST: {test_index}')
TRAIN: [1 2 4 5], TEST: [0 3]
TRAIN: [0 2 3 5], TEST: [1 4]
TRAIN: [0 1 3 4], TEST: [2 5]
14.1.12. Customized splitter
Here is a customized splitter that resembles the PredefinedSplit class. With a two-valued test_fold such as the one in the first PredefinedSplit example above, get_n_splits() returns 2, because every distinct fold value takes a turn as the test set. However, sometimes you want only 1 split (e.g. for testing and validating, where the validation fold is used for hyperparameter tuning).
[14]:
class SimpleSplitter:
    """Single-split cross-validator driven by a per-sample 0/1 flag list.

    Samples flagged ``0`` form the training fold and samples flagged ``1``
    form the test fold; any other flag value is left out of both.
    """

    def __init__(self, test_fold):
        # One flag per sample, in sample order.
        self.test_fold = test_fold

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of splits, which is always 1."""
        return 1

    def split(self, X=None, y=None, groups=None):
        """Yield the single ``(train_indices, test_indices)`` pair.

        Both elements are NumPy arrays of sample positions. ``X``, ``y`` and
        ``groups`` are accepted but not used.
        """
        train_positions = []
        test_positions = []
        for position, flag in enumerate(self.test_fold):
            if flag == 0:
                train_positions.append(position)
            elif flag == 1:
                test_positions.append(position)
        yield np.array(train_positions), np.array(test_positions)
[15]:
# Flags: 0 marks a training sample, 1 marks a test sample; exactly one split
# is produced.
simple_splitter = SimpleSplitter([0, 0, 1, 0, 0, 1])
for train_index, test_index in simple_splitter.split(X, y):
print(f'TRAIN: {train_index}, TEST: {test_index}')
TRAIN: [0 1 3 4], TEST: [2 5]
Here is another splitter, where we have one-versus-rest folds. The training fold stays constant while the testing folds change. Note that in this customized splitter we provide the indices themselves (not 0/1 flags corresponding to the indices) to the splitter.
[16]:
class OvrSplitter:
    """Cross-validator pairing one fixed training fold with several test folds.

    Parameters
    ----------
    train_fold : sequence of int
        Sample indices used for training in every split.
    test_folds : sequence of sequences of int
        One collection of sample indices per split, used as that split's
        test set.
    """

    def __init__(self, train_fold, test_folds):
        self.train_fold = train_fold
        self.test_folds = test_folds

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of splits, i.e. the number of test folds."""
        # Must read the instance attribute: a bare ``test_folds`` is not
        # defined in this scope and would raise NameError (or silently pick
        # up an unrelated global of the same name).
        return len(self.test_folds)

    def split(self, X=None, y=None, groups=None):
        """Yield ``(train_fold, test_fold)`` once per configured test fold.

        The index collections are yielded as given to the constructor.
        ``X``, ``y`` and ``groups`` are accepted but not used.
        """
        for test_fold in self.test_folds:
            yield self.train_fold, test_fold
[17]:
# Training indices [0, 1] are reused for both splits; the test indices are
# [2, 3] in the first split and [4, 5] in the second.
ovr_splitter = OvrSplitter([0, 1], [[2, 3], [4, 5]])
for train_index, test_index in ovr_splitter.split(X, y):
print(f'TRAIN: {train_index}, TEST: {test_index}')
TRAIN: [0, 1], TEST: [2, 3]
TRAIN: [0, 1], TEST: [4, 5]
14.2. K-fold cross validation example
14.2.1. Data
[18]:
import numpy as np
from random import randint
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedShuffleSplit
from collections import namedtuple
def get_data(n_features=20, n_samples=2000, n_missing=100):
    """Build a synthetic binary-classification dataset with missing values.

    Parameters
    ----------
    n_features : int
        Number of feature columns.
    n_samples : int
        Number of rows.
    n_missing : int
        Number of distinct cells of ``X`` to overwrite with ``np.nan``.
        Values of zero or less leave ``X`` untouched.

    Returns
    -------
    X, y
        The feature matrix (with ``n_missing`` NaN cells) and the labels
        returned by ``make_classification``.

    Raises
    ------
    ValueError
        If ``n_missing`` exceeds the number of cells in ``X``.
    """
    n_cells = n_samples * n_features
    if n_missing > n_cells:
        raise ValueError(
            f'n_missing={n_missing} exceeds the {n_cells} cells available'
        )

    X, y = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=2,
        n_redundant=2,
        n_repeated=0,
        n_classes=2,
        n_clusters_per_class=2,
        random_state=37,
    )

    if n_missing > 0:
        # Draw flat cell numbers without replacement so every coordinate is
        # distinct. NumPy's global RNG is used (rather than the stdlib
        # `random` module) so that np.random.seed() controls the positions.
        flat = np.random.choice(n_cells, size=n_missing, replace=False)
        X[flat // n_features, flat % n_features] = np.nan
    return X, y


# Seed NumPy's global RNG, which get_data uses to place the missing values.
np.random.seed(37)
X, y = get_data()
14.2.2. Pipeline
[19]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import roc_auc_score, average_precision_score
def get_rf_pipeline():
    """Return an unfitted impute -> scale -> PCA -> random-forest pipeline.

    Every stochastic step is seeded with 37 so repeated runs give the same
    model.
    """
    imputer = IterativeImputer(missing_values=np.nan, random_state=37)
    scaler = StandardScaler()
    pca = PCA(n_components=3, random_state=37)
    # Seeded like the imputer and PCA; without it the forest is the only
    # step whose result changes from run to run.
    rf = RandomForestClassifier(n_estimators=100, random_state=37)
    pipeline = Pipeline([
        ('imputer', imputer),
        ('scaler', scaler),
        ('pca', pca),
        ('rf', rf),
    ])
    return pipeline
def get_lr_pipeline():
    """Return an unfitted impute -> scale -> L1 logistic-regression pipeline."""
    steps = [
        ('imputer', IterativeImputer(missing_values=np.nan, random_state=37)),
        ('scaler', StandardScaler()),
        ('lr', LogisticRegression(penalty='l1', solver='liblinear')),
    ]
    return Pipeline(steps)
14.2.3. Validation
[20]:
import pandas as pd
def do_validation(train_index, test_index, X, y):
    """Fit both pipelines on one train/test split and score them.

    The random-forest pipeline is fitted and scored first, then the
    logistic-regression pipeline. Scores are computed from the predicted
    probability in column 1 of ``predict_proba``.

    Returns
    -------
    tuple
        ``(rf_roc, lr_roc, rf_pr, lr_pr)``: ROC AUC and average precision
        of each pipeline on the test rows.
    """
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    def evaluate(model):
        # Fit on the training rows, then score the held-out rows.
        model.fit(X_train, y_train)
        scores = model.predict_proba(X_test)[:, 1]
        return (
            roc_auc_score(y_test, scores),
            average_precision_score(y_test, scores),
        )

    rf_roc, rf_pr = evaluate(get_rf_pipeline())
    lr_roc, lr_pr = evaluate(get_lr_pipeline())
    return rf_roc, lr_roc, rf_pr, lr_pr
# 10-fold shuffled cross-validation: each fold contributes one row of
# (rf_roc, lr_roc, rf_pr, lr_pr) scores, collected into a DataFrame.
kf = KFold(n_splits=10, shuffle=True, random_state=37)
results = [do_validation(train_index, test_index, X, y)
for train_index, test_index in kf.split(X, y)]
df = pd.DataFrame(results, columns=['rf_roc', 'lr_roc', 'rf_pr', 'lr_pr'])
# Bare expression: shows the per-fold score table as the cell output.
df
[20]:
| | rf_roc | lr_roc | rf_pr | lr_pr |
---|---|---|---|---|
0 | 0.950821 | 0.947516 | 0.942587 | 0.908969 |
1 | 0.961735 | 0.963085 | 0.944504 | 0.939772 |
2 | 0.982785 | 0.976779 | 0.983521 | 0.976502 |
3 | 0.962316 | 0.971975 | 0.948128 | 0.966669 |
4 | 0.975946 | 0.956924 | 0.981051 | 0.950014 |
5 | 0.964668 | 0.971174 | 0.962890 | 0.976565 |
6 | 0.971955 | 0.969451 | 0.938301 | 0.944873 |
7 | 0.964643 | 0.955729 | 0.972513 | 0.944792 |
8 | 0.992299 | 0.990599 | 0.994892 | 0.990050 |
9 | 0.940694 | 0.940994 | 0.943322 | 0.927838 |
[21]:
# Column-wise mean of the per-fold scores, one value per metric.
df.mean()
[21]:
rf_roc 0.966786
lr_roc 0.964423
rf_pr 0.961171
lr_pr 0.952604
dtype: float64