9. Dimensionality Reduction

9.1. Generate data

[1]:
import numpy as np
from numpy.random import normal
from collections import namedtuple

# Container for a design matrix X and its class labels y.
Data = namedtuple('Data', 'X y')

# Seed the global NumPy generator so the draws below are reproducible.
np.random.seed(37)

def get_data(N=1000):
    """
    Draw a synthetic two-class data set with six features per row.

    Within each class x1 depends linearly on x0, x2 on x1 and x4 on x3,
    while x5 is drawn independently. The two classes use different
    coefficients for these relationships.

    Parameters
    ----------
    N : int
        Number of rows to draw for *each* class (the result has 2 * N rows).

    Returns
    -------
    Data
        ``X`` has shape (2 * N, 6): the N class-0 rows followed by the N
        class-1 rows. ``y`` has shape (2 * N,) and holds 0 for the first N
        rows and 1 for the rest.
    """
    # --- class 0 ---
    x0 = normal(1, 1, N)
    x1 = normal(3 + 2 * x0, 1, N)
    # The scale has to be given explicitly: in normal(loc, N) the N would be
    # read as the standard deviation rather than as the sample size. Unit
    # noise is used here, matching x1.
    x2 = normal(2.5 + 0.8 * x1, 1, N)
    x3 = normal(5, 2, N)
    x4 = normal(0.8 + 0.8 * x3, 1, N)
    x5 = normal(10, 5, N)

    # one column per feature
    X0 = np.vstack([x0, x1, x2, x3, x4, x5]).T

    # --- class 1 ---
    x0 = normal(1.5, 1, N)
    x1 = normal(7.3 + 2 * x0, 1, N)
    x2 = normal(3.6 + 0.9 * x1, 1, N)
    x3 = normal(3.3, 2.5, N)
    x4 = normal(0.9 - 0.4 * x3, 1, N)
    x5 = normal(10, 5, N)

    X1 = np.vstack([x0, x1, x2, x3, x4, x5]).T

    X = np.vstack([X0, X1])
    y = np.hstack([np.full(X0.shape[0], 0), np.full(X1.shape[0], 1)])

    return Data(X, y)

# Two independent samples drawn one after the other:
# T is the training data, V is the validation data.
T, V = get_data(), get_data(N=1000)

9.2. Types of dimensionality reduction

9.2.1. Principal Component Analysis (PCA)

[2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardise the features, then keep the first three principal components.
scaler = StandardScaler()
pca = PCA(n_components=3, random_state=37)

pipeline = Pipeline(steps=[
    ('scaler', scaler),
    ('pca', pca),
]).fit(T.X)
pipeline
[2]:
Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('pca',
                 PCA(copy=True, iterated_power='auto', n_components=3,
                     random_state=37, svd_solver='auto', tol=0.0,
                     whiten=False))],
         verbose=False)

9.2.2. Kernel PCA

[3]:
from sklearn.decomposition import KernelPCA

# Kernel PCA with a linear kernel, fitted on the raw training features.
kpca = KernelPCA(kernel='linear', n_components=3, random_state=37).fit(T.X)
kpca
[3]:
KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
          fit_inverse_transform=False, gamma=None, kernel='linear',
          kernel_params=None, max_iter=None, n_components=3, n_jobs=None,
          random_state=37, remove_zero_eig=False, tol=0)

9.2.3. Singular Value Decomposition (SVD)

[4]:
from sklearn.decomposition import TruncatedSVD

# Rank-3 truncated SVD, fitted on the raw training features.
svd = TruncatedSVD(random_state=37, n_components=3).fit(T.X)
svd
[4]:
TruncatedSVD(algorithm='randomized', n_components=3, n_iter=5, random_state=37,
             tol=0.0)

9.2.4. Factor analysis

[5]:
from sklearn.decomposition import FactorAnalysis

# Factor analysis with two latent factors, fitted on the raw training features.
fa = FactorAnalysis(random_state=37, n_components=2).fit(T.X)
fa
[5]:
FactorAnalysis(copy=True, iterated_power=3, max_iter=1000, n_components=2,
               noise_variance_init=None, random_state=37,
               svd_method='randomized', tol=0.01)

9.2.5. Non-Negative Matrix Factorization (NMF)

[6]:
from sklearn.decomposition import NMF

# The training features are passed through np.abs so that every entry is
# non-negative before the factorisation is fitted.
# NOTE(review): presumably done because NMF requires non-negative input;
# confirm that folding negative values is the intended treatment.
nmf = NMF(random_state=37, n_components=3).fit(np.abs(T.X))
nmf
[6]:
NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=3, random_state=37, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

9.3. Performance

[7]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def get_model(name, k):
    """
    Build an unfitted dimensionality-reduction estimator.

    Parameters
    ----------
    name : str
        One of 'pca', 'kpca', 'fa' or 'svd'.
    k : int
        Number of components to keep.

    Returns
    -------
    A new PCA, KernelPCA (linear kernel), FactorAnalysis or TruncatedSVD
    instance, each created with random_state=37.

    Raises
    ------
    Exception
        If ``name`` is not one of the supported keys.
    """
    # Ordered (key, factory) pairs; the lambdas defer construction so only
    # the requested estimator is ever instantiated.
    factories = (
        ('pca', lambda: PCA(n_components=k, random_state=37)),
        ('kpca', lambda: KernelPCA(n_components=k, random_state=37, kernel='linear')),
        ('fa', lambda: FactorAnalysis(n_components=k, random_state=37)),
        ('svd', lambda: TruncatedSVD(n_components=k, random_state=37)),
    )
    for key, make in factories:
        if key == name:
            return make()
    raise Exception(f'Unknown model: {name}')

def get_score(name, k, T, V):
    """
    Score one reducer/component-count combination.

    The reducer selected by ``name`` is fitted on ``T.X`` with ``k``
    components, a 2-cluster k-means is fitted on the reduced training data,
    and the silhouette score of the k-means assignments is computed on the
    reduced validation data ``V.X``.

    Parameters
    ----------
    name : str
        Reducer key understood by ``get_model``.
    k : int
        Number of components.
    T, V : objects with an ``X`` attribute
        Training and validation data.

    Returns
    -------
    The value returned by ``silhouette_score`` for the validation data.
    """
    reducer = get_model(name, k)
    reducer.fit(T.X)

    # cluster in the reduced training space
    clusterer = KMeans(n_clusters=2, random_state=37)
    clusterer.fit(reducer.transform(T.X))

    # evaluate the cluster assignments on the reduced validation data
    V_reduced = reducer.transform(V.X)
    return silhouette_score(V_reduced, clusterer.predict(V_reduced))

def get_model_scores(name, T, V, max_k=6):
    """
    Collect the scores of one reducer for k = 1 .. max_k - 1.

    Returns a tuple whose first element is the estimator's class name and
    whose remaining elements are the ``get_score`` results in increasing
    order of k.
    """
    # instantiate once only to read the class name for the row label
    row = [type(get_model(name, 2)).__name__]
    for k in range(1, max_k):
        row.append(get_score(name, k, T, V))
    return tuple(row)

# Exclusive upper bound on the number of components: k runs over 1 .. max_k - 1.
max_k = 6
names = ['pca', 'kpca', 'fa', 'svd']
columns = ['model'] + [f'silhouette_k_{k}' for k in range(1, max_k)]

# Pass max_k through (rather than repeating the literal) so that the number
# of scores per row always matches the number of score columns.
rows = [get_model_scores(name, T, V, max_k=max_k) for name in names]
df = pd.DataFrame(rows, columns=columns)
df
[7]:
model silhouette_k_1 silhouette_k_2 silhouette_k_3 silhouette_k_4 silhouette_k_5
0 PCA 0.552847 0.309375 0.309366 0.309361 0.309414
1 KernelPCA 0.552847 0.309375 0.309366 0.309361 0.309414
2 FactorAnalysis 0.589545 0.306878 0.219110 0.170068 0.133372
3 TruncatedSVD 0.552842 0.309375 0.309368 0.309362 0.309414