9. Dimensionality Reduction
9.1. Generate data
[1]:
import numpy as np
from numpy.random import normal
from collections import namedtuple
# Simple container pairing a feature matrix X with its class labels y.
Data = namedtuple('Data', 'X y')
# Seed the legacy global NumPy generator so every draw below is reproducible.
np.random.seed(37)


def get_data(N=1000):
    """Generate a synthetic two-class data set with six Gaussian features.

    For each class, ``N`` rows are drawn with this dependency structure:
    ``x1`` is centred on a linear function of ``x0``, ``x2`` on a linear
    function of ``x1``, ``x4`` on a linear function of ``x3``, and ``x5`` is
    drawn from the same distribution in both classes. The two classes differ
    only in the means / coefficients used.

    Parameters
    ----------
    N : int, default 1000
        Number of rows to draw *per class* (the result has ``2 * N`` rows).

    Returns
    -------
    Data
        ``Data(X, y)`` where ``X`` has shape ``(2 * N, 6)`` and ``y`` is a
        1-D integer array with ``N`` zeros followed by ``N`` ones.
    """
    # NOTE: ``normal`` is ``normal(loc, scale, size)``. The scale is passed
    # explicitly everywhere; calling ``normal(loc, N)`` would silently use N
    # as the standard deviation and swamp the intended linear relationship.

    # Class 0
    x0 = normal(1, 1, N)
    x1 = normal(3 + 2 * x0, 1, N)
    x2 = normal(2.5 + 0.8 * x1, 1, N)
    x3 = normal(5, 2, N)
    x4 = normal(0.8 + 0.8 * x3, 1, N)
    x5 = normal(10, 5, N)
    X0 = np.column_stack([x0, x1, x2, x3, x4, x5])

    # Class 1 (same structure, different parameters; x4 is negatively
    # related to x3 here)
    x0 = normal(1.5, 1, N)
    x1 = normal(7.3 + 2 * x0, 1, N)
    x2 = normal(3.6 + 0.9 * x1, 1, N)
    x3 = normal(3.3, 2.5, N)
    x4 = normal(0.9 - 0.4 * x3, 1, N)
    x5 = normal(10, 5, N)
    X1 = np.column_stack([x0, x1, x2, x3, x4, x5])

    X = np.vstack([X0, X1])
    # Labels: 0 for every class-0 row, then 1 for every class-1 row.
    y = np.hstack([np.full(X0.shape[0], 0), np.full(X1.shape[0], 1)])
    return Data(X, y)
# training data: first draw from the seeded global generator, using
# get_data's default of N=1000 rows per class
T = get_data()
# validation data: a second draw of the same size; the seed is set only once
# before these calls, so V is a different sample from T
V = get_data(N=1000)
9.2. Types of dimensionality reductions
9.2.1. Principal Component Analysis (PCA)
[2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Chain a standardisation step with a 3-component PCA so that the PCA step
# receives the scaler's output rather than the raw features.
scaler = StandardScaler()
pca = PCA(n_components=3, random_state=37)
pipeline = Pipeline([('scaler', scaler), ('pca', pca)])
# Fit both steps on the training features; as the cell's last expression the
# fitted pipeline is also displayed.
pipeline.fit(T.X)
[2]:
Pipeline(memory=None,
steps=[('scaler',
StandardScaler(copy=True, with_mean=True, with_std=True)),
('pca',
PCA(copy=True, iterated_power='auto', n_components=3,
random_state=37, svd_solver='auto', tol=0.0,
whiten=False))],
verbose=False)
9.2.2. Kernel PCA
[3]:
from sklearn.decomposition import KernelPCA
# Kernel PCA with a linear kernel, keeping 3 components.
# Note: fitted directly on T.X, with no scaling step in front of it.
kpca = KernelPCA(n_components=3, random_state=37, kernel='linear')
kpca.fit(T.X)
[3]:
KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
fit_inverse_transform=False, gamma=None, kernel='linear',
kernel_params=None, max_iter=None, n_components=3, n_jobs=None,
random_state=37, remove_zero_eig=False, tol=0)
9.2.3. Singular Value Decomposition (SVD)
[4]:
from sklearn.decomposition import TruncatedSVD
# Truncated SVD keeping 3 components, fitted directly on the training features.
svd = TruncatedSVD(n_components=3, random_state=37)
svd.fit(T.X)
[4]:
TruncatedSVD(algorithm='randomized', n_components=3, n_iter=5, random_state=37,
tol=0.0)
9.2.4. Factor analysis
[5]:
from sklearn.decomposition import FactorAnalysis
# Factor analysis fitted directly on the training features.
# NOTE(review): n_components=2 here, whereas the other examples in this
# section use 3 -- confirm this difference is intentional.
fa = FactorAnalysis(n_components=2, random_state=37)
fa.fit(T.X)
[5]:
FactorAnalysis(copy=True, iterated_power=3, max_iter=1000, n_components=2,
noise_variance_init=None, random_state=37,
svd_method='randomized', tol=0.01)
9.2.5. Non-Negative Matrix Factorization (NMF)
[6]:
from sklearn.decomposition import NMF
# NMF with 3 components.
nmf = NMF(n_components=3, random_state=37)
# The features are passed through np.abs first, presumably because NMF needs
# non-negative input. NOTE(review): abs() folds negative values onto positive
# ones, so the model is fitted on altered data -- confirm this is acceptable.
nmf.fit(np.abs(T.X))
[6]:
NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
n_components=3, random_state=37, shuffle=False, solver='cd', tol=0.0001,
verbose=0)
9.3. Performance
[7]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
def get_model(name, k):
    """Build an unfitted dimensionality-reduction model by short name.

    Parameters
    ----------
    name : str
        One of ``'pca'``, ``'kpca'``, ``'fa'`` or ``'svd'``.
    k : int
        Number of components, passed to the model as ``n_components``.

    Returns
    -------
    A new ``PCA``, ``KernelPCA`` (linear kernel), ``FactorAnalysis`` or
    ``TruncatedSVD`` instance, each created with ``random_state=37``.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported short names.
    """
    if name == 'pca':
        return PCA(n_components=k, random_state=37)
    if name == 'kpca':
        return KernelPCA(n_components=k, random_state=37, kernel='linear')
    if name == 'fa':
        return FactorAnalysis(n_components=k, random_state=37)
    if name == 'svd':
        return TruncatedSVD(n_components=k, random_state=37)
    # ValueError (a subclass of Exception) is the conventional signal for a
    # bad argument value; callers catching Exception are unaffected.
    raise ValueError(f'Unknown model: {name}')
def get_score(name, k, T, V):
    """Score a k-component reduction by clustering quality on held-out data.

    The model selected by ``name`` is fitted on ``T.X``; a 2-cluster KMeans
    is fitted on the reduced training data; the validation features ``V.X``
    are then reduced with the same model, assigned to clusters, and the
    silhouette score of that assignment is returned.

    Parameters
    ----------
    name : str
        Short model name understood by ``get_model``.
    k : int
        Number of components for the reduction.
    T, V : objects with an ``X`` attribute
        Training and validation data.

    Returns
    -------
    The value returned by ``silhouette_score`` for the validation projection.
    """
    reducer = get_model(name, k)
    reducer.fit(T.X)
    train_projection = reducer.transform(T.X)

    clusterer = KMeans(n_clusters=2, random_state=37)
    clusterer.fit(train_projection)

    valid_projection = reducer.transform(V.X)
    cluster_labels = clusterer.predict(valid_projection)
    return silhouette_score(valid_projection, cluster_labels)
def get_model_scores(name, T, V, max_k=6):
    """Collect silhouette scores for one model across component counts.

    Parameters
    ----------
    name : str
        Short model name understood by ``get_model``.
    T, V : objects with an ``X`` attribute
        Training and validation data, forwarded to ``get_score``.
    max_k : int, default 6
        Exclusive upper bound: scores are computed for ``k = 1 .. max_k - 1``.

    Returns
    -------
    tuple
        ``(class_name, score_k1, score_k2, ...)`` where ``class_name`` is the
        class name of the model that ``get_model`` builds for ``name``.
    """
    # Instantiate a throwaway model purely to read its class name.
    label = type(get_model(name, 2)).__name__
    per_k_scores = []
    for k in range(1, max_k):
        per_k_scores.append(get_score(name, k, T, V))
    return (label, *per_k_scores)
# Exclusive upper bound on the number of components: k runs over 1 .. max_k - 1.
max_k = 6
names = ['pca', 'kpca', 'fa', 'svd']
columns = ['model'] + [f'silhouette_k_{k}' for k in range(1, max_k)]
# Pass the same max_k that built the column names so rows and columns always
# have matching widths if max_k is changed.
df = pd.DataFrame(
    [get_model_scores(name, T, V, max_k=max_k) for name in names],
    columns=columns,
)
df
[7]:
| model | silhouette_k_1 | silhouette_k_2 | silhouette_k_3 | silhouette_k_4 | silhouette_k_5 |
---|---|---|---|---|---|---|
0 | PCA | 0.552847 | 0.309375 | 0.309366 | 0.309361 | 0.309414 |
1 | KernelPCA | 0.552847 | 0.309375 | 0.309366 | 0.309361 | 0.309414 |
2 | FactorAnalysis | 0.589545 | 0.306878 | 0.219110 | 0.170068 | 0.133372 |
3 | TruncatedSVD | 0.552842 | 0.309375 | 0.309368 | 0.309362 | 0.309414 |