# 9. Dimensionality Reduction

## 9.1. Generate data

[1]:

import numpy as np
from numpy.random import normal
from collections import namedtuple

# Simple record pairing a feature matrix X with its class-label vector y.
Data = namedtuple('Data', 'X y')

# Seed NumPy's global generator so every draw below is reproducible.
np.random.seed(37)

def get_data(N=1000):
    """Sample a synthetic two-class data set with six features.

    For each class, N rows are drawn from NumPy's global random generator:
    x1 is centred on a linear function of x0, x2 on a linear function of
    x1, and x4 on a linear function of x3; x0, x3 and x5 are drawn
    independently. The two classes use different coefficients.

    Parameters
    ----------
    N : int
        Number of rows to draw per class.

    Returns
    -------
    Data
        ``X`` has shape (2 * N, 6) with the class-0 rows first; ``y`` has
        shape (2 * N,) and holds 0 for the first N rows and 1 for the rest.
    """
    # NOTE: numpy.random.normal takes (loc, scale, size). The dependent
    # features x2 and x4 must pass an explicit scale; otherwise N lands in
    # the scale slot and the noise standard deviation becomes N.

    # class 0
    x0 = normal(1, 1, N)
    x1 = normal(3 + 2 * x0, 1, N)
    x2 = normal(2.5 + 0.8 * x1, 1, N)
    x3 = normal(5, 2, N)
    x4 = normal(0.8 + 0.8 * x3, 1, N)
    x5 = normal(10, 5, N)
    X0 = np.column_stack([x0, x1, x2, x3, x4, x5])

    # class 1
    x0 = normal(1.5, 1, N)
    x1 = normal(7.3 + 2 * x0, 1, N)
    x2 = normal(3.6 + 0.9 * x1, 1, N)
    x3 = normal(3.3, 2.5, N)
    x4 = normal(0.9 - 0.4 * x3, 1, N)
    x5 = normal(10, 5, N)
    X1 = np.column_stack([x0, x1, x2, x3, x4, x5])

    X = np.vstack([X0, X1])
    # Labels follow the row order of X: zeros for class 0, then ones.
    y = np.repeat([0, 1], [X0.shape[0], X1.shape[0]])

    return Data(X, y)

# Two independent samples from the same generator: T is the training set,
# V the validation set. T is drawn first (tuple elements are evaluated
# left to right), so V continues the same global random stream.
T, V = get_data(), get_data(N=1000)


## 9.2. Types of dimensionality reduction

### 9.2.1. Principal Component Analysis (PCA)

[2]:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Two-stage pipeline: standardise the features, then project onto three
# principal components. Both stages keep their own module-level names.
pca = PCA(random_state=37, n_components=3)
scaler = StandardScaler()

pipeline = Pipeline(steps=[('scaler', scaler), ('pca', pca)])
pipeline.fit(T.X)

[2]:

Pipeline(memory=None,
steps=[('scaler',
StandardScaler(copy=True, with_mean=True, with_std=True)),
('pca',
PCA(copy=True, iterated_power='auto', n_components=3,
random_state=37, svd_solver='auto', tol=0.0,
whiten=False))],
verbose=False)


### 9.2.2. Kernel PCA

[3]:

from sklearn.decomposition import KernelPCA

# Kernel PCA with a linear kernel, three components, fitted directly on
# the training features.
kpca = KernelPCA(
    kernel='linear',
    n_components=3,
    random_state=37,
)
kpca.fit(T.X)

[3]:

KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
fit_inverse_transform=False, gamma=None, kernel='linear',
kernel_params=None, max_iter=None, n_components=3, n_jobs=None,
random_state=37, remove_zero_eig=False, tol=0)


### 9.2.3. Singular Value Decomposition (SVD)

[4]:

from sklearn.decomposition import TruncatedSVD

# Truncated SVD keeping three components, fitted directly on the
# training features.
svd = TruncatedSVD(
    random_state=37,
    n_components=3,
)
svd.fit(T.X)

[4]:

TruncatedSVD(algorithm='randomized', n_components=3, n_iter=5, random_state=37,
tol=0.0)


### 9.2.4. Factor analysis

[5]:

from sklearn.decomposition import FactorAnalysis

# Factor analysis with two latent factors, fitted directly on the
# training features.
fa = FactorAnalysis(
    random_state=37,
    n_components=2,
)
fa.fit(T.X)

[5]:

FactorAnalysis(copy=True, iterated_power=3, max_iter=1000, n_components=2,
noise_variance_init=None, random_state=37,
svd_method='randomized', tol=0.01)


### 9.2.5. Non-Negative Matrix Factorization (NMF)

[6]:

from sklearn.decomposition import NMF

# NMF with three components. The training features are passed through
# np.abs first so the matrix handed to fit() has no negative entries.
nmf = NMF(
    random_state=37,
    n_components=3,
)
nmf.fit(np.abs(T.X))

[6]:

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
n_components=3, random_state=37, shuffle=False, solver='cd', tol=0.0001,
verbose=0)


## 9.3. Performance

[7]:

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def get_model(name, k):
    """Return a new, unfitted reducer for the given short name.

    Parameters
    ----------
    name : str
        One of 'pca', 'kpca', 'fa' or 'svd'.
    k : int
        Number of components, passed to the estimator as ``n_components``.

    Returns
    -------
    A PCA, KernelPCA (linear kernel), FactorAnalysis or TruncatedSVD
    instance, each built with ``random_state=37``.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported short names.
    """
    if name == 'pca':
        return PCA(n_components=k, random_state=37)
    if name == 'kpca':
        return KernelPCA(n_components=k, random_state=37, kernel='linear')
    if name == 'fa':
        return FactorAnalysis(n_components=k, random_state=37)
    if name == 'svd':
        return TruncatedSVD(n_components=k, random_state=37)
    # ValueError is a subclass of Exception, so callers that caught the
    # previous bare Exception keep working.
    raise ValueError(f'Unknown model: {name}')

def get_score(name, k, T, V):
    """Score one reducer at ``k`` components by validation silhouette.

    The reducer named ``name`` is fitted on ``T.X``. A two-cluster KMeans
    is fitted on the reduced training data. ``V.X`` is then reduced with
    the same fitted reducer, assigned to clusters, and the silhouette
    score of that assignment is returned.

    Parameters
    ----------
    name : str
        Short model name understood by ``get_model``.
    k : int
        Number of components for the reducer.
    T, V : objects with an ``X`` attribute
        Training and validation data.

    Returns
    -------
    The value returned by ``silhouette_score`` for the validation data.
    """
    reducer = get_model(name, k)
    reducer.fit(T.X)

    clusterer = KMeans(random_state=37, n_clusters=2)
    clusterer.fit(reducer.transform(T.X))

    projected = reducer.transform(V.X)
    labels = clusterer.predict(projected)
    return silhouette_score(projected, labels)

def get_model_scores(name, T, V, max_k=6):
    """Build one result row: the model's class name followed by its scores.

    Scores are computed with ``get_score`` for every component count from
    1 up to, but not including, ``max_k``.

    Returns
    -------
    tuple
        ``(class_name, score_1, ..., score_{max_k - 1})``.
    """
    # Instantiate a throwaway model only to read its class name.
    row = [type(get_model(name, 2)).__name__]
    for n_components in range(1, max_k):
        row.append(get_score(name, n_components, T, V))
    return tuple(row)

# Component counts 1 .. max_k - 1 are evaluated for every model.
max_k = 6
names = ['pca', 'kpca', 'fa', 'svd']
columns = ['model'] + [f'silhouette_k_{k}' for k in range(1, max_k)]

# Pass the same max_k that sized `columns`, so each row always has exactly
# as many entries as there are column names.
rows = [get_model_scores(name, T, V, max_k=max_k) for name in names]
df = pd.DataFrame(rows, columns=columns)
df

[7]:

model silhouette_k_1 silhouette_k_2 silhouette_k_3 silhouette_k_4 silhouette_k_5
0 PCA 0.552847 0.309375 0.309366 0.309361 0.309414
1 KernelPCA 0.552847 0.309375 0.309366 0.309361 0.309414
2 FactorAnalysis 0.589545 0.306878 0.219110 0.170068 0.133372
3 TruncatedSVD 0.552842 0.309375 0.309368 0.309362 0.309414