# 7. Clustering

## 7.1. Generate data

[1]:

import numpy as np
from numpy.random import normal
from collections import namedtuple

Data = namedtuple('Data', 'X y')

np.random.seed(37)

def get_data(means, variances, labels, N=1000):
    """Sample a labeled Gaussian dataset with one Gaussian per label.

    :param means: per-cluster lists of per-feature means.
    :param variances: per-cluster lists of per-feature spreads (passed as
        numpy ``normal``'s ``scale`` argument, i.e. standard deviations).
    :param labels: one integer label per cluster.
    :param N: number of samples drawn per cluster.
    :return: ``Data(X, y)`` with X of shape (len(labels) * N, n_features)
        and y of shape (len(labels) * N,).
    """
    def get_X(sample_means, sample_variances, N):
        # one normal draw per feature, stacked as columns
        return np.hstack([normal(m, v, N).reshape(-1, 1)
                          for m, v in zip(sample_means, sample_variances)])

    def get_y(label, N):
        # np.int was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin int is the supported equivalent
        return np.full(N, label, dtype=int)

    X = np.vstack([get_X(m, v, N) for m, v in zip(means, variances)])
    y = np.hstack([get_y(label, N) for label in labels])

    return Data(X, y)

# Training and validation sets are independent draws from the same
# two-Gaussian mixture, so the clustering structure is identical.
_mixture = dict(means=[[5.0, 5.0], [6.0, 6.0]],
                variances=[[1.0, 1.0], [1.0, 1.0]],
                labels=[0, 1])

T = get_data(**_mixture)  # training
V = get_data(**_mixture)  # validation


## 7.2. Types of clustering

### 7.2.1. K-means

[2]:

from sklearn.cluster import KMeans

# K-means with the known number of clusters (2); random_state fixes the
# centroid initialization so the fit is reproducible.
km = KMeans(n_clusters=2, random_state=37)
km.fit(T.X)

[2]:

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
random_state=37, tol=0.0001, verbose=0)


### 7.2.2. Affinity propagation

[3]:

from sklearn.cluster import AffinityPropagation

# Affinity propagation with default settings; note it takes no
# n_clusters argument — the number of clusters is determined by the fit.
ap = AffinityPropagation()
ap.fit(T.X)

[3]:

AffinityPropagation(affinity='euclidean', convergence_iter=15, copy=True,
damping=0.5, max_iter=200, preference=None, verbose=False)


### 7.2.3. Mean-shift

[4]:

from sklearn.cluster import MeanShift

# Mean-shift with default settings (bandwidth=None per the repr below,
# i.e. estimated from the data); cluster count is not specified.
ms = MeanShift()
ms.fit(T.X)

[4]:

MeanShift(bandwidth=None, bin_seeding=False, cluster_all=True, min_bin_freq=1,
n_jobs=None, seeds=None)


### 7.2.4. Spectral

[5]:

from sklearn.cluster import SpectralClustering

# Spectral clustering into 2 clusters; random_state makes the stochastic
# label-assignment step (assign_labels='kmeans') reproducible.
sc = SpectralClustering(n_clusters=2, random_state=37)
sc.fit(T.X)

[5]:

SpectralClustering(affinity='rbf', assign_labels='kmeans', coef0=1, degree=3,
eigen_solver=None, eigen_tol=0.0, gamma=1.0,
kernel_params=None, n_clusters=2, n_init=10, n_jobs=None,
n_neighbors=10, random_state=37)


### 7.2.5. Agglomerative

[6]:

from sklearn.cluster import AgglomerativeClustering

# Agglomerative (hierarchical) clustering, merging until 2 clusters remain.
ac = AgglomerativeClustering(n_clusters=2)
ac.fit(T.X)

[6]:

AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
connectivity=None, distance_threshold=None,
pooling_func='deprecated')


### 7.2.6. DBSCAN

[7]:

from sklearn.cluster import DBSCAN

# DBSCAN with defaults (eps=0.5, min_samples=5 per the repr below);
# density-based, so the cluster count is discovered, not specified.
db = DBSCAN()
db.fit(T.X)

[7]:

DBSCAN(algorithm='auto', eps=0.5, leaf_size=30, metric='euclidean',
metric_params=None, min_samples=5, n_jobs=None, p=None)


### 7.2.7. OPTICS

[8]:

from sklearn.cluster import OPTICS

# OPTICS with default settings; density-based like DBSCAN, with the
# cluster count determined by the fit rather than given up front.
op = OPTICS()
op.fit(T.X)

[8]:

OPTICS(algorithm='auto', cluster_method='xi', eps=None, leaf_size=30,
max_eps=inf, metric='minkowski', metric_params=None,
min_cluster_size=None, min_samples=5, n_jobs=None, p=2,
predecessor_correction=True, xi=0.05)


### 7.2.8. Birch

[9]:

from sklearn.cluster import Birch

# Birch clustering, reduced to a final 2 clusters.
birch = Birch(n_clusters=2)
birch.fit(T.X)

[9]:

Birch(branching_factor=50, compute_labels=True, copy=True, n_clusters=2,
threshold=0.5)


## 7.3. Performance

[10]:

import pandas as pd
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, \
completeness_score, homogeneity_score, silhouette_score, \
v_measure_score

def get_scoring_functions():
    """Return the two families of clustering score functions.

    :return: (gfuncs, afuncs) where grouping functions are called as
        ``f(X, labels_pred)`` and assignment functions as
        ``f(labels_true, labels_pred)``.
    """
    # grouping (internal) score functions, signature f(X, labels).
    # calinski_harabasz_score also takes (X, labels) per the sklearn API,
    # so it belongs here — listing it with the label-agreement metrics
    # would feed it (y_true, y_pred) instead of the feature matrix.
    gfuncs = [davies_bouldin_score, silhouette_score, calinski_harabasz_score]
    # cluster assignment (external) score functions, signature f(y_true, y_pred)
    afuncs = [completeness_score, homogeneity_score, v_measure_score]
    return gfuncs, afuncs

def get_tuple_cols():
    """Column names for the scores table: 'model' first, then every
    scoring-function name in grouping-then-assignment order."""
    gfuncs, afuncs = get_scoring_functions()
    cols = ['model']
    for func in gfuncs:
        cols.append(func.__name__)
    for func in afuncs:
        cols.append(func.__name__)
    return cols

def get_scores(model_name, X, y_true, y_pred):
    """Score one model's clustering with every scoring function.

    :param model_name: display name used for the 'model' column.
    :param X: feature matrix, consumed by the grouping (internal) metrics.
    :param y_true: ground-truth labels for the assignment metrics.
    :param y_pred: predicted cluster labels.
    :return: tuple of values ordered to match get_tuple_cols().
    """
    def do_scoring(f, y_true, y_pred):
        # Some metrics expect a 2-D first argument and raise ValueError on
        # a 1-D label vector; retry with y_true as a single-feature column.
        # Catch only ValueError — a bare except would also swallow
        # KeyboardInterrupt, SystemExit, and genuine bugs.
        try:
            return f(y_true, y_pred)
        except ValueError:
            return f(y_true.reshape(-1, 1), y_pred)

    gfuncs, afuncs = get_scoring_functions()

    # grouping metrics score the predicted partition against the features
    gscores = {f.__name__: f(X, y_pred) for f in gfuncs}
    # assignment metrics score agreement with the true labels
    ascores = {f.__name__: do_scoring(f, y_true, y_pred) for f in afuncs}

    d = {**gscores, **ascores}
    d['model'] = model_name

    return tuple(d[c] for c in get_tuple_cols())

def get_predictions(model, X):
    """Return cluster labels for X.

    Estimators without an out-of-sample predict method are dispatched to
    fit_predict (they re-cluster X directly); everything else uses predict.
    """
    fit_predict_models = ['SpectralClustering', 'AgglomerativeClustering', 'DBSCAN', 'OPTICS']
    if type(model).__name__ in fit_predict_models:
        return model.fit_predict(X)
    return model.predict(X)

# Compare the models that were fitted with n_clusters=2 on the held-out
# validation set; the density-based models choose their own cluster count
# and are excluded from this head-to-head.
models = [km, sc, ac, birch]
model_names = [type(m).__name__ for m in models]

y_preds = {name: get_predictions(m, V.X) for name, m in zip(model_names, models)}

scores = [get_scores(name, V.X, V.y, y_preds[name]) for name in model_names]
df = pd.DataFrame(scores, columns=get_tuple_cols())
df

[10]:

model davies_bouldin_score silhouette_score calinski_harabasz_score completeness_score homogeneity_score v_measure_score
0 KMeans 1.107967 0.345717 659.943519 0.187573 0.187416 0.187494
1 SpectralClustering 1.108844 0.344831 664.091301 0.188626 0.188374 0.188500
2 AgglomerativeClustering 1.302874 0.286382 513.321469 0.152997 0.152944 0.152970
3 Birch 1.343323 0.290154 336.963632 0.117488 0.108257 0.112684