7. Clustering
7.1. Generate data
[1]:
import numpy as np
from numpy.random import normal
from collections import namedtuple

# Lightweight container pairing a feature matrix X with its label vector y.
Data = namedtuple('Data', 'X y')

np.random.seed(37)

def get_data(means, variances, labels, N=1000):
    """Sample N points per cluster from per-dimension normal distributions.

    :param means: per-cluster mean vectors, e.g. [[5.0, 5.0], [6.0, 6.0]].
    :param variances: per-cluster spread vectors; NOTE(review): these are
        passed straight to numpy.random.normal as ``scale`` (a standard
        deviation), not a variance — with values of 1.0 the two coincide,
        but the name is misleading for other values.
    :param labels: one integer class label per cluster.
    :param N: number of samples drawn per cluster.
    :return: Data(X, y) where X has shape (len(means) * N, dim) and
        y has shape (len(means) * N,).
    """
    def get_X(sample_means, sample_variances, N):
        # One column per dimension, drawn independently, stacked side by side.
        return np.hstack([normal(m, v, N).reshape(-1, 1)
                          for m, v in zip(sample_means, sample_variances)])

    def get_y(label, N):
        # np.int was removed in NumPy 1.24; the builtin int is the
        # documented replacement and is exactly what np.int aliased.
        return np.full(N, label, dtype=int)

    X = np.vstack([get_X(m, v, N) for m, v in zip(means, variances)])
    y = np.hstack([get_y(label, N) for label in labels])
    return Data(X, y)

# training set: two 2-D Gaussian clusters centred at (5, 5) and (6, 6)
T = get_data(means=[[5.0, 5.0], [6.0, 6.0]], variances=[[1.0, 1.0], [1.0, 1.0]], labels=[0, 1])
# validation set: fresh draws from the same distributions
V = get_data(means=[[5.0, 5.0], [6.0, 6.0]], variances=[[1.0, 1.0], [1.0, 1.0]], labels=[0, 1])
7.2. Types of clustering
7.2.1. K-means
[2]:
from sklearn.cluster import KMeans

# K-means: centroid-based partitioning into 2 clusters; fit() returns the
# fitted estimator itself, and the bare name displays its repr.
km = KMeans(n_clusters=2, random_state=37).fit(T.X)
km
[2]:
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
random_state=37, tol=0.0001, verbose=0)
7.2.2. Affinity propagation
[3]:
from sklearn.cluster import AffinityPropagation

# Affinity propagation: chooses exemplars by message passing; the number
# of clusters is not specified up front.
ap = AffinityPropagation().fit(T.X)
ap
[3]:
AffinityPropagation(affinity='euclidean', convergence_iter=15, copy=True,
damping=0.5, max_iter=200, preference=None, verbose=False)
7.2.3. Mean-shift
[4]:
from sklearn.cluster import MeanShift

# Mean-shift: density-seeking clustering; bandwidth is estimated
# automatically when left at its default.
ms = MeanShift().fit(T.X)
ms
[4]:
MeanShift(bandwidth=None, bin_seeding=False, cluster_all=True, min_bin_freq=1,
n_jobs=None, seeds=None)
7.2.4. Spectral
[5]:
from sklearn.cluster import SpectralClustering

# Spectral clustering: embeds samples via the affinity graph's eigenvectors,
# then runs k-means in that embedding.
sc = SpectralClustering(n_clusters=2, random_state=37).fit(T.X)
sc
[5]:
SpectralClustering(affinity='rbf', assign_labels='kmeans', coef0=1, degree=3,
eigen_solver=None, eigen_tol=0.0, gamma=1.0,
kernel_params=None, n_clusters=2, n_init=10, n_jobs=None,
n_neighbors=10, random_state=37)
7.2.5. Agglomerative
[6]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative (bottom-up hierarchical) clustering cut at 2 clusters.
ac = AgglomerativeClustering(n_clusters=2).fit(T.X)
ac
[6]:
AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
connectivity=None, distance_threshold=None,
linkage='ward', memory=None, n_clusters=2,
pooling_func='deprecated')
7.2.6. DBSCAN
[7]:
from sklearn.cluster import DBSCAN

# DBSCAN: density-based clustering; cluster count emerges from eps/min_samples.
db = DBSCAN().fit(T.X)
db
[7]:
DBSCAN(algorithm='auto', eps=0.5, leaf_size=30, metric='euclidean',
metric_params=None, min_samples=5, n_jobs=None, p=None)
7.2.7. OPTICS
[8]:
from sklearn.cluster import OPTICS

# OPTICS: DBSCAN-like density clustering over a range of eps values.
op = OPTICS().fit(T.X)
op
[8]:
OPTICS(algorithm='auto', cluster_method='xi', eps=None, leaf_size=30,
max_eps=inf, metric='minkowski', metric_params=None,
min_cluster_size=None, min_samples=5, n_jobs=None, p=2,
predecessor_correction=True, xi=0.05)
7.2.8. Birch
[9]:
from sklearn.cluster import Birch

# BIRCH: builds a clustering-feature tree, then reduces to 2 clusters.
birch = Birch(n_clusters=2).fit(T.X)
birch
[9]:
Birch(branching_factor=50, compute_labels=True, copy=True, n_clusters=2,
threshold=0.5)
7.3. Performance
[10]:
import pandas as pd
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, \
completeness_score, homogeneity_score, silhouette_score, \
v_measure_score
def get_scoring_functions():
    """Return the two families of scoring functions as (grouping, assignment).

    Grouping scorers are called as f(X, labels); assignment scorers are
    called as f(labels_true, labels_pred).
    """
    grouping = [davies_bouldin_score, silhouette_score]
    assignment = [
        calinski_harabasz_score,
        completeness_score,
        homogeneity_score,
        v_measure_score,
    ]
    return grouping, assignment
def get_tuple_cols():
    """Column names for the score tuples.

    'model' first, then each scoring function's __name__ — grouping
    scorers before assignment scorers, matching get_scores' ordering.
    """
    gfuncs, afuncs = get_scoring_functions()
    return ['model'] + [f.__name__ for f in gfuncs + afuncs]
def get_scores(model_name, X, y_true, y_pred):
    """Compute every score for one model's cluster assignments.

    :param model_name: display name stored in the 'model' column.
    :param X: feature matrix consumed by the grouping scorers.
    :param y_true: ground-truth labels (numpy array).
    :param y_pred: predicted cluster assignments.
    :return: tuple of values ordered to match get_tuple_cols().
    """
    def do_scoring(f, y_true, y_pred):
        # calinski_harabasz_score expects (X, labels) with a 2-D X, so the
        # uniform (y_true, y_pred) call raises ValueError for it; retry with
        # y_true reshaped into a single-column matrix. The original bare
        # `except:` would also have swallowed SystemExit/KeyboardInterrupt
        # and masked unrelated bugs — catch only ValueError.
        try:
            return f(y_true, y_pred)
        except ValueError:
            return f(y_true.reshape(-1, 1), y_pred)

    gfuncs, afuncs = get_scoring_functions()
    gscores = {f.__name__: f(X, y_pred) for f in gfuncs}
    ascores = {f.__name__: do_scoring(f, y_true, y_pred) for f in afuncs}
    d = {**gscores, **ascores, 'model': model_name}
    # Emit in the canonical column order so rows align with the DataFrame.
    return tuple(d[c] for c in get_tuple_cols())
def get_predictions(model, X):
    """Return cluster assignments for X from a fitted clustering model.

    Estimators without a standalone ``predict`` (spectral, agglomerative,
    DBSCAN, OPTICS) are routed through ``fit_predict`` instead — note this
    re-fits them on X rather than reusing any earlier fit.
    """
    transductive = {'SpectralClustering', 'AgglomerativeClustering',
                    'DBSCAN', 'OPTICS'}
    if type(model).__name__ in transductive:
        return model.fit_predict(X)
    return model.predict(X)
# Evaluate each fitted model on the held-out set V and tabulate the scores.
models = [km, sc, ac, birch]
model_names = [type(m).__name__ for m in models]
y_preds = {name: get_predictions(m, V.X) for name, m in zip(model_names, models)}
scores = [get_scores(name, V.X, V.y, y_preds[name]) for name in model_names]
df = pd.DataFrame(scores, columns=get_tuple_cols())
df
[10]:
model | davies_bouldin_score | silhouette_score | calinski_harabasz_score | completeness_score | homogeneity_score | v_measure_score | |
---|---|---|---|---|---|---|---|
0 | KMeans | 1.107967 | 0.345717 | 659.943519 | 0.187573 | 0.187416 | 0.187494 |
1 | SpectralClustering | 1.108844 | 0.344831 | 664.091301 | 0.188626 | 0.188374 | 0.188500 |
2 | AgglomerativeClustering | 1.302874 | 0.286382 | 513.321469 | 0.152997 | 0.152944 | 0.152970 |
3 | Birch | 1.343323 | 0.290154 | 336.963632 | 0.117488 | 0.108257 | 0.112684 |