7. Clustering

7.1. Generate data

[1]:
import numpy as np
from numpy.random import normal
from collections import namedtuple

# Simple container for a feature matrix X and its label vector y.
Data = namedtuple('Data', 'X y')

np.random.seed(37)

def get_data(means, variances, labels, N=1000):
    """Generate a labeled Gaussian mixture dataset.

    :param means: per-cluster lists of per-feature means, e.g. [[5.0, 5.0], [6.0, 6.0]].
    :param variances: per-cluster lists of per-feature spreads.
        NOTE(review): each value is passed to numpy.random.normal as ``scale``
        (the standard deviation), not the variance — identical here since all
        values are 1.0, but the name is misleading for other inputs.
    :param labels: one integer label per cluster.
    :param N: number of samples drawn per cluster.
    :return: Data(X, y) with X of shape (len(means) * N, n_features) and
        y of shape (len(means) * N,).
    """
    def get_X(sample_means, sample_variances, N):
        # One column per feature; hstack assembles the (N, n_features) matrix.
        return np.hstack([normal(m, v, N).reshape(-1, 1) for m, v in zip(sample_means, sample_variances)])

    def get_y(label, N):
        # np.int was deprecated in NumPy 1.20 and removed in 1.24; the
        # builtin int is the documented replacement and behaves identically.
        return np.full(N, label, dtype=int)

    X = np.vstack([get_X(m, v, N) for m, v in zip(means, variances)])
    y = np.hstack([get_y(label, N) for label in labels])

    return Data(X, y)

# training
T = get_data(means=[[5.0, 5.0], [6.0, 6.0]], variances=[[1.0, 1.0], [1.0, 1.0]], labels=[0, 1])

# validation
V = get_data(means=[[5.0, 5.0], [6.0, 6.0]], variances=[[1.0, 1.0], [1.0, 1.0]], labels=[0, 1])

7.2. Types of clustering

7.2.1. K-means

[2]:
from sklearn.cluster import KMeans

# Fit k-means (k=2) on the training features; fit() returns the estimator,
# and the bare name on the last line displays the fitted repr.
km = KMeans(n_clusters=2, random_state=37).fit(T.X)
km
[2]:
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=37, tol=0.0001, verbose=0)

7.2.2. Affinity propagation

[3]:
from sklearn.cluster import AffinityPropagation

# Affinity propagation determines the number of clusters itself.
ap = AffinityPropagation().fit(T.X)
ap
[3]:
AffinityPropagation(affinity='euclidean', convergence_iter=15, copy=True,
                    damping=0.5, max_iter=200, preference=None, verbose=False)

7.2.3. Mean-shift

[4]:
from sklearn.cluster import MeanShift

# Mean-shift with default bandwidth estimation.
ms = MeanShift().fit(T.X)
ms
[4]:
MeanShift(bandwidth=None, bin_seeding=False, cluster_all=True, min_bin_freq=1,
          n_jobs=None, seeds=None)

7.2.4. Spectral

[5]:
from sklearn.cluster import SpectralClustering

# Spectral clustering with 2 clusters; seeded for reproducibility.
sc = SpectralClustering(n_clusters=2, random_state=37).fit(T.X)
sc
[5]:
SpectralClustering(affinity='rbf', assign_labels='kmeans', coef0=1, degree=3,
                   eigen_solver=None, eigen_tol=0.0, gamma=1.0,
                   kernel_params=None, n_clusters=2, n_init=10, n_jobs=None,
                   n_neighbors=10, random_state=37)

7.2.5. Agglomerative

[6]:
from sklearn.cluster import AgglomerativeClustering

# Bottom-up hierarchical clustering cut at 2 clusters.
ac = AgglomerativeClustering(n_clusters=2).fit(T.X)
ac
[6]:
AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
                        connectivity=None, distance_threshold=None,
                        linkage='ward', memory=None, n_clusters=2,
                        pooling_func='deprecated')

7.2.6. DBSCAN

[7]:
from sklearn.cluster import DBSCAN

# Density-based clustering with default eps/min_samples.
db = DBSCAN().fit(T.X)
db
[7]:
DBSCAN(algorithm='auto', eps=0.5, leaf_size=30, metric='euclidean',
       metric_params=None, min_samples=5, n_jobs=None, p=None)

7.2.7. OPTICS

[8]:
from sklearn.cluster import OPTICS

# OPTICS: density-based clustering over a range of eps values.
op = OPTICS().fit(T.X)
op
[8]:
OPTICS(algorithm='auto', cluster_method='xi', eps=None, leaf_size=30,
       max_eps=inf, metric='minkowski', metric_params=None,
       min_cluster_size=None, min_samples=5, n_jobs=None, p=2,
       predecessor_correction=True, xi=0.05)

7.2.8. Birch

[9]:
from sklearn.cluster import Birch

# BIRCH tree-based clustering, final clustering into 2 groups.
birch = Birch(n_clusters=2).fit(T.X)
birch
[9]:
Birch(branching_factor=50, compute_labels=True, copy=True, n_clusters=2,
      threshold=0.5)

7.3. Performance

[10]:
import pandas as pd
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, \
    completeness_score, homogeneity_score, silhouette_score, \
    v_measure_score

def get_scoring_functions():
    """Return the two families of clustering score functions.

    :return: (grouping, assignment) — grouping scorers take (X, labels);
        assignment scorers compare predicted labels against ground truth.
    """
    grouping = [davies_bouldin_score, silhouette_score]
    assignment = [
        calinski_harabasz_score,
        completeness_score,
        homogeneity_score,
        v_measure_score,
    ]
    return grouping, assignment

def get_tuple_cols():
    """Column names for a score tuple: 'model' followed by each scorer's name."""
    grouping, assignment = get_scoring_functions()
    return ['model', *(f.__name__ for f in grouping), *(f.__name__ for f in assignment)]

def get_scores(model_name, X, y_true, y_pred):
    """Compute every grouping and assignment score for one model.

    :param model_name: display name placed in the 'model' column.
    :param X: feature matrix, used by the grouping scorers.
    :param y_true: ground-truth labels.
    :param y_pred: predicted cluster assignments.
    :return: tuple of values ordered to match get_tuple_cols().
    """
    def do_scoring(f, y_true, y_pred):
        # calinski_harabasz_score expects a 2D sample matrix as its first
        # argument, so the 1D call raises; retry with y_true as a column.
        # Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt — narrowed to the ValueError sklearn raises
        # for a mis-shaped array.
        try:
            return f(y_true, y_pred)
        except ValueError:
            return f(y_true.reshape(-1, 1), y_pred)

    gfuncs, afuncs = get_scoring_functions()

    # Grouping scores compare predicted labels against the data itself;
    # assignment scores compare them against the ground truth.
    gscores = {f.__name__: f(X, y_pred) for f in gfuncs}
    ascores = {f.__name__: do_scoring(f, y_true, y_pred) for f in afuncs}

    d = {**gscores, **ascores}
    d['model'] = model_name

    return tuple([d[c] for c in get_tuple_cols()])

def get_predictions(model, X):
    """Return cluster assignments for X from a fitted estimator.

    Estimators in `refit_only` lack a usable predict() and must be re-fit
    on X via fit_predict(); all others reuse their fitted state.
    """
    refit_only = {'SpectralClustering', 'AgglomerativeClustering', 'DBSCAN', 'OPTICS'}
    name = type(model).__name__
    return model.fit_predict(X) if name in refit_only else model.predict(X)

# Score only the estimators created with n_clusters=2, so every model
# predicts the same number of groups as the ground truth.
models = [km, sc, ac, birch]
model_names = [type(m).__name__ for m in models]

# Predictions on the validation set, keyed by class name.
y_preds = {}
for m in models:
    y_preds[type(m).__name__] = get_predictions(m, V.X)

scores = []
for name in model_names:
    scores.append(get_scores(name, V.X, V.y, y_preds[name]))

df = pd.DataFrame(scores, columns=get_tuple_cols())
df
[10]:
model davies_bouldin_score silhouette_score calinski_harabasz_score completeness_score homogeneity_score v_measure_score
0 KMeans 1.107967 0.345717 659.943519 0.187573 0.187416 0.187494
1 SpectralClustering 1.108844 0.344831 664.091301 0.188626 0.188374 0.188500
2 AgglomerativeClustering 1.302874 0.286382 513.321469 0.152997 0.152944 0.152970
3 Birch 1.343323 0.290154 336.963632 0.117488 0.108257 0.112684