6. Classification
6.1. Generate data
[1]:
import numpy as np
from numpy.random import normal
from scipy.stats import binom
from collections import namedtuple
Data = namedtuple('Data', 'X y')
np.random.seed(37)
def get_data(N=10000, M=10):
    # M independent standard-normal features
    X = np.hstack([normal(0.0, 1.0, N).reshape(N, 1) for _ in range(M)])
    # true weights 1.0, 2.0, ..., M
    w = np.array([w + 1.0 for w in range(M)])
    # linear score with a little Gaussian noise, squashed through a sigmoid
    z = np.dot(X, w) + normal(0.0, 0.2, N)
    p = 1.0 / (1.0 + np.exp(-z))
    # labels sampled from Bernoulli(p)
    y = binom.rvs(1, p)
    return Data(X, y)
# training
T = get_data()
# validation
V = get_data(N=1000)
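A quick sanity check (a hedged sketch using the T and V tuples just created) confirms the shapes and the roughly balanced classes implied by the symmetric simulation:
[ ]:
# expected: (10000, 10) (1000, 10)
print(T.X.shape, V.X.shape)
# z is symmetric around 0, so positives should be roughly half of each sample
print(T.y.mean(), V.y.mean())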
6.2. Types of classifiers
6.2.1. Logistic regression
[2]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty='l1', solver='liblinear', fit_intercept=False)
lr.fit(T.X, T.y)
[2]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None, penalty='l1',
random_state=None, solver='liblinear', tol=0.0001, verbose=0,
warm_start=False)
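Since the labels were simulated from a logistic model with weights 1.0 through 10.0, the fitted coefficients should approximately recover those values, up to shrinkage from the L1 penalty. A hedged sanity check:
[ ]:
# true generating weights were 1.0 .. 10.0; recovery is approximate
print(np.round(lr.coef_.ravel(), 2))
print(np.arange(1.0, 11.0))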
6.2.2. Gaussian Naive Bayes
[3]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(T.X, T.y)
[3]:
GaussianNB(priors=None, var_smoothing=1e-09)
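GaussianNB assumes the features are conditionally independent given the class. The columns of X are independent marginally, but conditioning on the label couples them through the linear score, so the assumption holds only approximately here; this mismatch likely explains the weaker probability-quality scores it earns later on.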
6.2.3. Linear Discriminant Analysis
[4]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()
lda.fit(T.X, T.y)
[4]:
LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
solver='svd', store_covariance=False, tol=0.0001)
6.2.4. Quadratic Discriminant Analysis
[5]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
qda = QuadraticDiscriminantAnalysis()
qda.fit(T.X, T.y)
[5]:
QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
store_covariance=False, tol=0.0001)
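QDA differs from the LDA fit above only in that it estimates a separate covariance matrix per class rather than a single pooled one, which yields a quadratic rather than linear decision boundary.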
6.2.5. Neural network, Multi-Layer Perceptron (MLP)
[6]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(max_iter=1000, alpha=0.01)
mlp.fit(T.X, T.y)
[6]:
MLPClassifier(activation='relu', alpha=0.01, batch_size='auto', beta_1=0.9,
beta_2=0.999, early_stopping=False, epsilon=1e-08,
hidden_layer_sizes=(100,), learning_rate='constant',
learning_rate_init=0.001, max_iter=1000, momentum=0.9,
n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
random_state=None, shuffle=True, solver='adam', tol=0.0001,
validation_fraction=0.1, verbose=False, warm_start=False)
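The repr shows the defaults in play, a single hidden layer of 100 ReLU units trained with Adam; the only overrides here are a larger max_iter to allow convergence and alpha=0.01 for stronger L2 regularization.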
6.2.6. Decision tree
[7]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(T.X, T.y)
[7]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort=False,
random_state=None, splitter='best')
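A fully grown tree will chase the simulation noise in this problem. As a hedged illustration (the max_depth=5 below is an arbitrary cap, not a tuned value), a depth-limited variant can be fit the same way:
[ ]:
# hypothetical shallow variant; depth cap chosen for illustration only
dt_shallow = DecisionTreeClassifier(max_depth=5, random_state=37)
dt_shallow.fit(T.X, T.y)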
6.2.7. Support Vector Machine (SVM)
[8]:
from sklearn.svm import NuSVC
svm = NuSVC(gamma='auto', probability=True, random_state=37)
svm.fit(T.X, T.y)
[8]:
NuSVC(cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
max_iter=-1, nu=0.5, probability=True, random_state=37, shrinking=True,
tol=0.001, verbose=False)
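Note the kernel='rbf' in the repr: NuSVC defaults to an RBF kernel, so this model is not linear despite the linear generating process. A hedged sketch of an explicitly linear variant:
[ ]:
# hypothetical linear-kernel variant of the model above
svm_linear = NuSVC(kernel='linear', probability=True, random_state=37)
svm_linear.fit(T.X, T.y)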
6.2.8. Stochastic gradient descent (SGD)
[9]:
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier(loss='log')
sgd.fit(T.X, T.y)
[9]:
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,
n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
random_state=None, shuffle=True, tol=0.001,
validation_fraction=0.1, verbose=0, warm_start=False)
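One portability note: scikit-learn 1.1 renamed this loss from 'log' to 'log_loss', and the old name was later removed. A hedged, version-tolerant construction (assumes the packaging module is available):
[ ]:
import sklearn
from packaging.version import Version  # assumption: packaging is installed
# the 'log' -> 'log_loss' rename landed in scikit-learn 1.1
loss_name = 'log_loss' if Version(sklearn.__version__) >= Version('1.1') else 'log'
sgd_compat = SGDClassifier(loss=loss_name)
sgd_compat.fit(T.X, T.y)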
6.2.9. Random Forest
[10]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100)
rf.fit(T.X, T.y)
[10]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False)
6.2.10. AdaBoost
[11]:
from sklearn.ensemble import AdaBoostClassifier
ab = AdaBoostClassifier()
ab.fit(T.X, T.y)
[11]:
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
n_estimators=50, random_state=None)
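AdaBoost fits its default 50 depth-1 stumps sequentially, so it is easy to watch validation accuracy evolve per boosting stage. A hedged sketch using staged_predict:
[ ]:
from sklearn.metrics import accuracy_score
# accuracy on the validation set after each boosting stage
staged_acc = [accuracy_score(V.y, y_hat) for y_hat in ab.staged_predict(V.X)]
print(staged_acc[::10])  # every 10th stage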
6.3. Performance
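The scorers below fall into two groups: cfuncs operate on hard label predictions (accuracy, F1, Jaccard, Matthews correlation), while pfuncs operate on the predicted probability of the positive class (Brier score, log loss, hinge loss, ROC AUC, average precision). One caveat: hinge_loss nominally expects decision-function margins rather than probabilities, so the hinge values here are comparable across models but are not the textbook quantity.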
[12]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, jaccard_score, matthews_corrcoef, \
    precision_score, recall_score, \
    brier_score_loss, log_loss, hamming_loss, hinge_loss, zero_one_loss, \
    roc_auc_score, average_precision_score

def get_scoring_functions():
    # metrics computed from hard class predictions
    cfuncs = [accuracy_score, f1_score, jaccard_score, matthews_corrcoef]
    # metrics computed from predicted probabilities
    pfuncs = [brier_score_loss, log_loss, hinge_loss,
              roc_auc_score, average_precision_score]
    return cfuncs, pfuncs

def get_tuple_cols():
    cfuncs, pfuncs = get_scoring_functions()
    return ['model'] + [f.__name__ for f in cfuncs] + [f.__name__ for f in pfuncs]

def get_scores(model_name, y_true, y_preds, y_probs):
    cfuncs, pfuncs = get_scoring_functions()
    cscores = {f.__name__: f(y_true, y_preds) for f in cfuncs}
    pscores = {f.__name__: f(y_true, y_probs) for f in pfuncs}
    d = {**cscores, **pscores}
    d['model'] = model_name
    return tuple([d[c] for c in get_tuple_cols()])

models = [lr, nb, lda, qda, mlp, dt, svm, sgd, rf, ab]
model_names = [type(m).__name__ for m in models]
y_preds = {type(model).__name__: model.predict(V.X) for model in models}
y_probs = {type(model).__name__: model.predict_proba(V.X)[:, 1] for model in models}
scores = [get_scores(name, V.y, y_preds[name], y_probs[name]) for name in model_names]
df = pd.DataFrame(scores, columns=get_tuple_cols())
df
[12]:
|   | model | accuracy_score | f1_score | jaccard_score | matthews_corrcoef | brier_score_loss | log_loss | hinge_loss | roc_auc_score | average_precision_score |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | LogisticRegression | 0.978 | 0.976842 | 0.954733 | 0.956086 | 0.016632 | 0.054495 | 0.569966 | 0.998527 | 0.998359 |
| 1 | GaussianNB | 0.968 | 0.966030 | 0.934292 | 0.935792 | 0.079076 | 0.294971 | 0.770754 | 0.997130 | 0.996774 |
| 2 | LinearDiscriminantAnalysis | 0.971 | 0.969442 | 0.940695 | 0.942007 | 0.038022 | 0.149966 | 0.653551 | 0.998378 | 0.998193 |
| 3 | QuadraticDiscriminantAnalysis | 0.978 | 0.976842 | 0.954733 | 0.956086 | 0.038260 | 0.150470 | 0.653850 | 0.998246 | 0.998042 |
| 4 | MLPClassifier | 0.971 | 0.969506 | 0.940816 | 0.942097 | 0.019656 | 0.062244 | 0.569476 | 0.997579 | 0.997283 |
| 5 | DecisionTreeClassifier | 0.814 | 0.803797 | 0.671958 | 0.627096 | 0.186000 | 6.424290 | 0.716000 | 0.813810 | 0.735136 |
| 6 | NuSVC | 0.969 | 0.967334 | 0.936735 | 0.937996 | 0.020646 | 0.067079 | 0.578227 | 0.997543 | 0.997283 |
| 7 | SGDClassifier | 0.973 | 0.971307 | 0.944215 | 0.945813 | 0.018218 | 0.061125 | 0.576176 | 0.998270 | 0.998065 |
| 8 | RandomForestClassifier | 0.926 | 0.921610 | 0.854617 | 0.851564 | 0.076666 | 0.271507 | 0.744460 | 0.981208 | 0.978537 |
| 9 | AdaBoostClassifier | 0.921 | 0.916754 | 0.846304 | 0.841738 | 0.228336 | 0.649722 | 1.007489 | 0.979775 | 0.978109 |
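Since the loss columns run opposite to the score columns, one hedged way to rank the models is to sort on a single direction-consistent metric such as the Matthews correlation coefficient:
[ ]:
# higher matthews_corrcoef is better; log_loss shown for contrast (lower is better)
df.sort_values('matthews_corrcoef', ascending=False)[['model', 'matthews_corrcoef', 'log_loss']]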