4. Logistic Regression
4.1. Generate data
[1]:
import numpy as np
from numpy.random import normal
from scipy.stats import binom
from collections import namedtuple

Data = namedtuple('Data', 'X y')

np.random.seed(37)

def get_data(N=10000):
    # design matrix: a bias column of ones plus two standard-normal features
    X = np.hstack([
        np.ones(N).reshape(N, 1),
        normal(0.0, 1.0, N).reshape(N, 1),
        normal(0.0, 1.0, N).reshape(N, 1)
    ])
    # linear predictor with true weights [1, 2, 3] plus a little Gaussian noise
    z = np.dot(X, np.array([1.0, 2.0, 3.0])) + normal(0.0, 0.2, N)
    # sigmoid gives P(y=1); sample the binary outcomes
    p = 1.0 / (1.0 + np.exp(-z))
    y = binom.rvs(1, p)
    return Data(X, y)
# training
T = get_data()
# validation
V = get_data(N=1000)
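The data are simulated from a logistic model with known weights, so the fitted coefficients and the fit statistics below can be judged against ground truth. In equation form (the small Gaussian jitter on the linear predictor is as in the code):

$$z_i = x_i^\top \beta + \varepsilon_i, \quad \beta = (1, 2, 3)^\top, \quad \varepsilon_i \sim \mathcal{N}(0, 0.2^2)$$

$$p_i = \frac{1}{1 + e^{-z_i}}, \qquad y_i \sim \text{Bernoulli}(p_i)$$

The first column of X is all ones, so the bias is folded into the weight vector.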
4.2. Types of logistic regression
4.2.1. Logistic regression with L1 penalty
[2]:
from sklearn.linear_model import LogisticRegression
lr1 = LogisticRegression(penalty='l1', solver='liblinear', fit_intercept=False)
lr1.fit(T.X, T.y)
[2]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None, penalty='l1',
random_state=None, solver='liblinear', tol=0.0001, verbose=0,
warm_start=False)
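The model is fit with fit_intercept=False because X already carries a column of ones, so the bias is recovered as the first entry of coef_. As a quick, informal check (not in the original notebook), the fitted weights can be compared against the generating weights:

# hedged sketch: the fitted weights should land near the true beta = [1, 2, 3]
print(lr1.coef_.ravel())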
4.2.2. Logistic regression with L2 penalty
[3]:
lr2 = LogisticRegression(penalty='l2', solver='liblinear')
lr2.fit(T.X, T.y)
[3]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None, penalty='l2',
random_state=None, solver='liblinear', tol=0.0001, verbose=0,
warm_start=False)
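Note that lr2 keeps the default fit_intercept=True even though X already contains a bias column, so scikit-learn fits a separate intercept_ on top of the coefficient for the ones column. The metrics below only use coef_, which is worth keeping in mind when comparing the two models. A quick way to inspect both pieces (a sketch, not in the original):

print(lr2.intercept_, lr2.coef_.ravel())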
4.3. Performance
[4]:
models = [lr1, lr2]
# column vectors of fitted coefficients, one per model
weights = [np.array(model.coef_).transpose() for model in models]
# predicted P(y=1) on the validation set
y_preds = [model.predict_proba(V.X)[:, 1] for model in models]
4.3.1. Efron r-squared
[5]:
def efron_rsquare(y, y_pred):
    n = float(len(y))
    # sum of squared residuals of the predicted probabilities against the 0/1 labels
    t1 = np.sum(np.power(y - y_pred, 2.0))
    # total sum of squares around the mean outcome
    t2 = np.sum(np.power(y - (np.sum(y) / n), 2.0))
    return 1.0 - (t1 / t2)

[efron_rsquare(V.y, y_p) for y_p in y_preds]
[5]:
[0.6031210532127538, 0.6030840398255952]
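Efron's r-squared treats the predicted probabilities like fitted values in ordinary least squares, comparing squared residuals to the variance around the mean outcome:

$$R^2_{\text{Efron}} = 1 - \frac{\sum_i (y_i - \hat{p}_i)^2}{\sum_i (y_i - \bar{y})^2}$$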
4.3.2. McFadden r-squared
[6]:
def mcfadden_rsquare(w, X, y):
    def full_log_likelihood(w, X, y):
        score = np.dot(X, w).reshape(1, X.shape[0])
        return np.sum(-np.log(1 + np.exp(score))) + np.sum(y * score)

    def null_log_likelihood(w, X, y):
        # intercept-only model: keep the bias weight, zero out the rest
        z = np.array([w_i if i == 0 else 0.0
                      for i, w_i in enumerate(w.reshape(1, X.shape[1])[0])]).reshape(X.shape[1], 1)
        score = np.dot(X, z).reshape(1, X.shape[0])
        return np.sum(-np.log(1 + np.exp(score))) + np.sum(y * score)

    return 1.0 - (full_log_likelihood(w, X, y) / null_log_likelihood(w, X, y))

[mcfadden_rsquare(w, T.X, T.y) for w in weights]
[6]:
[0.5628195673845076, 0.5188920058998845]
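McFadden's r-squared compares the log-likelihood of the fitted model to that of an intercept-only (null) model. Both log-likelihoods are negative, so the closer the ratio is to zero, the larger the improvement over the null:

$$R^2_{\text{McFadden}} = 1 - \frac{\ln \hat{L}_{\text{full}}}{\ln \hat{L}_{\text{null}}}$$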
4.3.3. McFadden adjusted r-squared
[7]:
def mcfadden_adjusted_rsquare(w, X, y):
    def full_log_likelihood(w, X, y):
        score = np.dot(X, w).reshape(1, X.shape[0])
        return np.sum(-np.log(1 + np.exp(score))) + np.sum(y * score)

    def null_log_likelihood(w, X, y):
        # intercept-only model: keep the bias weight, zero out the rest
        z = np.array([w_i if i == 0 else 0.0
                      for i, w_i in enumerate(w.reshape(1, X.shape[1])[0])]).reshape(X.shape[1], 1)
        score = np.dot(X, z).reshape(1, X.shape[0])
        return np.sum(-np.log(1 + np.exp(score))) + np.sum(y * score)

    # penalize the full log-likelihood by k, the number of estimated parameters
    k = float(X.shape[1])
    return 1.0 - ((full_log_likelihood(w, X, y) - k) / null_log_likelihood(w, X, y))

[mcfadden_adjusted_rsquare(w, T.X, T.y) for w in weights]
[7]:
[0.5624019131203336, 0.5184493056623285]
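The adjusted version penalizes model size by subtracting k, the number of estimated parameters (here 3), from the full log-likelihood:

$$R^2_{\text{McFadden, adj}} = 1 - \frac{\ln \hat{L}_{\text{full}} - k}{\ln \hat{L}_{\text{null}}}$$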
4.3.4. McKelvey & Zavoina r-squared
[8]:
def mz_rsquare(y_pred):
    # pi^2 / 3 is the variance of the standard logistic error term
    return np.var(y_pred) / (np.var(y_pred) + (np.power(np.pi, 2.0) / 3.0))

[mz_rsquare(y_pred) for y_pred in y_preds]
[8]:
[0.04114214468764824, 0.04107911565272002]
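McKelvey & Zavoina's r-squared is usually stated on the latent scale: the variance of the linear predictor over that variance plus pi^2/3, the variance of the standard logistic error. The cell above plugs in predicted probabilities rather than linear scores; a sketch on the latent scale (an alternative reading of the definition, not the notebook's computation) would be:

def mz_rsquare_latent(w, X):
    # variance of the latent linear predictor X @ w, against the logistic error variance
    y_star = np.dot(X, w).ravel()
    return np.var(y_star) / (np.var(y_star) + (np.power(np.pi, 2.0) / 3.0))

[mz_rsquare_latent(w, V.X) for w in weights]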
4.3.5. Count r-squared
[9]:
def count_rsquare(y, y_pred, t=0.5):
    def get_num_correct(y, y_pred, t=0.5):
        # threshold the probabilities at t, then count matches with the labels
        y_correct = np.array([0.0 if p < t else 1.0 for p in y_pred])
        return sum([1.0 for p, p_pred in zip(y, y_correct) if p == p_pred])

    n = float(len(y))
    num_correct = get_num_correct(y, y_pred, t)
    return num_correct / n

[count_rsquare(V.y, y_pred) for y_pred in y_preds]
[9]:
[0.865, 0.865]
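The count r-squared is simply classification accuracy at threshold t: the fraction of observations whose thresholded probability matches the label. Equivalently (a sketch using scikit-learn, not in the original notebook):

from sklearn.metrics import accuracy_score
[accuracy_score(V.y, (y_pred >= 0.5).astype(float)) for y_pred in y_preds]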
4.3.6. Adjusted count r-squared
[10]:
def count_adjusted_rsquare(y, y_pred, t=0.5):
    def get_num_correct(y, y_pred, t=0.5):
        # threshold the probabilities at t, then count matches with the labels
        y_correct = np.array([0.0 if p < t else 1.0 for p in y_pred])
        return sum([1.0 for p, p_pred in zip(y, y_correct) if p == p_pred])

    def get_count_most_freq_outcome(y):
        # size of the majority class: what always guessing the mode would get right
        num_0 = 0
        num_1 = 0
        for p in y:
            if p == 1.0:
                num_1 += 1
            else:
                num_0 += 1
        return float(max(num_0, num_1))

    correct = get_num_correct(y, y_pred, t)
    total = float(len(y))
    n = get_count_most_freq_outcome(y)
    return (correct - n) / (total - n)

[count_adjusted_rsquare(V.y, y_pred) for y_pred in y_preds]
[10]:
[0.6723300970873787, 0.6723300970873787]
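The adjustment subtracts n_max, the count of the most frequent outcome, from both numerator and denominator, so the model is only credited for correct predictions beyond always guessing the majority class:

$$R^2_{\text{adj count}} = \frac{\text{correct} - n_{\max}}{n - n_{\max}}$$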