# 4. Logistic Regression

## 4.1. Generate data

[1]:

import numpy as np
from numpy.random import binomial, normal
from scipy.stats import bernoulli, binom
from collections import namedtuple

Data = namedtuple('Data', 'X y')

np.random.seed(37)

def get_data(N=10000):
X = np.hstack([
np.array([1 for _ in range(N)]).reshape(N, 1),
normal(0.0, 1.0, N).reshape(N, 1),
normal(0.0, 1.0, N).reshape(N, 1)
])

z = np.dot(X, np.array([1.0, 2.0, 3.0])) + normal(0.0, 0.2, N)
p = 1.0 / (1.0 + np.exp(-z))
y = binom.rvs(1, p)
return Data(X, y)

# training
T = get_data()

# validation
V = get_data(N=1000)


## 4.2. Types of logistic regression

### 4.2.1. Logistic regression with L1 penalty

[2]:

from sklearn.linear_model import LogisticRegression

lr1 = LogisticRegression(penalty='l1', solver='liblinear', fit_intercept=False)
lr1.fit(T.X, T.y)

[2]:

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None, penalty='l1',
random_state=None, solver='liblinear', tol=0.0001, verbose=0,
warm_start=False)


### 4.2.2. Logistic regression with L2 penalty

[3]:

lr2 = LogisticRegression(penalty='l2', solver='liblinear')
lr2.fit(T.X, T.y)

[3]:

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None, penalty='l2',
random_state=None, solver='liblinear', tol=0.0001, verbose=0,
warm_start=False)


## 4.3. Performance

[4]:

models = [lr1, lr2]
weights = [np.array(model.coef_).transpose() for model in models]
y_preds = [model.predict_proba(V.X)[:,1] for model in models]


### 4.3.1. Efron r-squared

[5]:

def efron_rsquare(y, y_pred):
n = float(len(y))
t1 = np.sum(np.power(y - y_pred, 2.0))
t2 = np.sum(np.power((y - (np.sum(y) / n)), 2.0))

return 1.0 - (t1 / t2)

[efron_rsquare(V.y, y_p) for model, y_p in zip(models, y_preds)]

[5]:

[0.6031210532127538, 0.6030840398255952]


[6]:

def mcfadden_rsquare(w, X, y):
def full_log_likelihood(w, X, y):
score = np.dot(X, w).reshape(1, X.shape[0])
return np.sum(-np.log(1 + np.exp(score))) + np.sum(y * score)

def null_log_likelihood(w, X, y):
z = np.array([w if i == 0 else 0.0 for i, w in enumerate(w.reshape(1, X.shape[1])[0])]).reshape(X.shape[1], 1)
score = np.dot(X, z).reshape(1, X.shape[0])
return np.sum(-np.log(1 + np.exp(score))) + np.sum(y * score)

return 1.0 - (full_log_likelihood(w, X, y) / null_log_likelihood(w, X, y))

[mcfadden_rsquare(w, T.X, T.y) for w in weights]

[6]:

[0.5628195673845076, 0.5188920058998845]


[7]:

def mcfadden_adjusted_rsquare(w, X, y):
def full_log_likelihood(w, X, y):
score = np.dot(X, w).reshape(1, X.shape[0])
return np.sum(-np.log(1 + np.exp(score))) + np.sum(y * score)

def null_log_likelihood(w, X, y):
z = np.array([w if i == 0 else 0.0 for i, w in enumerate(w.reshape(1, X.shape[1])[0])]).reshape(X.shape[1], 1)
score = np.dot(X, z).reshape(1, X.shape[0])
return np.sum(-np.log(1 + np.exp(score))) + np.sum(y * score)

k = float(X.shape[1])

return 1.0 - ((full_log_likelihood(w, X, y) - k) / null_log_likelihood(w, X, y))


[7]:

[0.5624019131203336, 0.5184493056623285]


### 4.3.4. McKelvey & Zavoina r-squared

[8]:

def mz_rsquare(y_pred):
return np.var(y_pred) / (np.var(y_pred) + (np.power(np.pi, 2.0) / 3.0) )

[mz_rsquare(y_pred) for y_pred in y_preds]

[8]:

[0.04114214468764824, 0.04107911565272002]


### 4.3.5. Count r-squared

[9]:

def count_rsquare(y, y_pred, t=0.5):
def get_num_correct(y, y_pred, t=0.5):
y_correct = np.array([0.0 if p < t else 1.0 for p in y_pred])
return sum([1.0 for p, p_pred in zip(y, y_correct) if p == p_pred])

n = float(len(y))
num_correct = get_num_correct(y, y_pred, t)

return num_correct / n

[count_rsquare(V.y, y_pred) for y_pred in y_preds]

[9]:

[0.865, 0.865]


[10]:

def count_adjusted_rsquare(y, y_pred, t=0.5):
def get_num_correct(y, y_pred, t=0.5):
y_correct = np.array([0.0 if p < t else 1.0 for p in y_pred])
return sum([1.0 for p, p_pred in zip(y, y_correct) if p == p_pred])

def get_count_most_freq_outcome(y):
num_0 = 0
num_1 = 0
for p in y:
if p == 1.0:
num_1 += 1
else:
num_0 += 1
return float(max(num_0, num_1))

correct = get_num_correct(y, y_pred, t)
total = float(len(y))
n = get_count_most_freq_outcome(y)

return (correct - n) / (total - n)

[count_adjusted_rsquare(V.y, y_pred) for y_pred in y_preds]

[10]:

[0.6723300970873787, 0.6723300970873787]