4. Logistic Regression
4.1. Generate data
[1]:
import numpy as np
from numpy.random import normal
from scipy.stats import binom
from collections import namedtuple

Data = namedtuple('Data', 'X y')

np.random.seed(37)

def get_data(N=10000):
    # design matrix: a bias column of ones plus two standard-normal features
    X = np.hstack([
        np.ones(N).reshape(N, 1),
        normal(0.0, 1.0, N).reshape(N, 1),
        normal(0.0, 1.0, N).reshape(N, 1)
    ])
    # linear predictor with true weights [1, 2, 3] plus a little Gaussian noise
    z = np.dot(X, np.array([1.0, 2.0, 3.0])) + normal(0.0, 0.2, N)
    # sigmoid gives P(y=1); sample the binary outcomes
    p = 1.0 / (1.0 + np.exp(-z))
    y = binom.rvs(1, p)
    return Data(X, y)
# training
T = get_data()
# validation
V = get_data(N=1000)
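The data are simulated from a logistic model with known weights, so the fitted coefficients and the fit statistics below can be judged against ground truth. In equation form (the small Gaussian jitter on the linear predictor is as in the code):

$$z_i = x_i^\top \beta + \varepsilon_i, \quad \beta = (1, 2, 3)^\top, \quad \varepsilon_i \sim \mathcal{N}(0, 0.2^2)$$

$$p_i = \frac{1}{1 + e^{-z_i}}, \qquad y_i \sim \text{Bernoulli}(p_i)$$

The first column of X is all ones, so the bias is folded into the weight vector.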
4.2. Types of logistic regression
4.2.1. Logistic regression with L1 penalty
[2]:
from sklearn.linear_model import LogisticRegression
lr1 = LogisticRegression(penalty='l1', solver='liblinear', fit_intercept=False)
lr1.fit(T.X, T.y)
[2]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None, penalty='l1',
random_state=None, solver='liblinear', tol=0.0001, verbose=0,
warm_start=False)
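The model is fit with fit_intercept=False because X already carries a column of ones, so the bias is recovered as the first entry of coef_. As a quick, informal check (not in the original notebook), the fitted weights can be compared against the generating weights:

# hedged sketch: the fitted weights should land near the true beta = [1, 2, 3]
print(lr1.coef_.ravel())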
4.2.2. Logistic regression with L2 penalty
[3]:
lr2 = LogisticRegression(penalty='l2', solver='liblinear')
lr2.fit(T.X, T.y)
[3]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None, penalty='l2',
random_state=None, solver='liblinear', tol=0.0001, verbose=0,
warm_start=False)
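Note that lr2 keeps the default fit_intercept=True even though X already contains a bias column, so scikit-learn fits a separate intercept_ on top of the coefficient for the ones column. The metrics below only use coef_, which is worth keeping in mind when comparing the two models. A quick way to inspect both pieces (a sketch, not in the original):

print(lr2.intercept_, lr2.coef_.ravel())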
4.3. Performance
[4]:
models = [lr1, lr2]
# column vectors of fitted coefficients, one per model
weights = [np.array(model.coef_).transpose() for model in models]
# predicted P(y=1) on the validation set
y_preds = [model.predict_proba(V.X)[:, 1] for model in models]
4.3.1. Efron r-squared
[5]:
def efron_rsquare(y, y_pred):
    n = float(len(y))
    # sum of squared residuals of the predicted probabilities against the 0/1 labels
    t1 = np.sum(np.power(y - y_pred, 2.0))
    # total sum of squares around the mean outcome
    t2 = np.sum(np.power(y - (np.sum(y) / n), 2.0))
    return 1.0 - (t1 / t2)

[efron_rsquare(V.y, y_p) for y_p in y_preds]
[5]:
[0.6031210532127538, 0.6030840398255952]
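Efron's r-squared treats the predicted probabilities like fitted values in ordinary least squares, comparing squared residuals to the variance around the mean outcome:

$$R^2_{\text{Efron}} = 1 - \frac{\sum_i (y_i - \hat{p}_i)^2}{\sum_i (y_i - \bar{y})^2}$$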
4.3.2. McFadden r-squared
[6]:
def mcfadden_rsquare(w, X, y):
    def full_log_likelihood(w, X, y):
        score = np.dot(X, w).reshape(1, X.shape[0])
        return np.sum(-np.log(1 + np.exp(score))) + np.sum(y * score)

    def null_log_likelihood(w, X, y):
        # intercept-only model: keep the bias weight, zero out the rest
        z = np.array([w_i if i == 0 else 0.0
                      for i, w_i in enumerate(w.reshape(1, X.shape[1])[0])]).reshape(X.shape[1], 1)
        score = np.dot(X, z).reshape(1, X.shape[0])
        return np.sum(-np.log(1 + np.exp(score))) + np.sum(y * score)

    return 1.0 - (full_log_likelihood(w, X, y) / null_log_likelihood(w, X, y))

[mcfadden_rsquare(w, T.X, T.y) for w in weights]
[6]:
[0.5628195673845076, 0.5188920058998845]
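McFadden's r-squared compares the log-likelihood of the fitted model to that of an intercept-only (null) model. Both log-likelihoods are negative, so the closer the ratio is to zero, the larger the improvement over the null:

$$R^2_{\text{McFadden}} = 1 - \frac{\ln \hat{L}_{\text{full}}}{\ln \hat{L}_{\text{null}}}$$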
4.3.3. McFadden adjusted r-squared
[7]:
def mcfadden_adjusted_rsquare(w, X, y):
    def full_log_likelihood(w, X, y):
        score = np.dot(X, w).reshape(1, X.shape[0])
        return np.sum(-np.log(1 + np.exp(score))) + np.sum(y * score)

    def null_log_likelihood(w, X, y):
        # intercept-only model: keep the bias weight, zero out the rest
        z = np.array([w_i if i == 0 else 0.0
                      for i, w_i in enumerate(w.reshape(1, X.shape[1])[0])]).reshape(X.shape[1], 1)
        score = np.dot(X, z).reshape(1, X.shape[0])
        return np.sum(-np.log(1 + np.exp(score))) + np.sum(y * score)

    # penalize the full log-likelihood by k, the number of estimated parameters
    k = float(X.shape[1])
    return 1.0 - ((full_log_likelihood(w, X, y) - k) / null_log_likelihood(w, X, y))

[mcfadden_adjusted_rsquare(w, T.X, T.y) for w in weights]
[7]:
[0.5624019131203336, 0.5184493056623285]
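The adjusted version penalizes model size by subtracting k, the number of estimated parameters (here 3), from the full log-likelihood:

$$R^2_{\text{McFadden, adj}} = 1 - \frac{\ln \hat{L}_{\text{full}} - k}{\ln \hat{L}_{\text{null}}}$$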
4.3.4. McKelvey & Zavoina r-squared
[8]:
def mz_rsquare(y_pred):
    # pi^2 / 3 is the variance of the standard logistic error term
    return np.var(y_pred) / (np.var(y_pred) + (np.power(np.pi, 2.0) / 3.0))

[mz_rsquare(y_pred) for y_pred in y_preds]
[8]:
[0.04114214468764824, 0.04107911565272002]
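McKelvey & Zavoina's r-squared is usually stated on the latent scale: the variance of the linear predictor over that variance plus pi^2/3, the variance of the standard logistic error. The cell above plugs in predicted probabilities rather than linear scores; a sketch on the latent scale (an alternative reading of the definition, not the notebook's computation) would be:

def mz_rsquare_latent(w, X):
    # variance of the latent linear predictor X @ w, against the logistic error variance
    y_star = np.dot(X, w).ravel()
    return np.var(y_star) / (np.var(y_star) + (np.power(np.pi, 2.0) / 3.0))

[mz_rsquare_latent(w, V.X) for w in weights]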
4.3.5. Count r-squared
[9]:
def count_rsquare(y, y_pred, t=0.5):
    def get_num_correct(y, y_pred, t=0.5):
        # threshold the probabilities at t, then count matches with the labels
        y_correct = np.array([0.0 if p < t else 1.0 for p in y_pred])
        return sum([1.0 for p, p_pred in zip(y, y_correct) if p == p_pred])

    n = float(len(y))
    num_correct = get_num_correct(y, y_pred, t)
    return num_correct / n

[count_rsquare(V.y, y_pred) for y_pred in y_preds]
[9]:
[0.865, 0.865]
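The count r-squared is simply classification accuracy at threshold t: the fraction of observations whose thresholded probability matches the label. Equivalently (a sketch using scikit-learn, not in the original notebook):

from sklearn.metrics import accuracy_score
[accuracy_score(V.y, (y_pred >= 0.5).astype(float)) for y_pred in y_preds]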
4.3.6. Adjusted count r-squared
[10]:
def count_adjusted_rsquare(y, y_pred, t=0.5):
    def get_num_correct(y, y_pred, t=0.5):
        # threshold the probabilities at t, then count matches with the labels
        y_correct = np.array([0.0 if p < t else 1.0 for p in y_pred])
        return sum([1.0 for p, p_pred in zip(y, y_correct) if p == p_pred])

    def get_count_most_freq_outcome(y):
        # size of the majority class: what always guessing the mode would get right
        num_0 = 0
        num_1 = 0
        for p in y:
            if p == 1.0:
                num_1 += 1
            else:
                num_0 += 1
        return float(max(num_0, num_1))

    correct = get_num_correct(y, y_pred, t)
    total = float(len(y))
    n = get_count_most_freq_outcome(y)
    return (correct - n) / (total - n)

[count_adjusted_rsquare(V.y, y_pred) for y_pred in y_preds]
[10]:
[0.6723300970873787, 0.6723300970873787]
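The adjustment subtracts n_max, the count of the most frequent outcome, from both numerator and denominator, so the model is only credited for correct predictions beyond always guessing the majority class:

$$R^2_{\text{adj count}} = \frac{\text{correct} - n_{\max}}{n - n_{\max}}$$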