17. Feature Selection

[1]:
from sklearn.datasets import make_regression, make_classification
import numpy as np

np.random.seed(37)

def get_regression_data():
    return make_regression(**{
        'n_samples': 1000,
        'n_features': 50,
        'n_informative': 10,
        'n_targets': 1,
        'bias': 5.3,
        'random_state': 37
    })

def get_classification_data():
    return make_classification(**{
        'n_samples': 2000,
        'n_features': 20,
        'n_informative': 2,
        'n_redundant': 2,
        'n_repeated': 0,
        'n_classes': 2,
        'n_clusters_per_class': 2,
        'random_state': 37
    })

A, b = get_regression_data()
C, d = get_classification_data()

17.1. Univariate

Univariate feature selection scores each feature individually and keeps the best ones. This is accomplished with GenericUnivariateSelect. In a classification problem, use chi2 or mutual_info_classif as the score function. Note that chi2 requires your feature matrix to be non-negative (a sketch of how to handle this follows the next cell). There are several selection modes; here we use the percentile and k_best modes.

[2]:
from sklearn.feature_selection import GenericUnivariateSelect
from sklearn.feature_selection import chi2, mutual_info_classif
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

def get_best_indexes(scores, max_index, reverse=True):
    tups = sorted([(i, s) for i, s in enumerate(scores)], key=lambda tup: tup[1], reverse=reverse)
    tups = tups[:max_index]
    return [t[0] for t in tups]

def get_classification_performance(tr_index, te_index, X, y, selector):
    X_tr, X_te = X[tr_index], X[te_index]
    y_tr, y_te = y[tr_index], y[te_index]

    rf = RandomForestClassifier(max_depth=10, random_state=37, n_jobs=-1)

    model = Pipeline([
        ('selector', selector),
        ('rf', rf)
    ])

    model.fit(X_tr, y_tr)
    y_pr = model.predict_proba(X_te)[:, 1]

    return roc_auc_score(y_te, y_pr)

p_selector = GenericUnivariateSelect(**{
    'score_func': mutual_info_classif,
    'mode': 'percentile',
    'param': 15
})

k_selector = GenericUnivariateSelect(**{
    'score_func': mutual_info_classif,
    'mode': 'k_best',
    'param': 2
})

tr_index, te_index = next(StratifiedKFold(n_splits=10, shuffle=True, random_state=37).split(C, d))

print(get_classification_performance(tr_index, te_index, C, d, p_selector))
print(get_classification_performance(tr_index, te_index, C, d, k_selector))
0.9765500000000001
0.977
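
Since chi2 requires non-negative features and the synthetic features here can be negative, one option is to rescale the columns into [0, 1] before selecting. The sketch below is illustrative and not one of the original cells; MinMaxScaler and the chi2_selector name are assumptions. The scaler and selector are wrapped in a small pipeline so the combination can be passed to get_classification_performance as a single selector step.

from sklearn.preprocessing import MinMaxScaler

# chi2 needs a non-negative feature matrix, so scale to [0, 1] first
# (MinMaxScaler is an assumption here, not part of the original notebook)
chi2_selector = Pipeline([
    ('scaler', MinMaxScaler()),
    ('chi2', GenericUnivariateSelect(score_func=chi2, mode='k_best', param=2))
])

print(get_classification_performance(tr_index, te_index, C, d, chi2_selector))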

When your output variable is continuous, use f_regression and mutual_info_regression for the score function.

[3]:
from sklearn.feature_selection import f_regression, mutual_info_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

def get_regression_performance(tr_index, te_index, X, y, selector):
    X_tr, X_te = X[tr_index], X[te_index]
    y_tr, y_te = y[tr_index], y[te_index]

    rf = RandomForestRegressor(max_depth=10, random_state=37, n_jobs=-1)

    model = Pipeline([
        ('selector', selector),
        ('rf', rf)
    ])

    model.fit(X_tr, y_tr)
    y_pr = model.predict(X_te)

    return mean_absolute_error(y_te, y_pr)

fp_selector = GenericUnivariateSelect(**{
    'score_func': f_regression,
    'mode': 'percentile',
    'param': 15
})

mp_selector = GenericUnivariateSelect(**{
    'score_func': mutual_info_regression,
    'mode': 'percentile',
    'param': 15
})

fk_selector = GenericUnivariateSelect(**{
    'score_func': f_regression,
    'mode': 'k_best',
    'param': 2
})

mk_selector = GenericUnivariateSelect(**{
    'score_func': mutual_info_regression,
    'mode': 'k_best',
    'param': 2
})

tr_index, te_index = next(KFold(n_splits=10, shuffle=True, random_state=37).split(A, b))

print(get_regression_performance(tr_index, te_index, A, b, fp_selector))
print(get_regression_performance(tr_index, te_index, A, b, fk_selector))
print(get_regression_performance(tr_index, te_index, A, b, mp_selector))
print(get_regression_performance(tr_index, te_index, A, b, mk_selector))
64.42457694230026
134.7419724227303
80.69262206441475
138.0925739351483

17.2. Models

Models may also be used to select features, via SelectFromModel. The coefficients or feature importances of a fitted model decide which features are kept.

[4]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

lr_selector = SelectFromModel(**{
    'estimator': LogisticRegression(n_jobs=-1),
    'max_features': 5
})

rf_selector = SelectFromModel(**{
    'estimator': RandomForestClassifier(max_depth=10, random_state=37, n_jobs=-1),
    'max_features': 5
})

tr_index, te_index = next(StratifiedKFold(n_splits=10, shuffle=True, random_state=37).split(C, d))

print(get_classification_performance(tr_index, te_index, C, d, lr_selector))
print(get_classification_performance(tr_index, te_index, C, d, rf_selector))
0.9742500000000001
0.9742500000000001
[5]:
from sklearn.linear_model import LinearRegression

lr_selector = SelectFromModel(**{
    'estimator': LinearRegression(n_jobs=-1),
    'max_features': 5
})

rf_selector = SelectFromModel(**{
    'estimator': RandomForestRegressor(max_depth=10, random_state=37, n_jobs=-1),
    'max_features': 5
})

tr_index, te_index = next(KFold(n_splits=10, shuffle=True, random_state=37).split(A, b))

print(get_regression_performance(tr_index, te_index, A, b, lr_selector))
print(get_regression_performance(tr_index, te_index, A, b, rf_selector))
91.75100069641535
76.38248480356026
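
To see which columns a fitted SelectFromModel actually kept, and the importances that drove the decision, you can inspect get_support() and the fitted estimator_. The sketch below is a minimal illustration fitted on the full regression data; it is not one of the original cells.

# fit the selector directly so we can inspect its decisions
selector = SelectFromModel(
    estimator=RandomForestRegressor(max_depth=10, random_state=37, n_jobs=-1),
    max_features=5)
selector.fit(A, b)

# indexes of the selected columns
print(selector.get_support(indices=True))
# the fitted estimator's importances for those columns
print(selector.estimator_.feature_importances_[selector.get_support()])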

17.3. Sequential

Sequential feature selection, by adding (forward) or removing (backward) features one at a time, is done with SequentialFeatureSelector. The cells below use the default forward direction; a backward sketch follows them.

[6]:
from sklearn.feature_selection import SequentialFeatureSelector

lr_selector = SequentialFeatureSelector(**{
    'estimator': LogisticRegression(n_jobs=-1),
    'n_features_to_select': 5,
    'n_jobs': -1,
    'scoring': 'roc_auc'})

rf_selector = SequentialFeatureSelector(**{
    'estimator': RandomForestClassifier(max_depth=10, random_state=37, n_jobs=-1),
    'n_features_to_select': 5,
    'n_jobs': -1,
    'scoring': 'roc_auc'})

tr_index, te_index = next(StratifiedKFold(n_splits=10, shuffle=True, random_state=37).split(C, d))

print(get_classification_performance(tr_index, te_index, C, d, lr_selector))
print(get_classification_performance(tr_index, te_index, C, d, rf_selector))
0.9717999999999999
0.9739000000000001
[7]:
lr_selector = SequentialFeatureSelector(**{
    'estimator': LinearRegression(n_jobs=-1),
    'n_features_to_select': 5,
    'n_jobs': -1,
    'scoring': 'neg_mean_absolute_error'})

rf_selector = SequentialFeatureSelector(**{
    'estimator': RandomForestRegressor(max_depth=10, random_state=37, n_jobs=-1),
    'n_features_to_select': 5,
    'n_jobs': -1,
    'scoring': 'neg_mean_absolute_error'})

tr_index, te_index = next(KFold(n_splits=10, shuffle=True, random_state=37).split(A, b))

print(get_regression_performance(tr_index, te_index, A, b, lr_selector))
print(get_regression_performance(tr_index, te_index, A, b, rf_selector))
76.38248480356026
76.38248480356026
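
Backward elimination only requires setting direction='backward'. The sketch below is illustrative and not one of the original cells; it is also noticeably slower here, since it starts from all 50 features and prunes down to 5.

# backward elimination: start from all features and drop one at a time
backward_selector = SequentialFeatureSelector(
    estimator=LinearRegression(n_jobs=-1),
    n_features_to_select=5,
    direction='backward',
    scoring='neg_mean_absolute_error',
    n_jobs=-1)

print(get_regression_performance(tr_index, te_index, A, b, backward_selector))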

17.4. Recursive feature elimination

Recursive feature elimination with cross-validation is accomplished with RFECV.

[8]:
from sklearn.feature_selection import RFECV

lr_selector = RFECV(LogisticRegression(n_jobs=-1), step=1, cv=5, scoring='roc_auc', n_jobs=-1)
rf_selector = RFECV(RandomForestClassifier(max_depth=10, random_state=37, n_jobs=-1), step=1, cv=5, scoring='roc_auc', n_jobs=-1)

tr_index, te_index = next(StratifiedKFold(n_splits=10, shuffle=True, random_state=37).split(C, d))

print(get_classification_performance(tr_index, te_index, C, d, lr_selector))
print(get_classification_performance(tr_index, te_index, C, d, rf_selector))
0.9783
0.9783
[9]:
lr_selector = RFECV(LinearRegression(n_jobs=-1), step=1, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
rf_selector = RFECV(RandomForestRegressor(max_depth=10, random_state=37, n_jobs=-1), step=1, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)

tr_index, te_index = next(KFold(n_splits=10, shuffle=True, random_state=37).split(A, b))

print(get_regression_performance(tr_index, te_index, A, b, lr_selector))
print(get_regression_performance(tr_index, te_index, A, b, rf_selector))
65.99916557482489
64.42457694230026
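
Unlike the selectors above, RFECV chooses how many features to keep by cross-validation. After fitting, n_features_ and ranking_ tell you how many and which features survived; the sketch below is illustrative and not one of the original cells.

selector = RFECV(LinearRegression(n_jobs=-1), step=1, cv=5,
                 scoring='neg_mean_absolute_error', n_jobs=-1)
selector.fit(A, b)

# number of features RFECV decided to keep
print(selector.n_features_)
# indexes of the kept features (rank 1 means selected)
print(np.where(selector.ranking_ == 1)[0])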

17.5. Feature engineering with selection

Here, we show how to combine feature engineering with feature selection in a text classification problem. First, we check whether feature engineering (vectorization) and selection can help with classification.

[10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

text = [
    'Data Science from Scratch: First Principles with Python',
    'Data Science for Business: What You Need to Know about Data Mining and Data-Analytic Thinking',
    'Practical Statistics for Data Scientists',
    'Build a Career in Data Science',
    'Python Data Science Handbook',
    'Storytelling with Data: A Data Visualization Guide for Business Professionals',
    'R for Data Science: Import, Tidy, Transform, Visualize, and Model Data',
    'Data-Driven Science and Engineering: Machine Learning, Dynamical Systems, and Control',
    'A Hands-On Introduction to Data Science',
    'Intro to Python for Computer Science and Data Science: Learning to Program with AI, Big Data and The Cloud',
    'How Finance Works: The HBR Guide to Thinking Smart About the Numbers',
    'The Intelligent Investor: The Definitive Book on Value Investing. A Book of Practical Counsel',
    'Introduction to Finance: Markets, Investments, and Financial Management',
    'Python for Finance: Mastering Data-Driven Finance',
    'The Infographic Guide to Personal Finance: A Visual Reference for Everything You Need to Know',
    'Personal Finance For Dummies',
    'Corporate Finance For Dummies',
    'Lords of Finance: The Bankers Who Broke the World',
    'Real Estate Finance & Investments',
    'Real Estate Finance and Investments Risks and Opportunities'
]

clazz = [1 for _ in range(10)] + [0 for _ in range(10)]

with open('stop-words.txt', 'r') as f:
    stop_words = set([word.strip() for word in f if len(word.strip()) > 0])

vectorizer = CountVectorizer(binary=True, stop_words=stop_words,
                             ngram_range=(1, 2))
selector = RFECV(RandomForestClassifier(n_estimators=20, n_jobs=-1, random_state=37),
                 step=1, cv=5, scoring='roc_auc', n_jobs=-1)
regressor = LogisticRegression(penalty='l2', solver='liblinear',
                               fit_intercept=False, C=0.01, random_state=37)
pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('selector', selector),
    ('regressor', regressor)
])

pipeline.fit(text, clazz)
y_pred = pipeline.predict(text)
roc_auc_score(clazz, y_pred)
[10]:
1.0

The most important features (words or phrases) are reduced to just three.

[11]:
import pandas as pd

features_selected = sorted([(n, r) for n, r in zip(vectorizer.get_feature_names(), selector.ranking_)],
                           key=lambda tup: tup[1])[0:selector.n_features_]
s = pd.Series(regressor.coef_[0], index=[tup[0] for tup in features_selected])
s
[11]:
data       0.043157
finance   -0.044115
science    0.038370
dtype: float64

Here, we perform a k-fold cross-validation. As can be seen below, the AUC is high (1.0 on every fold).

[12]:
def get_model():
    vectorizer = CountVectorizer(binary=True, stop_words=stop_words,
                             ngram_range=(1, 2))
    selector = RFECV(RandomForestClassifier(n_estimators=20, n_jobs=-1, random_state=37),
                     step=1, cv=5, scoring='roc_auc', n_jobs=-1)
    regressor = LogisticRegression(penalty='l2', solver='liblinear',
                                   fit_intercept=False, C=0.01, random_state=37)
    pipeline = Pipeline([
        ('vectorizer', vectorizer),
        ('selector', selector),
        ('regressor', regressor)
    ])

    return pipeline

results = []

for fold, (tr, te) in enumerate(StratifiedKFold(n_splits=5, shuffle=True, random_state=37).split(text, clazz)):
    X = np.array(text)
    y = np.array(clazz)

    X_tr, X_te = X[tr], X[te]
    y_tr, y_te = y[tr], y[te]

    model = get_model()
    model.fit(X_tr, y_tr)
    y_pred = model.predict_proba(X_te)[:, 1]

    score = roc_auc_score(y_te, y_pred)

    vectorizer = model['vectorizer']
    selector = model['selector']

    features = vectorizer.get_feature_names()
    rankings = selector.ranking_

    features_selected = sorted([(n, r) for n, r in zip(features, rankings)],
                               key=lambda tup: tup[1])[0:selector.n_features_]
    features_selected = [tup[0] for tup in features_selected]

    regressor = model['regressor']
    coefs = regressor.coef_[0]

    features = {}
    for i, (f, c) in enumerate(zip(features_selected, coefs)):
        fname = f'x{i}'
        cname = f'c{i}'
        features[fname] = f
        features[cname] = c

    result = {**{'fold': fold, 'auc': score}, **features}
    results.append(result)
[13]:
pd.DataFrame(results)
[13]:
   fold  auc         x0        c0            x1        c1       x2        c2       x3       c3
0     0  1.0  data data  0.004988  data science  0.029055  finance -0.034398  science  0.03397
1     1  1.0       data  0.033475  data science  0.028640  finance -0.034480  science  0.02864
2     2  1.0       data  0.033888       finance -0.034481  science  0.029056      NaN      NaN
3     3  1.0       data  0.033900       finance -0.039299  science  0.029056      NaN      NaN
4     4  1.0       data  0.038637       finance -0.034398  science  0.033734      NaN      NaN