# 17. Feature Selection

[1]:

from sklearn.datasets import make_regression, make_classification
import numpy as np

np.random.seed(37)

def get_regression_data():
    return make_regression(**{
        'n_samples': 1000,
        'n_features': 50,
        'n_informative': 10,
        'n_targets': 1,
        'bias': 5.3,
        'random_state': 37
    })

def get_classification_data():
    return make_classification(**{
        'n_samples': 2000,
        'n_features': 20,
        'n_informative': 2,
        'n_redundant': 2,
        'n_repeated': 0,
        'n_classes': 2,
        'n_clusters_per_class': 2,
        'random_state': 37
    })

A, b = get_regression_data()
C, d = get_classification_data()


## 17.1. Univariate

Univariate feature selection scores each feature individually against the target. This approach is accomplished through GenericUnivariateSelect. In a classification problem, use chi2 or mutual_info_classif as the score function. Note that chi2 requires your feature matrix to be non-negative, which is why we use mutual_info_classif here. There are a variety of modes; we use the percentile and k_best modes.
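
If you do want chi2 on data that contains negative values, one workaround is to scale the features into a non-negative range first. Below is a minimal sketch of that idea; MinMaxScaler and the pipeline step names are our own illustrative choices, not part of the cells that follow.

from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import GenericUnivariateSelect, chi2
from sklearn.pipeline import Pipeline

# chi2 needs non-negative inputs; MinMaxScaler maps each feature into [0, 1]
chi2_selector = Pipeline([
    ('scaler', MinMaxScaler()),
    ('selector', GenericUnivariateSelect(score_func=chi2, mode='k_best', param=2))
])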

[2]:

from sklearn.feature_selection import GenericUnivariateSelect
from sklearn.feature_selection import chi2, mutual_info_classif
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

def get_best_indexes(scores, max_index, reverse=True):
    # rank features by score and return the indexes of the top max_index features
    tups = sorted([(i, s) for i, s in enumerate(scores)], key=lambda tup: tup[1], reverse=reverse)
    tups = tups[:max_index]
    return [t[0] for t in tups]

def get_classification_performance(tr_index, te_index, X, y, selector):
    # fit a selector + random forest pipeline on the training fold, score AUC on the test fold
    X_tr, X_te = X[tr_index], X[te_index]
    y_tr, y_te = y[tr_index], y[te_index]

    rf = RandomForestClassifier(max_depth=10, random_state=37, n_jobs=-1)

    model = Pipeline([
        ('selector', selector),
        ('rf', rf)
    ])

    model.fit(X_tr, y_tr)
    y_pr = model.predict_proba(X_te)[:, 1]

    return roc_auc_score(y_te, y_pr)

p_selector = GenericUnivariateSelect(**{
    'score_func': mutual_info_classif,
    'mode': 'percentile',
    'param': 15
})

k_selector = GenericUnivariateSelect(**{
    'score_func': mutual_info_classif,
    'mode': 'k_best',
    'param': 2
})

tr_index, te_index = next(StratifiedKFold(n_splits=10, shuffle=True, random_state=37).split(C, d))

print(get_classification_performance(tr_index, te_index, C, d, p_selector))
print(get_classification_performance(tr_index, te_index, C, d, k_selector))

0.9765500000000001
0.977
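
After fitting, a selector exposes which columns it kept via get_support. A minimal sketch, fitting the percentile selector by itself on the training fold (outside the pipeline above):

p_selector.fit(C[tr_index], d[tr_index])
print(p_selector.get_support(indices=True))  # column indices of the selected features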


When your output variable is continuous, use f_regression or mutual_info_regression as the score function.

[3]:

from sklearn.feature_selection import f_regression, mutual_info_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

def get_regression_performance(tr_index, te_index, X, y, selector):
    # fit a selector + random forest pipeline on the training fold, score MAE on the test fold
    X_tr, X_te = X[tr_index], X[te_index]
    y_tr, y_te = y[tr_index], y[te_index]

    rf = RandomForestRegressor(max_depth=10, random_state=37, n_jobs=-1)

    model = Pipeline([
        ('selector', selector),
        ('rf', rf)
    ])

    model.fit(X_tr, y_tr)
    y_pr = model.predict(X_te)

    return mean_absolute_error(y_te, y_pr)

fp_selector = GenericUnivariateSelect(**{
    'score_func': f_regression,
    'mode': 'percentile',
    'param': 15
})

mp_selector = GenericUnivariateSelect(**{
    'score_func': mutual_info_regression,
    'mode': 'percentile',
    'param': 15
})

fk_selector = GenericUnivariateSelect(**{
    'score_func': f_regression,
    'mode': 'k_best',
    'param': 2
})

mk_selector = GenericUnivariateSelect(**{
    'score_func': mutual_info_regression,
    'mode': 'k_best',
    'param': 2
})

tr_index, te_index = next(KFold(n_splits=10, shuffle=True, random_state=37).split(A, b))

print(get_regression_performance(tr_index, te_index, A, b, fp_selector))
print(get_regression_performance(tr_index, te_index, A, b, fk_selector))
print(get_regression_performance(tr_index, te_index, A, b, mp_selector))
print(get_regression_performance(tr_index, te_index, A, b, mk_selector))

64.42457694230026
134.7419724227303
80.69262206441475
138.0925739351483
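
A fitted GenericUnivariateSelect also stores the per-feature scores, which helps explain why f_regression and mutual_info_regression keep different features. A minimal sketch using the fp_selector above:

fp_selector.fit(A[tr_index], b[tr_index])
print(fp_selector.scores_[:5])   # F-statistics for the first five features
print(fp_selector.pvalues_[:5])  # corresponding p-values (None for mutual-information scorers)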


## 17.2. Models

Models may also be used to select features through SelectFromModel. The coefficients or feature importances of a fitted model determine which features are kept.
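
Besides max_features, SelectFromModel accepts a threshold; for example, 'median' keeps only features whose importance is at or above the median importance. A minimal sketch (the threshold value here is our own choice):

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

# keep only features whose importance is at or above the median importance
median_selector = SelectFromModel(
    estimator=RandomForestClassifier(max_depth=10, random_state=37, n_jobs=-1),
    threshold='median'
)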

[4]:

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

lr_selector = SelectFromModel(**{
    'estimator': LogisticRegression(n_jobs=-1),
    'max_features': 5
})

rf_selector = SelectFromModel(**{
    'estimator': RandomForestClassifier(max_depth=10, random_state=37, n_jobs=-1),
    'max_features': 5
})

tr_index, te_index = next(StratifiedKFold(n_splits=10, shuffle=True, random_state=37).split(C, d))

print(get_classification_performance(tr_index, te_index, C, d, lr_selector))
print(get_classification_performance(tr_index, te_index, C, d, rf_selector))

0.9742500000000001
0.9742500000000001

[5]:

from sklearn.linear_model import LinearRegression

lr_selector = SelectFromModel(**{
    'estimator': LinearRegression(n_jobs=-1),
    'max_features': 5
})

rf_selector = SelectFromModel(**{
    'estimator': RandomForestRegressor(max_depth=10, random_state=37, n_jobs=-1),
    'max_features': 5
})

tr_index, te_index = next(KFold(n_splits=10, shuffle=True, random_state=37).split(A, b))

print(get_regression_performance(tr_index, te_index, A, b, lr_selector))
print(get_regression_performance(tr_index, te_index, A, b, rf_selector))

91.75100069641535
76.38248480356026


## 17.3. Sequential

Sequential feature selection, which adds (forward) or removes (backward) features one at a time, is done with SequentialFeatureSelector.
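
By default the search runs forward; passing direction='backward' starts from all features and removes them instead. A minimal sketch:

from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression

# backward elimination: start with all features, drop the least useful one per step
backward_selector = SequentialFeatureSelector(
    estimator=LogisticRegression(n_jobs=-1),
    n_features_to_select=5,
    direction='backward',
    scoring='roc_auc',
    n_jobs=-1
)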

[6]:

from sklearn.feature_selection import SequentialFeatureSelector

lr_selector = SequentialFeatureSelector(**{
    'estimator': LogisticRegression(n_jobs=-1),
    'n_features_to_select': 5,
    'n_jobs': -1,
    'scoring': 'roc_auc'
})

rf_selector = SequentialFeatureSelector(**{
    'estimator': RandomForestClassifier(max_depth=10, random_state=37, n_jobs=-1),
    'n_features_to_select': 5,
    'n_jobs': -1,
    'scoring': 'roc_auc'
})

tr_index, te_index = next(StratifiedKFold(n_splits=10, shuffle=True, random_state=37).split(C, d))

print(get_classification_performance(tr_index, te_index, C, d, lr_selector))
print(get_classification_performance(tr_index, te_index, C, d, rf_selector))

0.9717999999999999
0.9739000000000001

[7]:

lr_selector = SequentialFeatureSelector(**{
    'estimator': LinearRegression(n_jobs=-1),
    'n_features_to_select': 5,
    'n_jobs': -1,
    'scoring': 'neg_mean_absolute_error'
})

rf_selector = SequentialFeatureSelector(**{
    'estimator': RandomForestRegressor(max_depth=10, random_state=37, n_jobs=-1),
    'n_features_to_select': 5,
    'n_jobs': -1,
    'scoring': 'neg_mean_absolute_error'
})

tr_index, te_index = next(KFold(n_splits=10, shuffle=True, random_state=37).split(A, b))

print(get_regression_performance(tr_index, te_index, A, b, lr_selector))
print(get_regression_performance(tr_index, te_index, A, b, rf_selector))

76.38248480356026
76.38248480356026


## 17.4. Recursive feature elimination

Recursive feature elimination with cross-validation is accomplished with RFECV. It repeatedly fits the estimator, prunes the least important features at each step, and uses cross-validation to decide how many features to keep.

[8]:

from sklearn.feature_selection import RFECV

lr_selector = RFECV(LogisticRegression(n_jobs=-1), step=1, cv=5, scoring='roc_auc', n_jobs=-1)
rf_selector = RFECV(RandomForestClassifier(max_depth=10, random_state=37, n_jobs=-1), step=1, cv=5, scoring='roc_auc', n_jobs=-1)

tr_index, te_index = next(StratifiedKFold(n_splits=10, shuffle=True, random_state=37).split(C, d))

print(get_classification_performance(tr_index, te_index, C, d, lr_selector))
print(get_classification_performance(tr_index, te_index, C, d, rf_selector))

0.9783
0.9783
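
A fitted RFECV reports how many features survived and which ones. A minimal sketch, refitting the lr_selector above on the training fold:

lr_selector.fit(C[tr_index], d[tr_index])
print(lr_selector.n_features_)                # number of features RFECV kept
print(lr_selector.get_support(indices=True))  # indices of the kept features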

[9]:

lr_selector = RFECV(LinearRegression(n_jobs=-1), step=1, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
rf_selector = RFECV(RandomForestRegressor(max_depth=10, random_state=37, n_jobs=-1), step=1, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)

tr_index, te_index = next(KFold(n_splits=10, shuffle=True, random_state=37).split(A, b))

print(get_regression_performance(tr_index, te_index, A, b, lr_selector))
print(get_regression_performance(tr_index, te_index, A, b, rf_selector))

65.99916557482489
64.42457694230026


## 17.5. Feature engineering with selection

Here, we show an example of combining feature engineering with feature selection in a text classification problem. First, we check whether feature engineering (vectorization) followed by selection can help with classification.

[10]:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

text = [
    'Data Science from Scratch: First Principles with Python',
    'Data Science for Business: What You Need to Know about Data Mining and Data-Analytic Thinking',
    'Practical Statistics for Data Scientists',
    'Build a Career in Data Science',
    'Python Data Science Handbook',
    'Storytelling with Data: A Data Visualization Guide for Business Professionals',
    'R for Data Science: Import, Tidy, Transform, Visualize, and Model Data',
    'Data-Driven Science and Engineering: Machine Learning, Dynamical Systems, and Control',
    'A Hands-On Introduction to Data Science',
    'Intro to Python for Computer Science and Data Science: Learning to Program with AI, Big Data and The Cloud',
    'How Finance Works: The HBR Guide to Thinking Smart About the Numbers',
    'The Intelligent Investor: The Definitive Book on Value Investing. A Book of Practical Counsel',
    'Introduction to Finance: Markets, Investments, and Financial Management',
    'Python for Finance: Mastering Data-Driven Finance',
    'The Infographic Guide to Personal Finance: A Visual Reference for Everything You Need to Know',
    'Personal Finance For Dummies',
    'Corporate Finance For Dummies',
    'Lords of Finance: The Bankers Who Broke the World',
    'Real Estate Finance & Investments',
    'Real Estate Finance and Investments Risks and Opportunities'
]

clazz = [1 for _ in range(10)] + [0 for _ in range(10)]  # 1 = data science title, 0 = finance title

with open('stop-words.txt', 'r') as f:
    stop_words = set([word.strip() for word in f if len(word.strip()) > 0])

vectorizer = CountVectorizer(binary=True, stop_words=stop_words,
                             ngram_range=(1, 2))
selector = RFECV(RandomForestClassifier(n_estimators=20, n_jobs=-1, random_state=37),
                 step=1, cv=5, scoring='roc_auc', n_jobs=-1)
regressor = LogisticRegression(penalty='l2', solver='liblinear',
                               fit_intercept=False, C=0.01, random_state=37)
pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('selector', selector),
    ('regressor', regressor)
])

pipeline.fit(text, clazz)
y_pred = pipeline.predict(text)
roc_auc_score(clazz, y_pred)

[10]:

1.0


The most important features (words or phrases) are reduced to just three: data, finance, and science.

[11]:

import pandas as pd

features_selected = sorted([(n, r) for n, r in zip(vectorizer.get_feature_names(), selector.ranking_)],
                           key=lambda tup: tup[1])[0:selector.n_features_]
s = pd.Series(regressor.coef_[0], index=[tup[0] for tup in features_selected])
s

[11]:

data       0.043157
finance   -0.044115
science    0.038370
dtype: float64


Here, we perform k-fold cross-validation. As can be seen below, the ROC AUC is 1.0 on every fold.

[12]:

def get_model():
    vectorizer = CountVectorizer(binary=True, stop_words=stop_words,
                                 ngram_range=(1, 2))
    selector = RFECV(RandomForestClassifier(n_estimators=20, n_jobs=-1, random_state=37),
                     step=1, cv=5, scoring='roc_auc', n_jobs=-1)
    regressor = LogisticRegression(penalty='l2', solver='liblinear',
                                   fit_intercept=False, C=0.01, random_state=37)
    pipeline = Pipeline([
        ('vectorizer', vectorizer),
        ('selector', selector),
        ('regressor', regressor)
    ])

    return pipeline

results = []

for fold, (tr, te) in enumerate(StratifiedKFold(n_splits=5, shuffle=True, random_state=37).split(text, clazz)):
    X = np.array(text)
    y = np.array(clazz)

    X_tr, X_te = X[tr], X[te]
    y_tr, y_te = y[tr], y[te]

    model = get_model()
    model.fit(X_tr, y_tr)
    y_pred = model.predict_proba(X_te)[:, 1]

    score = roc_auc_score(y_te, y_pred)

    vectorizer = model['vectorizer']
    selector = model['selector']

    features = vectorizer.get_feature_names()
    rankings = selector.ranking_

    features_selected = sorted([(n, r) for n, r in zip(features, rankings)],
                               key=lambda tup: tup[1])[0:selector.n_features_]
    features_selected = [tup[0] for tup in features_selected]

    regressor = model['regressor']
    coefs = regressor.coef_[0]

    features = {}
    for i, (f, c) in enumerate(zip(features_selected, coefs)):
        fname = f'x{i}'
        cname = f'c{i}'
        features[fname] = f
        features[cname] = c

    result = {**{'fold': fold, 'auc': score}, **features}
    results.append(result)

[13]:

pd.DataFrame(results)

[13]:

   fold  auc         x0        c0            x1        c1       x2        c2       x3       c3
0     0  1.0  data data  0.004988  data science  0.029055  finance -0.034398  science  0.03397
1     1  1.0       data  0.033475  data science  0.028640  finance -0.034480  science  0.02864
2     2  1.0       data  0.033888       finance -0.034481  science  0.029056      NaN      NaN
3     3  1.0       data  0.033900       finance -0.039299  science  0.029056      NaN      NaN
4     4  1.0       data  0.038637       finance -0.034398  science  0.033734      NaN      NaN