# 17. Feature Selection

[1]:

from sklearn.datasets import make_regression, make_classification
import numpy as np

np.random.seed(37)

def get_regression_data():
    return make_regression(**{
        'n_samples': 1000,
        'n_features': 50,
        'n_informative': 10,
        'n_targets': 1,
        'bias': 5.3,
        'random_state': 37
    })

def get_classification_data():
    return make_classification(**{
        'n_samples': 2000,
        'n_features': 20,
        'n_informative': 2,
        'n_redundant': 2,
        'n_repeated': 0,
        'n_classes': 2,
        'n_clusters_per_class': 2,
        'random_state': 37
    })

A, b = get_regression_data()
C, d = get_classification_data()


## 17.1. Univariate

Univariate feature selection scores each feature individually against the target. This approach is accomplished through GenericUnivariateSelect. In a classification problem, use chi2 or mutual_info_classif as the score function. Note that chi2 requires your feature matrix to be non-negative, which is why we use mutual_info_classif here. There are a variety of modes; we use the percentile and k_best modes.
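
If you do want chi2 on data that contains negative values, one workaround is to scale the features into a non-negative range first. Below is a minimal sketch of that idea; MinMaxScaler and the pipeline step names are our own illustrative choices, not part of the cells that follow.

from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import GenericUnivariateSelect, chi2
from sklearn.pipeline import Pipeline

# chi2 needs non-negative inputs; MinMaxScaler maps each feature into [0, 1]
chi2_selector = Pipeline([
    ('scaler', MinMaxScaler()),
    ('selector', GenericUnivariateSelect(score_func=chi2, mode='k_best', param=2))
])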

[2]:

from sklearn.feature_selection import GenericUnivariateSelect
from sklearn.feature_selection import chi2, mutual_info_classif
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

def get_best_indexes(scores, max_index, reverse=True):
    # rank features by score and return the indexes of the top max_index features
    tups = sorted([(i, s) for i, s in enumerate(scores)], key=lambda tup: tup[1], reverse=reverse)
    tups = tups[:max_index]
    return [t[0] for t in tups]

def get_classification_performance(tr_index, te_index, X, y, selector):
    # fit a selector + random forest pipeline on the training fold, score AUC on the test fold
    X_tr, X_te = X[tr_index], X[te_index]
    y_tr, y_te = y[tr_index], y[te_index]

    rf = RandomForestClassifier(max_depth=10, random_state=37, n_jobs=-1)

    model = Pipeline([
        ('selector', selector),
        ('rf', rf)
    ])

    model.fit(X_tr, y_tr)
    y_pr = model.predict_proba(X_te)[:, 1]

    return roc_auc_score(y_te, y_pr)

p_selector = GenericUnivariateSelect(**{
    'score_func': mutual_info_classif,
    'mode': 'percentile',
    'param': 15
})

k_selector = GenericUnivariateSelect(**{
    'score_func': mutual_info_classif,
    'mode': 'k_best',
    'param': 2
})

tr_index, te_index = next(StratifiedKFold(n_splits=10, shuffle=True, random_state=37).split(C, d))

print(get_classification_performance(tr_index, te_index, C, d, p_selector))
print(get_classification_performance(tr_index, te_index, C, d, k_selector))

0.9765500000000001
0.977
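
After fitting, a selector exposes which columns it kept via get_support. A minimal sketch, fitting the percentile selector by itself on the training fold (outside the pipeline above):

p_selector.fit(C[tr_index], d[tr_index])
print(p_selector.get_support(indices=True))  # column indices of the selected features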


When your output variable is continuous, use f_regression or mutual_info_regression as the score function.

[3]:

from sklearn.feature_selection import f_regression, mutual_info_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

def get_regression_performance(tr_index, te_index, X, y, selector):
    # fit a selector + random forest pipeline on the training fold, score MAE on the test fold
    X_tr, X_te = X[tr_index], X[te_index]
    y_tr, y_te = y[tr_index], y[te_index]

    rf = RandomForestRegressor(max_depth=10, random_state=37, n_jobs=-1)

    model = Pipeline([
        ('selector', selector),
        ('rf', rf)
    ])

    model.fit(X_tr, y_tr)
    y_pr = model.predict(X_te)

    return mean_absolute_error(y_te, y_pr)

fp_selector = GenericUnivariateSelect(**{
    'score_func': f_regression,
    'mode': 'percentile',
    'param': 15
})

mp_selector = GenericUnivariateSelect(**{
    'score_func': mutual_info_regression,
    'mode': 'percentile',
    'param': 15
})

fk_selector = GenericUnivariateSelect(**{
    'score_func': f_regression,
    'mode': 'k_best',
    'param': 2
})

mk_selector = GenericUnivariateSelect(**{
    'score_func': mutual_info_regression,
    'mode': 'k_best',
    'param': 2
})

tr_index, te_index = next(KFold(n_splits=10, shuffle=True, random_state=37).split(A, b))

print(get_regression_performance(tr_index, te_index, A, b, fp_selector))
print(get_regression_performance(tr_index, te_index, A, b, fk_selector))
print(get_regression_performance(tr_index, te_index, A, b, mp_selector))
print(get_regression_performance(tr_index, te_index, A, b, mk_selector))

64.42457694230026
134.7419724227303
80.69262206441475
138.0925739351483
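
A fitted GenericUnivariateSelect also stores the per-feature scores, which helps explain why f_regression and mutual_info_regression keep different features. A minimal sketch using the fp_selector above:

fp_selector.fit(A[tr_index], b[tr_index])
print(fp_selector.scores_[:5])   # F-statistics for the first five features
print(fp_selector.pvalues_[:5])  # corresponding p-values (None for mutual-information scorers)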


## 17.2. Models

Models may also be used to select features through SelectFromModel. The coefficients or feature importances of a fitted model determine which features are kept.
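
Besides max_features, SelectFromModel accepts a threshold; for example, 'median' keeps only features whose importance is at or above the median importance. A minimal sketch (the threshold value here is our own choice):

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

# keep only features whose importance is at or above the median importance
median_selector = SelectFromModel(
    estimator=RandomForestClassifier(max_depth=10, random_state=37, n_jobs=-1),
    threshold='median'
)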

[4]:

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

lr_selector = SelectFromModel(**{
    'estimator': LogisticRegression(n_jobs=-1),
    'max_features': 5
})

rf_selector = SelectFromModel(**{
    'estimator': RandomForestClassifier(max_depth=10, random_state=37, n_jobs=-1),
    'max_features': 5
})

tr_index, te_index = next(StratifiedKFold(n_splits=10, shuffle=True, random_state=37).split(C, d))

print(get_classification_performance(tr_index, te_index, C, d, lr_selector))
print(get_classification_performance(tr_index, te_index, C, d, rf_selector))

0.9742500000000001
0.9742500000000001

[5]:

from sklearn.linear_model import LinearRegression

lr_selector = SelectFromModel(**{
    'estimator': LinearRegression(n_jobs=-1),
    'max_features': 5
})

rf_selector = SelectFromModel(**{
    'estimator': RandomForestRegressor(max_depth=10, random_state=37, n_jobs=-1),
    'max_features': 5
})

tr_index, te_index = next(KFold(n_splits=10, shuffle=True, random_state=37).split(A, b))

print(get_regression_performance(tr_index, te_index, A, b, lr_selector))
print(get_regression_performance(tr_index, te_index, A, b, rf_selector))

91.75100069641535
76.38248480356026


## 17.3. Sequential

Sequential feature selection, which adds (forward) or removes (backward) features one at a time, is done with SequentialFeatureSelector.
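
By default the search runs forward; passing direction='backward' starts from all features and removes them instead. A minimal sketch:

from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression

# backward elimination: start with all features, drop the least useful one per step
backward_selector = SequentialFeatureSelector(
    estimator=LogisticRegression(n_jobs=-1),
    n_features_to_select=5,
    direction='backward',
    scoring='roc_auc',
    n_jobs=-1
)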

[6]:

from sklearn.feature_selection import SequentialFeatureSelector

lr_selector = SequentialFeatureSelector(**{
    'estimator': LogisticRegression(n_jobs=-1),
    'n_features_to_select': 5,
    'n_jobs': -1,
    'scoring': 'roc_auc'
})

rf_selector = SequentialFeatureSelector(**{
    'estimator': RandomForestClassifier(max_depth=10, random_state=37, n_jobs=-1),
    'n_features_to_select': 5,
    'n_jobs': -1,
    'scoring': 'roc_auc'
})

tr_index, te_index = next(StratifiedKFold(n_splits=10, shuffle=True, random_state=37).split(C, d))

print(get_classification_performance(tr_index, te_index, C, d, lr_selector))
print(get_classification_performance(tr_index, te_index, C, d, rf_selector))

0.9717999999999999
0.9739000000000001

[7]:

lr_selector = SequentialFeatureSelector(**{
    'estimator': LinearRegression(n_jobs=-1),
    'n_features_to_select': 5,
    'n_jobs': -1,
    'scoring': 'neg_mean_absolute_error'
})

rf_selector = SequentialFeatureSelector(**{
    'estimator': RandomForestRegressor(max_depth=10, random_state=37, n_jobs=-1),
    'n_features_to_select': 5,
    'n_jobs': -1,
    'scoring': 'neg_mean_absolute_error'
})

tr_index, te_index = next(KFold(n_splits=10, shuffle=True, random_state=37).split(A, b))

print(get_regression_performance(tr_index, te_index, A, b, lr_selector))
print(get_regression_performance(tr_index, te_index, A, b, rf_selector))

76.38248480356026
76.38248480356026


## 17.4. Recursive feature elimination

Recursive feature elimination with cross-validation is accomplished with RFECV. It repeatedly fits the estimator, prunes the least important features at each step, and uses cross-validation to decide how many features to keep.

[8]:

from sklearn.feature_selection import RFECV

lr_selector = RFECV(LogisticRegression(n_jobs=-1), step=1, cv=5, scoring='roc_auc', n_jobs=-1)
rf_selector = RFECV(RandomForestClassifier(max_depth=10, random_state=37, n_jobs=-1), step=1, cv=5, scoring='roc_auc', n_jobs=-1)

tr_index, te_index = next(StratifiedKFold(n_splits=10, shuffle=True, random_state=37).split(C, d))

print(get_classification_performance(tr_index, te_index, C, d, lr_selector))
print(get_classification_performance(tr_index, te_index, C, d, rf_selector))

0.9783
0.9783
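
A fitted RFECV reports how many features survived and which ones. A minimal sketch, refitting the lr_selector above on the training fold:

lr_selector.fit(C[tr_index], d[tr_index])
print(lr_selector.n_features_)                # number of features RFECV kept
print(lr_selector.get_support(indices=True))  # indices of the kept features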

[9]:

lr_selector = RFECV(LinearRegression(n_jobs=-1), step=1, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
rf_selector = RFECV(RandomForestRegressor(max_depth=10, random_state=37, n_jobs=-1), step=1, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)

tr_index, te_index = next(KFold(n_splits=10, shuffle=True, random_state=37).split(A, b))

print(get_regression_performance(tr_index, te_index, A, b, lr_selector))
print(get_regression_performance(tr_index, te_index, A, b, rf_selector))

65.99916557482489
64.42457694230026


## 17.5. Feature engineering with selection

Here, we show an example of combining feature engineering with feature selection in a text classification problem. First, we check whether feature engineering (vectorization) followed by selection can help with classification.

[10]:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

text = [
    'Data Science from Scratch: First Principles with Python',
    'Data Science for Business: What You Need to Know about Data Mining and Data-Analytic Thinking',
    'Practical Statistics for Data Scientists',
    'Build a Career in Data Science',
    'Python Data Science Handbook',
    'Storytelling with Data: A Data Visualization Guide for Business Professionals',
    'R for Data Science: Import, Tidy, Transform, Visualize, and Model Data',
    'Data-Driven Science and Engineering: Machine Learning, Dynamical Systems, and Control',
    'A Hands-On Introduction to Data Science',
    'Intro to Python for Computer Science and Data Science: Learning to Program with AI, Big Data and The Cloud',
    'How Finance Works: The HBR Guide to Thinking Smart About the Numbers',
    'The Intelligent Investor: The Definitive Book on Value Investing. A Book of Practical Counsel',
    'Introduction to Finance: Markets, Investments, and Financial Management',
    'Python for Finance: Mastering Data-Driven Finance',
    'The Infographic Guide to Personal Finance: A Visual Reference for Everything You Need to Know',
    'Personal Finance For Dummies',
    'Corporate Finance For Dummies',
    'Lords of Finance: The Bankers Who Broke the World',
    'Real Estate Finance & Investments',
    'Real Estate Finance and Investments Risks and Opportunities'
]

clazz = [1 for _ in range(10)] + [0 for _ in range(10)]  # 1 = data science title, 0 = finance title

with open('stop-words.txt', 'r') as f:
    stop_words = set([word.strip() for word in f if len(word.strip()) > 0])

vectorizer = CountVectorizer(binary=True, stop_words=stop_words,
                             ngram_range=(1, 2))
selector = RFECV(RandomForestClassifier(n_estimators=20, n_jobs=-1, random_state=37),
                 step=1, cv=5, scoring='roc_auc', n_jobs=-1)
regressor = LogisticRegression(penalty='l2', solver='liblinear',
                               fit_intercept=False, C=0.01, random_state=37)
pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('selector', selector),
    ('regressor', regressor)
])

pipeline.fit(text, clazz)
y_pred = pipeline.predict(text)
roc_auc_score(clazz, y_pred)

[10]:

1.0


The most important features (words or phrases) are reduced to just three: data, finance, and science.

[11]:

import pandas as pd

features_selected = sorted([(n, r) for n, r in zip(vectorizer.get_feature_names(), selector.ranking_)],
                           key=lambda tup: tup[1])[0:selector.n_features_]
s = pd.Series(regressor.coef_[0], index=[tup[0] for tup in features_selected])
s

[11]:

data       0.043157
finance   -0.044115
science    0.038370
dtype: float64


Here, we perform k-fold cross-validation. As can be seen below, the ROC AUC is 1.0 on every fold.

[12]:

def get_model():
    vectorizer = CountVectorizer(binary=True, stop_words=stop_words,
                                 ngram_range=(1, 2))
    selector = RFECV(RandomForestClassifier(n_estimators=20, n_jobs=-1, random_state=37),
                     step=1, cv=5, scoring='roc_auc', n_jobs=-1)
    regressor = LogisticRegression(penalty='l2', solver='liblinear',
                                   fit_intercept=False, C=0.01, random_state=37)
    pipeline = Pipeline([
        ('vectorizer', vectorizer),
        ('selector', selector),
        ('regressor', regressor)
    ])

    return pipeline

results = []

for fold, (tr, te) in enumerate(StratifiedKFold(n_splits=5, shuffle=True, random_state=37).split(text, clazz)):
    X = np.array(text)
    y = np.array(clazz)

    X_tr, X_te = X[tr], X[te]
    y_tr, y_te = y[tr], y[te]

    model = get_model()
    model.fit(X_tr, y_tr)
    y_pred = model.predict_proba(X_te)[:, 1]

    score = roc_auc_score(y_te, y_pred)

    vectorizer = model['vectorizer']
    selector = model['selector']

    features = vectorizer.get_feature_names()
    rankings = selector.ranking_

    features_selected = sorted([(n, r) for n, r in zip(features, rankings)],
                               key=lambda tup: tup[1])[0:selector.n_features_]
    features_selected = [tup[0] for tup in features_selected]

    regressor = model['regressor']
    coefs = regressor.coef_[0]

    features = {}
    for i, (f, c) in enumerate(zip(features_selected, coefs)):
        fname = f'x{i}'
        cname = f'c{i}'
        features[fname] = f
        features[cname] = c

    result = {**{'fold': fold, 'auc': score}, **features}
    results.append(result)

[13]:

pd.DataFrame(results)

[13]:

   fold  auc         x0        c0            x1        c1       x2        c2       x3       c3
0     0  1.0  data data  0.004988  data science  0.029055  finance -0.034398  science  0.03397
1     1  1.0       data  0.033475  data science  0.028640  finance -0.034480  science  0.02864
2     2  1.0       data  0.033888       finance -0.034481  science  0.029056      NaN      NaN
3     3  1.0       data  0.033900       finance -0.039299  science  0.029056      NaN      NaN
4     4  1.0       data  0.038637       finance -0.034398  science  0.033734      NaN      NaN