11. Data Preprocessing
11.1. Binarizer
[1]:
# Binarizer demo on a single-column data set. With the default settings
# used here, the displayed result maps each 1.0 to 1. and each -1.0 to 0.
from sklearn.preprocessing import Binarizer
X = [[1.0],
[-1.0],
[1.0],
[-1.0]]
t = Binarizer()
t.fit(X)
# Last expression of the cell, so the transformed array is what gets shown.
t.transform(X)
[1]:
array([[1.],
[0.],
[1.],
[0.]])
11.2. K-bins discretizer
[2]:
# KBinsDiscretizer demo: two mirrored columns ranging over [-2, 2] are each
# cut into 3 bins and encoded as ordinal bin indices (0., 1., 2. in the
# displayed result).
from sklearn.preprocessing import KBinsDiscretizer
X = [[-2.0, 2.0],
[-1.0, 1.0],
[0.0, 0.0],
[1.0, -1.0],
[2.0, -2.0]]
t = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
t.fit(X)
# Last expression of the cell: the per-column bin index of every sample.
t.transform(X)
[2]:
array([[0., 2.],
[0., 2.],
[1., 1.],
[2., 0.],
[2., 0.]])
11.3. Label binarizer
[3]:
# LabelBinarizer demo. The binarizer is fitted on the class list [0, 1, 2]
# rather than on X itself; X is only transformed in the next cell.
from sklearn.preprocessing import LabelBinarizer
X = [0, 0, 2, 2, 1, 1]
t = LabelBinarizer()
# fit() is the last expression, so the cell displays the fitted estimator.
t.fit([0, 1, 2])
[3]:
LabelBinarizer()
[4]:
# Encode each label in X as a row with a single 1 in the column of its class
# (three columns, one per fitted class); keep the result in `o` for the
# round trip in the next cell.
o = t.transform(X)
o
[4]:
array([[1, 0, 0],
[1, 0, 0],
[0, 0, 1],
[0, 0, 1],
[0, 1, 0],
[0, 1, 0]])
[5]:
# Round trip: map the indicator rows in `o` back to the original labels.
t.inverse_transform(o)
[5]:
array([0, 0, 2, 2, 1, 1])
11.4. Label encoder
[6]:
# LabelEncoder demo: fit on the three distinct labels 99, 100 and 200.
from sklearn.preprocessing import LabelEncoder
X = [99, 100, 200]
t = LabelEncoder()
# fit() is the last expression, so the cell displays the fitted estimator.
t.fit(X)
[6]:
LabelEncoder()
[7]:
# Encode a longer label sequence; in the displayed result 99 -> 0,
# 100 -> 1 and 200 -> 2. `o` is reused by the next cell.
o = t.transform([99, 99, 100, 200, 100, 200])
o
[7]:
array([0, 0, 1, 2, 1, 2])
[8]:
# Round trip: decode the integer codes in `o` back to the original labels.
t.inverse_transform(o)
[8]:
array([ 99, 99, 100, 200, 100, 200])
11.5. Multi-label binarizer
[9]:
# MultiLabelBinarizer demo: each sample in X is a list of labels, and the
# samples may carry different numbers of labels.
from sklearn.preprocessing import MultiLabelBinarizer
X = [[3, 4], [5], [6]]
t = MultiLabelBinarizer()
# fit() is the last expression, so the cell displays the fitted estimator.
t.fit(X)
[9]:
MultiLabelBinarizer()
[10]:
# Encode every sample as a 0/1 indicator row with one column per label
# seen during fit (four columns for labels 3, 4, 5, 6 in the output).
o = t.transform(X)
o
[10]:
array([[1, 1, 0, 0],
[0, 0, 1, 0],
[0, 0, 0, 1]])
[11]:
# Round trip: the displayed result is a list with one tuple of labels
# per sample.
t.inverse_transform(o)
[11]:
[(3, 4), (5,), (6,)]
11.6. Maximum absolute scaler
[12]:
# MaxAbsScaler demo on a 3x3 integer matrix containing negative values.
from sklearn.preprocessing import MaxAbsScaler
X = [[0, -1, 0],
[1, 0, -1],
[2, 1, 2]]
t = MaxAbsScaler()
# fit() is the last expression, so the cell displays the fitted estimator.
t.fit(X)
[12]:
MaxAbsScaler()
[13]:
# Scale each column; in the displayed result every column is divided by its
# largest absolute value (2, 1 and 2), so all values fall in [-1, 1].
o = t.transform(X)
o
[13]:
array([[ 0. , -1. , 0. ],
[ 0.5, 0. , -0.5],
[ 1. , 1. , 1. ]])
[14]:
# Round trip: undo the scaling to recover the original values (as floats).
t.inverse_transform(o)
[14]:
array([[ 0., -1., 0.],
[ 1., 0., -1.],
[ 2., 1., 2.]])
11.7. Min-max scaler
[15]:
# MinMaxScaler demo: three columns with different ranges
# (0..2, 0..10 and -2..0).
from sklearn.preprocessing import MinMaxScaler
X = [[0, 10, 0],
[1, 5, -1],
[2, 0, -2]]
t = MinMaxScaler()
# fit() is the last expression, so the cell displays the fitted estimator.
t.fit(X)
[15]:
MinMaxScaler()
[16]:
# Rescale each column; in the displayed result the column minimum maps to 0
# and the column maximum maps to 1.
o = t.transform(X)
o
[16]:
array([[0. , 1. , 1. ],
[0.5, 0.5, 0.5],
[1. , 0. , 0. ]])
[17]:
# Round trip: undo the scaling to recover the original values (as floats).
t.inverse_transform(o)
[17]:
array([[ 0., 10., 0.],
[ 1., 5., -1.],
[ 2., 0., -2.]])
11.8. Normalizer
[18]:
# Normalizer demo on a 3x4 matrix, using the default settings.
from sklearn.preprocessing import Normalizer
X = [[1, 2, 3, 4],
[5, 6, 7, 8],
[9, 10, 11, 12]]
t = Normalizer()
# fit() is the last expression, so the cell displays the fitted estimator.
t.fit(X)
[18]:
Normalizer()
[19]:
# Rescale each row (sample) independently; the squares of each displayed row
# sum to 1, i.e. every row ends up with unit Euclidean length.
t.transform(X)
[19]:
array([[0.18257419, 0.36514837, 0.54772256, 0.73029674],
[0.37904902, 0.45485883, 0.53066863, 0.60647843],
[0.42616235, 0.47351372, 0.5208651 , 0.56821647]])
11.9. One-hot encoder
[20]:
# OneHotEncoder demo on mixed data: a string column and an integer column.
from sklearn.preprocessing import OneHotEncoder
X = [['boy', 2],
['girl', 1],
['boy', 3],
['girl', 4]]
# handle_unknown='ignore' is set but not exercised in this example, because
# only the data used for fitting is transformed afterwards.
t = OneHotEncoder(handle_unknown='ignore')
# fit() is the last expression, so the cell displays the fitted estimator.
t.fit(X)
[20]:
OneHotEncoder(handle_unknown='ignore')
[21]:
# One-hot encode X: six indicator columns, two for the first feature
# ('boy', 'girl') and four for the second (1, 2, 3, 4).
# The encoder's result is converted with .toarray() so that `o` is a regular
# numpy.ndarray. The previous .todense() call produced a numpy.matrix, which
# NumPy discourages and which current scikit-learn input validation rejects
# when `o` is passed to inverse_transform in the next cell. The values are
# identical; only the displayed wrapper changes from matrix([...]) to
# array([...]).
o = t.transform(X).toarray()
o
[21]:
matrix([[1., 0., 0., 1., 0., 0.],
[0., 1., 1., 0., 0., 0.],
[1., 0., 0., 0., 1., 0.],
[0., 1., 0., 0., 0., 1.]])
[22]:
# Round trip: decode the indicator rows in `o` back to the original
# (string, number) pairs; the displayed result is an object-dtype array.
t.inverse_transform(o)
[22]:
array([['boy', 2],
['girl', 1],
['boy', 3],
['girl', 4]], dtype=object)
11.10. Ordinal encoder
[23]:
# OrdinalEncoder demo on mixed data: a string column and an integer column.
from sklearn.preprocessing import OrdinalEncoder
X = [['boy', 2],
['girl', 1],
['boy', 3],
['girl', 4]]
t = OrdinalEncoder()
# fit() is the last expression, so the cell displays the fitted estimator.
t.fit(X)
[23]:
OrdinalEncoder()
[24]:
# Replace every category by a per-column code; in the displayed result
# 'boy' -> 0, 'girl' -> 1 and the numbers 1..4 -> 0..3.
o = t.transform(X)
o
[24]:
array([[0., 1.],
[1., 0.],
[0., 2.],
[1., 3.]])
[25]:
# Round trip: decode the codes in `o` back to the original categories; the
# displayed result is an object-dtype array.
t.inverse_transform(o)
[25]:
array([['boy', 2],
['girl', 1],
['boy', 3],
['girl', 4]], dtype=object)
11.11. Polynomial features
[26]:
# PolynomialFeatures demo: expand two input features with the positional
# argument 2.
from sklearn.preprocessing import PolynomialFeatures
X = [[0, 1],
[2, 3],
[4, 5]]
t = PolynomialFeatures(2)
# fit() is the last expression, so the cell displays the fitted estimator
# (shown as PolynomialFeatures() in the output).
t.fit(X)
[26]:
PolynomialFeatures()
[27]:
# Expand each row [a, b] into six columns; the displayed values correspond
# to [1, a, b, a*a, a*b, b*b] (e.g. [2, 3] -> [1, 2, 3, 4, 6, 9]).
t.transform(X)
[27]:
array([[ 1., 0., 1., 0., 0., 1.],
[ 1., 2., 3., 4., 6., 9.],
[ 1., 4., 5., 16., 20., 25.]])
11.12. Power transformer
[28]:
# PowerTransformer demo with the default settings on a small 3x2 matrix.
from sklearn.preprocessing import PowerTransformer
X = [[0, 1],
[2, 3],
[4, 5]]
t = PowerTransformer()
# fit() is the last expression, so the cell displays the fitted estimator.
t.fit(X)
[28]:
PowerTransformer()
[29]:
# Apply the fitted transformation column by column; `o` is kept for the
# round trip in the next cell.
o = t.transform(X)
o
[29]:
array([[-1.28608295, -1.26755013],
[ 0.13363692, 0.09064754],
[ 1.15244602, 1.17690259]])
[30]:
# Round trip: invert the transformation to recover the original values.
t.inverse_transform(o)
[30]:
array([[0., 1.],
[2., 3.],
[4., 5.]])
11.13. Robust scaler
[31]:
# RobustScaler demo on a 3x3 float matrix. Centering is switched off via
# with_centering=False.
from sklearn.preprocessing import RobustScaler
X = [[1.,-2.,2.],
[ -2.,1.,3.],
[ 4.,1.,-2.]]
t = RobustScaler(with_centering=False)
# fit() is the last expression, so the cell displays the fitted estimator.
t.fit(X)
[31]:
RobustScaler(with_centering=False)
[32]:
# Scale each column by a per-column factor; in the displayed result the
# columns are divided by 3, 1.5 and 2.5 respectively and the signs of the
# inputs are preserved.
o = t.transform(X)
o
[32]:
array([[ 0.33333333, -1.33333333, 0.8 ],
[-0.66666667, 0.66666667, 1.2 ],
[ 1.33333333, 0.66666667, -0.8 ]])
[33]:
# Round trip: undo the scaling to recover the original values.
t.inverse_transform(o)
[33]:
array([[ 1., -2., 2.],
[-2., 1., 3.],
[ 4., 1., -2.]])
11.14. Standard scaler
[34]:
# StandardScaler demo: two identical columns, each holding two 0s and two 1s.
from sklearn.preprocessing import StandardScaler
X = [[0, 0],
[0, 0],
[1, 1],
[1, 1]]
t = StandardScaler()
# fit() is the last expression, so the cell displays the fitted estimator.
t.fit(X)
[34]:
StandardScaler()
[35]:
# Standardise each column; in the displayed result the 0s become -1 and the
# 1s become 1, so each column has mean 0 and unit spread.
o = t.transform(X)
o
[35]:
array([[-1., -1.],
[-1., -1.],
[ 1., 1.],
[ 1., 1.]])
[36]:
# Round trip: undo the standardisation to recover the original values.
t.inverse_transform(o)
[36]:
array([[0., 0.],
[0., 0.],
[1., 1.],
[1., 1.]])
11.15. Column transformation
ColumnTransformer
can be used with a list of transformers to transform a data set.
[37]:
# ColumnTransformer demo on a NumPy array: a different function is applied
# to each column, with the columns selected by integer position.
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
import numpy as np
# Fixed seed so the random sample is reproducible.
np.random.seed(37)
# Ten draws from a normal distribution (mean 10, std 1) arranged as 5 rows x 2 columns.
X = np.random.normal(10, 1, 10).reshape((5, 2))
# Each entry is (name, transformer, columns): np.log is applied to column 0
# and np.round to column 1.
preprocessor = ColumnTransformer([
('log', FunctionTransformer(np.log), [0]),
('round', FunctionTransformer(np.round), [1])
])
# Display the raw data and the transformed data side by side as a tuple.
X, preprocessor.fit_transform(X)
[37]:
(array([[ 9.94553639, 10.67430807],
[10.34664703, 8.69965383],
[11.51851188, 10.98982371],
[10.2776809 , 9.55141065],
[10.96196624, 9.17242136]]),
array([[ 2.29712385, 11. ],
[ 2.33666251, 9. ],
[ 2.44395547, 11. ],
[ 2.32997464, 10. ],
[ 2.39443167, 9. ]]))
A Pandas dataframe can also be transformed with ColumnTransformer;
the columns are then selected by name instead of by position.
[38]:
# Same transformation as on the raw array, but on a DataFrame. This cell
# reuses X, np, FunctionTransformer and ColumnTransformer from the previous
# cell.
import pandas as pd
df = pd.DataFrame(X, columns=['x1', 'x2'])
# Columns are selected by label here: np.log on 'x1', np.round on 'x2'.
preprocessor = ColumnTransformer([
('log', FunctionTransformer(np.log), ['x1']),
('round', FunctionTransformer(np.round), ['x2'])
])
# The displayed result is a plain array, not a DataFrame.
preprocessor.fit_transform(df)
[38]:
array([[ 2.29712385, 11. ],
[ 2.33666251, 9. ],
[ 2.44395547, 11. ],
[ 2.32997464, 10. ],
[ 2.39443167, 9. ]])