11. Data Preprocessing

11.1. Binarizer

[1]:
from sklearn.preprocessing import Binarizer

# Single-feature samples alternating between a positive and a negative value.
X = [[1.0], [-1.0], [1.0], [-1.0]]

# fit() returns the estimator itself, so construction and fitting are chained.
t = Binarizer().fit(X)

# Positive entries come out as 1 and negative entries as 0.
t.transform(X)
[1]:
array([[1.],
       [0.],
       [1.],
       [0.]])

11.2. K-bins discretizer

[2]:
from sklearn.preprocessing import KBinsDiscretizer

# Two mirrored features, each spanning the range [-2, 2].
X = [
    [-2.0, 2.0],
    [-1.0, 1.0],
    [0.0, 0.0],
    [1.0, -1.0],
    [2.0, -2.0],
]

# Three bins per feature; every value is replaced by the index of its bin.
t = KBinsDiscretizer(strategy='uniform', encode='ordinal', n_bins=3).fit(X)
t.transform(X)
[2]:
array([[0., 2.],
       [0., 2.],
       [1., 1.],
       [2., 0.],
       [2., 0.]])

11.3. Label binarizer

[3]:
from sklearn.preprocessing import LabelBinarizer

# Labels to encode; three distinct classes occur (0, 1 and 2).
X = [0, 0, 2, 2, 1, 1]

t = LabelBinarizer()
# Fit on the explicit list of classes rather than on X itself.
t.fit([0, 1, 2])
[3]:
LabelBinarizer()
[4]:
# One indicator column per class: each row holds a single 1 in the column
# that corresponds to its label.
o = t.transform(X)
o
[4]:
array([[1, 0, 0],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 1, 0],
       [0, 1, 0]])
[5]:
# Map the indicator rows back to the original class labels.
t.inverse_transform(o)
[5]:
array([0, 0, 2, 2, 1, 1])

11.4. Label encoder

[6]:
from sklearn.preprocessing import LabelEncoder

# The three distinct label values the encoder should learn.
X = [99, 100, 200]

t = LabelEncoder()
t.fit(X)
[6]:
LabelEncoder()
[7]:
# Each label is replaced by an integer code: 99 -> 0, 100 -> 1, 200 -> 2.
o = t.transform([99, 99, 100, 200, 100, 200])
o
[7]:
array([0, 0, 1, 2, 1, 2])
[8]:
# Recover the original labels from the integer codes.
t.inverse_transform(o)
[8]:
array([ 99,  99, 100, 200, 100, 200])

11.5. Multi-label binarizer

[9]:
from sklearn.preprocessing import MultiLabelBinarizer

# Each sample carries its own collection of labels; four distinct labels
# (3, 4, 5 and 6) occur overall.
X = [[3, 4], [5], [6]]

t = MultiLabelBinarizer()
t.fit(X)
[9]:
MultiLabelBinarizer()
[10]:
# One column per distinct label; a row has a 1 for every label its sample
# contains, so the first row has two 1s.
o = t.transform(X)
o
[10]:
array([[1, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 0, 1]])
[11]:
# Indicator rows are decoded back into tuples of labels.
t.inverse_transform(o)
[11]:
[(3, 4), (5,), (6,)]

11.6. Maximum absolute scaler

[12]:
from sklearn.preprocessing import MaxAbsScaler

# Column-wise maximum absolute values of this matrix are 2, 1 and 2.
X = [[0, -1, 0],
     [1, 0, -1],
     [2, 1, 2]]

t = MaxAbsScaler()
t.fit(X)
[12]:
MaxAbsScaler()
[13]:
# Each column is divided by its maximum absolute value, so every result
# lies in [-1, 1].
o = t.transform(X)
o
[13]:
array([[ 0. , -1. ,  0. ],
       [ 0.5,  0. , -0.5],
       [ 1. ,  1. ,  1. ]])
[14]:
# Undo the scaling; the original values come back as floats.
t.inverse_transform(o)
[14]:
array([[ 0., -1.,  0.],
       [ 1.,  0., -1.],
       [ 2.,  1.,  2.]])

11.7. Min-max scaler

[15]:
from sklearn.preprocessing import MinMaxScaler

# The three columns span [0, 2], [0, 10] and [-2, 0] respectively.
X = [[0, 10, 0],
     [1, 5, -1],
     [2, 0, -2]]

t = MinMaxScaler()
t.fit(X)
[15]:
MinMaxScaler()
[16]:
# Each column is rescaled so that its minimum maps to 0 and its maximum to 1.
o = t.transform(X)
o
[16]:
array([[0. , 1. , 1. ],
       [0.5, 0.5, 0.5],
       [1. , 0. , 0. ]])
[17]:
# Undo the rescaling; the original values come back as floats.
t.inverse_transform(o)
[17]:
array([[ 0., 10.,  0.],
       [ 1.,  5., -1.],
       [ 2.,  0., -2.]])

11.8. Normalizer

[18]:
from sklearn.preprocessing import Normalizer

# Three samples (rows) with four features each.
X = [[1, 2, 3, 4],
     [5, 6, 7, 8],
     [9, 10, 11, 12]]

t = Normalizer()
t.fit(X)
[18]:
Normalizer()
[19]:
# Every row is rescaled to unit Euclidean length; e.g. the first row becomes
# [1, 2, 3, 4] / sqrt(30).
t.transform(X)
[19]:
array([[0.18257419, 0.36514837, 0.54772256, 0.73029674],
       [0.37904902, 0.45485883, 0.53066863, 0.60647843],
       [0.42616235, 0.47351372, 0.5208651 , 0.56821647]])

11.9. One-hot encoder

[20]:
from sklearn.preprocessing import OneHotEncoder

# Two categorical columns: a string column with two categories and an
# integer column with four.
X = [['boy', 2],
     ['girl', 1],
     ['boy', 3],
     ['girl', 4]]

# handle_unknown='ignore' governs categories not seen during fit; none occur
# in this example.
t = OneHotEncoder(handle_unknown='ignore')
t.fit(X)
[20]:
OneHotEncoder(handle_unknown='ignore')
[21]:
o = t.transform(X).todense()
o
[21]:
matrix([[1., 0., 0., 1., 0., 0.],
        [0., 1., 1., 0., 0., 0.],
        [1., 0., 0., 0., 1., 0.],
        [0., 1., 0., 0., 0., 1.]])
[22]:
# Decode the indicator columns back into the original category pairs.
t.inverse_transform(o)
[22]:
array([['boy', 2],
       ['girl', 1],
       ['boy', 3],
       ['girl', 4]], dtype=object)

11.10. Ordinal encoder

[23]:
from sklearn.preprocessing import OrdinalEncoder

# Two categorical columns: a string column with two categories and an
# integer column with four.
X = [['boy', 2],
     ['girl', 1],
     ['boy', 3],
     ['girl', 4]]

t = OrdinalEncoder()
t.fit(X)
[23]:
OrdinalEncoder()
[24]:
# Each category is replaced by a per-column integer code:
# 'boy' -> 0, 'girl' -> 1 in the first column and 1..4 -> 0..3 in the second.
o = t.transform(X)
o
[24]:
array([[0., 1.],
       [1., 0.],
       [0., 2.],
       [1., 3.]])
[25]:
# Decode the integer codes back into the original category pairs.
t.inverse_transform(o)
[25]:
array([['boy', 2],
       ['girl', 1],
       ['boy', 3],
       ['girl', 4]], dtype=object)

11.11. Polynomial features

[26]:
from sklearn.preprocessing import PolynomialFeatures

# Three samples with two features each; call a row [a, b].
X = [[0, 1],
     [2, 3],
     [4, 5]]

# PolynomialFeatures(2) requests polynomial terms up to degree 2.
t = PolynomialFeatures(2)
t.fit(X)
[26]:
PolynomialFeatures()
[27]:
# For an input row [a, b] the output columns are 1, a, b, a^2, a*b, b^2.
t.transform(X)
[27]:
array([[ 1.,  0.,  1.,  0.,  0.,  1.],
       [ 1.,  2.,  3.,  4.,  6.,  9.],
       [ 1.,  4.,  5., 16., 20., 25.]])

11.12. Power transformer

[28]:
from sklearn.preprocessing import PowerTransformer

# Three samples with two features each.
X = [[0, 1],
     [2, 3],
     [4, 5]]

# Power transform with default settings.
t = PowerTransformer()
t.fit(X)
[28]:
PowerTransformer()
[29]:
# Apply the fitted power transform; the resulting columns are centred on zero.
o = t.transform(X)
o
[29]:
array([[-1.28608295, -1.26755013],
       [ 0.13363692,  0.09064754],
       [ 1.15244602,  1.17690259]])
[30]:
# Invert the transform to recover the original values.
t.inverse_transform(o)
[30]:
array([[0., 1.],
       [2., 3.],
       [4., 5.]])

11.13. Robust scaler

[31]:
from sklearn.preprocessing import RobustScaler

# Three samples with three features each.
X = [[1.,-2.,2.],
     [ -2.,1.,3.],
     [ 4.,1.,-2.]]

# Centering is switched off, so values are only scaled, not shifted.
t = RobustScaler(with_centering=False)
t.fit(X)
[31]:
RobustScaler(with_centering=False)
[32]:
# Each column is divided by a per-column scale; judging by the output these
# are 3, 1.5 and 2.5, which match the interquartile ranges of the columns.
o = t.transform(X)
o
[32]:
array([[ 0.33333333, -1.33333333,  0.8       ],
       [-0.66666667,  0.66666667,  1.2       ],
       [ 1.33333333,  0.66666667, -0.8       ]])
[33]:
# Undo the scaling to recover the original values.
t.inverse_transform(o)
[33]:
array([[ 1., -2.,  2.],
       [-2.,  1.,  3.],
       [ 4.,  1., -2.]])

11.14. Standard scaler

[34]:
from sklearn.preprocessing import StandardScaler

# Two identical columns, each with mean 0.5 and standard deviation 0.5.
X = [[0, 0],
     [0, 0],
     [1, 1],
     [1, 1]]

t = StandardScaler()
t.fit(X)
[34]:
StandardScaler()
[35]:
# Columns are shifted to zero mean and scaled to unit variance,
# so 0 maps to -1 and 1 maps to 1.
o = t.transform(X)
o
[35]:
array([[-1., -1.],
       [-1., -1.],
       [ 1.,  1.],
       [ 1.,  1.]])
[36]:
# Undo the standardisation to recover the original values.
t.inverse_transform(o)
[36]:
array([[0., 0.],
       [0., 0.],
       [1., 1.],
       [1., 1.]])

11.15. Column transformation

ColumnTransformer takes a list of (name, transformer, columns) tuples and applies each transformer to its own subset of the columns of a data set.

[37]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
import numpy as np

# Fix the seed so the sampled matrix is reproducible.
np.random.seed(37)

# Ten draws from a normal distribution (mean 10, standard deviation 1),
# arranged as five rows of two features.
X = np.random.normal(loc=10, scale=1, size=10).reshape(5, 2)

# Column 0 is log-transformed and column 1 is rounded; columns are selected
# by position.
preprocessor = ColumnTransformer(
    transformers=[
        ('log', FunctionTransformer(func=np.log), [0]),
        ('round', FunctionTransformer(func=np.round), [1]),
    ],
)

# Echo the raw matrix next to its transformed counterpart.
(X, preprocessor.fit_transform(X))
[37]:
(array([[ 9.94553639, 10.67430807],
        [10.34664703,  8.69965383],
        [11.51851188, 10.98982371],
        [10.2776809 ,  9.55141065],
        [10.96196624,  9.17242136]]),
 array([[ 2.29712385, 11.        ],
        [ 2.33666251,  9.        ],
        [ 2.44395547, 11.        ],
        [ 2.32997464, 10.        ],
        [ 2.39443167,  9.        ]]))

A pandas DataFrame can be transformed with ColumnTransformer as well; the columns are then selected by name instead of by position.

[38]:
import pandas as pd

# Wrap the matrix in a DataFrame so that its columns can be addressed by name.
df = pd.DataFrame(data=X, columns=['x1', 'x2'])

# Same log / round pipeline as before, with columns selected by label.
preprocessor = ColumnTransformer(
    transformers=[
        ('log', FunctionTransformer(func=np.log), ['x1']),
        ('round', FunctionTransformer(func=np.round), ['x2']),
    ],
)
preprocessor.fit_transform(df)
[38]:
array([[ 2.29712385, 11.        ],
       [ 2.33666251,  9.        ],
       [ 2.44395547, 11.        ],
       [ 2.32997464, 10.        ],
       [ 2.39443167,  9.        ]])