# 11. Data Preprocessing

## 11.1. Binarizer

[1]:

from sklearn.preprocessing import Binarizer

# A single feature whose values alternate in sign.
data = [
    [1.0],
    [-1.0],
    [1.0],
    [-1.0],
]

# Binarizer maps values above the threshold (default 0.0) to 1, all others to 0.
binarizer = Binarizer()
binarizer.fit(data)
binarizer.transform(data)

[1]:

array([[1.],
[0.],
[1.],
[0.]])


## 11.2. K-bins discretizer

[2]:

from sklearn.preprocessing import KBinsDiscretizer

# Two features sweeping the same range in opposite directions.
data = [
    [-2.0, 2.0],
    [-1.0, 1.0],
    [0.0, 0.0],
    [1.0, -1.0],
    [2.0, -2.0],
]

# Cut each feature into 3 equal-width bins ('uniform') and encode each
# value as its bin index (0, 1, 2) via 'ordinal' encoding.
discretizer = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
discretizer.fit(data)
discretizer.transform(data)

[2]:

array([[0., 2.],
[0., 2.],
[1., 1.],
[2., 0.],
[2., 0.]])


## 11.3. Label binarizer

[3]:

from sklearn.preprocessing import LabelBinarizer

# Labels to binarize; reused by the transform/inverse_transform cells below.
X = [0, 0, 2, 2, 1, 1]

# Fit on the complete set of classes so every class gets a fixed column.
t = LabelBinarizer()
t.fit([0, 1, 2])

[3]:

LabelBinarizer()

[4]:

# One-hot encode each label: column j is 1 when the label equals class j.
o = t.transform(X)
o

[4]:

array([[1, 0, 0],
[1, 0, 0],
[0, 0, 1],
[0, 0, 1],
[0, 1, 0],
[0, 1, 0]])

[5]:

t.inverse_transform(o)

[5]:

array([0, 0, 2, 2, 1, 1])


## 11.4. Label encoder

[6]:

from sklearn.preprocessing import LabelEncoder

# Arbitrary label values; they need not be contiguous integers.
X = [99, 100, 200]

# Fit assigns each distinct label an integer code in sorted order.
t = LabelEncoder()
t.fit(X)

[6]:

LabelEncoder()

[7]:

# Encode labels as their learned integer codes (99 -> 0, 100 -> 1, 200 -> 2).
o = t.transform([99, 99, 100, 200, 100, 200])
o

[7]:

array([0, 0, 1, 2, 1, 2])

[8]:

t.inverse_transform(o)

[8]:

array([ 99,  99, 100, 200, 100, 200])


## 11.5. Multi-label binarizer

[9]:

from sklearn.preprocessing import MultiLabelBinarizer

# Each sample is a collection of labels (the first sample has two).
X = [[3, 4], [5], [6]]

# Fit learns the full label set seen across all samples: {3, 4, 5, 6}.
t = MultiLabelBinarizer()
t.fit(X)

[9]:

MultiLabelBinarizer()

[10]:

# Indicator matrix: one column per label, 1 where the sample contains it.
o = t.transform(X)
o

[10]:

array([[1, 1, 0, 0],
[0, 0, 1, 0],
[0, 0, 0, 1]])

[11]:

t.inverse_transform(o)

[11]:

[(3, 4), (5,), (6,)]


## 11.6. Maximum absolute scaler

[12]:

from sklearn.preprocessing import MaxAbsScaler

# Three features with different magnitudes and signs.
X = [[0, -1, 0],
[1, 0, -1],
[2, 1, 2]]

# Scale each feature by its maximum absolute value, into [-1, 1].
t = MaxAbsScaler()
t.fit(X)

[12]:

MaxAbsScaler()

[13]:

# Divide each column by its max absolute value learned during fit.
o = t.transform(X)
o

[13]:

array([[ 0. , -1. ,  0. ],
[ 0.5,  0. , -0.5],
[ 1. ,  1. ,  1. ]])

[14]:

t.inverse_transform(o)

[14]:

array([[ 0., -1.,  0.],
[ 1.,  0., -1.],
[ 2.,  1.,  2.]])


## 11.7. Min-max scaler

[15]:

from sklearn.preprocessing import MinMaxScaler

# Three features with different ranges (one of them decreasing).
X = [[0, 10, 0],
[1, 5, -1],
[2, 0, -2]]

# Rescale each feature linearly to the default [0, 1] range.
t = MinMaxScaler()
t.fit(X)

[15]:

MinMaxScaler()

[16]:

# Each column's min maps to 0 and its max maps to 1.
o = t.transform(X)
o

[16]:

array([[0. , 1. , 1. ],
[0.5, 0.5, 0.5],
[1. , 0. , 0. ]])

[17]:

t.inverse_transform(o)

[17]:

array([[ 0., 10.,  0.],
[ 1.,  5., -1.],
[ 2.,  0., -2.]])


## 11.8. Normalizer

[18]:

from sklearn.preprocessing import Normalizer

X = [[1, 2, 3, 4],
[5, 6, 7, 8],
[9, 10, 11, 12]]

# Scale each sample (row) to unit L2 norm; Normalizer is stateless,
# so fit() learns nothing but keeps the estimator API consistent.
t = Normalizer()
t.fit(X)

[18]:

Normalizer()

[19]:

t.transform(X)

[19]:

array([[0.18257419, 0.36514837, 0.54772256, 0.73029674],
[0.37904902, 0.45485883, 0.53066863, 0.60647843],
[0.42616235, 0.47351372, 0.5208651 , 0.56821647]])


## 11.9. One-hot encoder

[20]:

from sklearn.preprocessing import OneHotEncoder

# Mixed categorical features: a string column and a numeric column.
X = [['boy', 2],
['girl', 1],
['boy', 3],
['girl', 4]]

# One output column per category per feature; categories unseen at fit
# time are encoded as all zeros at transform time instead of raising.
t = OneHotEncoder(handle_unknown='ignore')
t.fit(X)

[20]:

OneHotEncoder(handle_unknown='ignore')

[21]:

# transform() returns a sparse matrix; densify it for display.
# Use .toarray() (plain ndarray) instead of .todense(), which returns
# the legacy np.matrix type that NumPy has deprecated.
o = t.transform(X).toarray()
o

[21]:

matrix([[1., 0., 0., 1., 0., 0.],
[0., 1., 1., 0., 0., 0.],
[1., 0., 0., 0., 1., 0.],
[0., 1., 0., 0., 0., 1.]])

[22]:

t.inverse_transform(o)

[22]:

array([['boy', 2],
['girl', 1],
['boy', 3],
['girl', 4]], dtype=object)


## 11.10. Ordinal encoder

[23]:

from sklearn.preprocessing import OrdinalEncoder

# Same mixed categorical data as the one-hot example above.
X = [['boy', 2],
['girl', 1],
['boy', 3],
['girl', 4]]

# Encode each feature's categories as ordinal integers (sorted order).
t = OrdinalEncoder()
t.fit(X)

[23]:

OrdinalEncoder()

[24]:

# Each category is replaced by its integer index within its feature.
o = t.transform(X)
o

[24]:

array([[0., 1.],
[1., 0.],
[0., 2.],
[1., 3.]])

[25]:

t.inverse_transform(o)

[25]:

array([['boy', 2],
['girl', 1],
['boy', 3],
['girl', 4]], dtype=object)


## 11.11. Polynomial features

[26]:

from sklearn.preprocessing import PolynomialFeatures

X = [[0, 1],
[2, 3],
[4, 5]]

# degree=2: for features (a, b) the output columns are
# [1, a, b, a^2, a*b, b^2].
t = PolynomialFeatures(2)
t.fit(X)

[26]:

PolynomialFeatures()

[27]:

t.transform(X)

[27]:

array([[ 1.,  0.,  1.,  0.,  0.,  1.],
[ 1.,  2.,  3.,  4.,  6.,  9.],
[ 1.,  4.,  5., 16., 20., 25.]])


## 11.12. Power transformer

[28]:

from sklearn.preprocessing import PowerTransformer

X = [[0, 1],
[2, 3],
[4, 5]]

# Power transform each feature toward a Gaussian shape; the default
# method also standardizes to zero mean and unit variance.
t = PowerTransformer()
t.fit(X)

[28]:

PowerTransformer()

[29]:

# Apply the fitted per-feature power transform.
o = t.transform(X)
o

[29]:

array([[-1.28608295, -1.26755013],
[ 0.13363692,  0.09064754],
[ 1.15244602,  1.17690259]])

[30]:

t.inverse_transform(o)

[30]:

array([[0., 1.],
[2., 3.],
[4., 5.]])


## 11.13. Robust scaler

[31]:

from sklearn.preprocessing import RobustScaler

X = [[1.,-2.,2.],
[ -2.,1.,3.],
[ 4.,1.,-2.]]

# Scale each feature by its interquartile range; with centering turned
# off only the IQR division is applied (no median subtraction).
t = RobustScaler(with_centering=False)
t.fit(X)

[31]:

RobustScaler(with_centering=False)

[32]:

# Divide each column by its interquartile range learned during fit.
o = t.transform(X)
o

[32]:

array([[ 0.33333333, -1.33333333,  0.8       ],
[-0.66666667,  0.66666667,  1.2       ],
[ 1.33333333,  0.66666667, -0.8       ]])

[33]:

t.inverse_transform(o)

[33]:

array([[ 1., -2.,  2.],
[-2.,  1.,  3.],
[ 4.,  1., -2.]])


## 11.14. Standard scaler

[34]:

from sklearn.preprocessing import StandardScaler

# Two identical binary features; mean 0.5 and equal spread in each.
X = [[0, 0],
[0, 0],
[1, 1],
[1, 1]]

# Standardize each feature to zero mean and unit variance.
t = StandardScaler()
t.fit(X)

[34]:

StandardScaler()

[35]:

# Subtract the per-feature mean and divide by the per-feature std.
o = t.transform(X)
o

[35]:

array([[-1., -1.],
[-1., -1.],
[ 1.,  1.],
[ 1.,  1.]])

[36]:

t.inverse_transform(o)

[36]:

array([[0., 0.],
[0., 0.],
[1., 1.],
[1., 1.]])


## 11.15. Column transformation

ColumnTransformer applies a list of transformers, each to a specified subset of columns, to transform a data set.

[37]:

from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
import numpy as np

# Fix the seed so the sampled data (and the outputs below) are reproducible.
np.random.seed(37)

# 10 draws from N(10, 1), reshaped to 5 samples x 2 features.
X = np.random.normal(10, 1, 10).reshape((5, 2))

# Apply np.log to column 0 and np.round to column 1, selected by index.
preprocessor = ColumnTransformer([
('log', FunctionTransformer(np.log), [0]),
('round', FunctionTransformer(np.round), [1])
])

# Show the raw data alongside the transformed result.
X, preprocessor.fit_transform(X)

[37]:

(array([[ 9.94553639, 10.67430807],
[10.34664703,  8.69965383],
[11.51851188, 10.98982371],
[10.2776809 ,  9.55141065],
[10.96196624,  9.17242136]]),
array([[ 2.29712385, 11.        ],
[ 2.33666251,  9.        ],
[ 2.44395547, 11.        ],
[ 2.32997464, 10.        ],
[ 2.39443167,  9.        ]]))


A pandas DataFrame can also be transformed with ColumnTransformer, selecting the columns by name.

[38]:

import pandas as pd

# Wrap the same data in a DataFrame with named columns.
df = pd.DataFrame(X, columns=['x1', 'x2'])

# With a DataFrame, columns can be selected by name instead of index.
preprocessor = ColumnTransformer([
('log', FunctionTransformer(np.log), ['x1']),
('round', FunctionTransformer(np.round), ['x2'])
])
preprocessor.fit_transform(df)

[38]:

array([[ 2.29712385, 11.        ],
[ 2.33666251,  9.        ],
[ 2.44395547, 11.        ],
[ 2.32997464, 10.        ],
[ 2.39443167,  9.        ]])