Scalers: Standard, MinMax, Robust
Libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.neighbors import KernelDensity
from scipy.interpolate import interp1d
Load data
# Load the Boston housing dataset object from scikit-learn.
# NOTE(review): load_boston appears to have been removed in scikit-learn 1.2 —
# confirm the installed version before re-running this notebook.
data = load_boston()
# Pull out the 'data' and 'target' entries; X is indexed as a 2-D array below.
X, y = data['data'], data['target']
# Display the feature names so column positions in X can be interpreted.
data['feature_names']
array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')
Scalers
# For every column of X, print its index and the number of distinct values it
# contains. The loop body must be indented; without the indent the cell does
# not parse.
for col in range(X.shape[1]):
    print(col, np.unique(X[:, col]).size)
0 504
1 26
2 76
3 2
4 81
5 446
6 356
7 412
8 9
9 66
10 46
11 357
12 455
# Work on an independent copy of the last column of X so X itself is untouched.
array = np.copy(X[:, -1])
# Box plot of the raw values.
plt.boxplot(array)
plt.show()
# KernelDensity is re-imported here; it is already imported at the top of the file.
from sklearn.neighbors import KernelDensity

# Gaussian kernel density estimate fitted on the selected column,
# reshaped to a single-column 2-D array.
kd = KernelDensity(kernel='gaussian', bandwidth=1.5, breadth_first=True)
kd.fit(array.reshape(-1, 1))
# Exponentiate the per-sample scores to obtain the density at each sample
# (presumably score_samples returns log-densities — see the scikit-learn docs).
array_density = np.exp(kd.score_samples(array.reshape(-1, 1)))
- Calculate the area under the histogram (used later to rescale the density curve to the histogram's count scale)
# Number of histogram bins, reused by every histogram in the sections below.
n_bins = 30
# Draw the histogram and keep the per-bin counts and the bin edges.
counts, bins, patchs = plt.hist(array, bins=n_bins, alpha=0.3)
# Area under the histogram: each bin contributes count * bin width.
area = sum(counts * np.diff(bins))

# Linear interpolator through the (sample value, estimated density) pairs.
f = interp1d(array, array_density, kind='linear')
# Evaluate the interpolated density on 1000 evenly spaced points over the data range.
x = np.linspace(array.min(), array.max(), num=1000)
fx = f(x)
Standard scaler
$\large \displaystyle X_{scaled} = \frac{X - \mu}{\sigma}$
# Fit StandardScaler on the column (as a single-column 2-D array) and transform it.
scaler = StandardScaler()
transformed = scaler.fit_transform(array[:, np.newaxis])

# 2x2 figure: left column shows the raw data, right column the scaled data;
# top row holds histograms, bottom row box plots.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))

# Raw data: histogram with the density curve scaled by the histogram area.
ax1.hist(array, bins=n_bins)
ax1.plot(x, fx * area, color='r', linewidth=2)
ax3.boxplot(array)

# Scaled data.
ax2.hist(transformed, bins=n_bins)
ax4.boxplot(transformed)

plt.show()
# Manual standardization: subtract the mean, then divide by the standard deviation.
mean = np.mean(array)
std = np.std(array)
# Shape the result as a single column so it lines up with the scaler's output.
transformed1 = ((array - mean) / std)[:, np.newaxis]
- Verify that the manual computation matches the scaler's output
# Compare the manual standardization with the scaler's output.
# np.allclose is used instead of np.alltrue(a == b): np.alltrue is no longer
# available in NumPy 2.0, and exact float equality can fail on harmless
# rounding differences between two equivalent computations.
np.allclose(transformed, transformed1)
True
MinMax scaler
$\large \displaystyle X_{scaled} = \frac{X - X_{min}}{X_{max} - X_{min}}$
# Fit MinMaxScaler on the column (as a single-column 2-D array) and transform it.
scaler = MinMaxScaler()
transformed = scaler.fit_transform(array[:, np.newaxis])

# 2x2 figure: left column shows the raw data, right column the scaled data;
# top row holds histograms, bottom row box plots.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))

# Raw data: histogram with the density curve scaled by the histogram area.
ax1.hist(array, bins=n_bins)
ax1.plot(x, fx * area, color='r', linewidth=2)
ax3.boxplot(array)

# Scaled data.
ax2.hist(transformed, bins=n_bins)
ax4.boxplot(transformed)

plt.show()
# Manual min-max scaling: shift by the minimum, then divide by the value range.
Min, Max = array.min(), array.max()
diff = Max - Min
# Shape the result as a single column so it lines up with the scaler's output.
transformed1 = ((array - Min) / diff)[:, np.newaxis]
- Verify that the manual computation matches the scaler's output
# Check that the manual min-max scaling agrees with the scaler's output
# up to floating-point tolerance.
np.allclose(transformed, transformed1)
True
Robust scaler
$\large \displaystyle X_{scaled} = \frac{X - \tilde{X}}{IQR} = \frac{X - Q_2}{Q_3 - Q_1}$
# Fit RobustScaler on the column (as a single-column 2-D array) and transform it.
scaler = RobustScaler()
transformed = scaler.fit_transform(array[:, np.newaxis])

# 2x2 figure: left column shows the raw data, right column the scaled data;
# top row holds histograms, bottom row box plots.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))

# Raw data: histogram with the density curve scaled by the histogram area.
ax1.hist(array, bins=n_bins)
ax1.plot(x, fx * area, color='r', linewidth=2)
ax3.boxplot(array)

# Scaled data.
ax2.hist(transformed, bins=n_bins)
ax4.boxplot(transformed)

plt.show()
# Manual robust scaling: centre on the median, then divide by the interquartile range.
median = np.median(array)
# First and third quartiles.
q1, q3 = np.quantile(array, q=[0.25, 0.75])
iqr = q3 - q1
# Shape the result as a single column so it lines up with the scaler's output.
transformed1 = ((array - median) / iqr)[:, np.newaxis]
- Verify that the manual computation matches the scaler's output
# Compare the manual robust scaling with the scaler's output.
# np.allclose is used instead of np.alltrue(a == b): np.alltrue is no longer
# available in NumPy 2.0, and exact float equality can fail on harmless
# rounding differences between two equivalent computations.
np.allclose(transformed, transformed1)
True
Leave a comment