Scalers: Standard, MinMax, Robust

1 minute read

Libraries

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.neighbors import KernelDensity
from scipy.interpolate import interp1d

Load data

# Fetch the Boston housing dataset bundled with scikit-learn.
# NOTE(review): load_boston was removed in scikit-learn 1.2, so this cell
# needs an older release -- confirm the pinned version.
data = load_boston()
# Pull the 2-D feature array and the target vector out in one step.
X, y = data['data'], data['target']
# Bare expression: the notebook echoes the feature names as the cell output.
data['feature_names']
array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

Scalers

# Print every column index next to its number of distinct values, which
# separates the near-continuous features from the categorical-looking ones.
for col, column_values in enumerate(X.T):
    print(col, np.unique(column_values).size)
0 504
1 26
2 76
3 2
4 81
5 446
6 356
7 412
8 9
9 66
10 46
11 357
12 455
# Take an independent copy of the last feature column (LSTAT in the listing
# above) so later cells never touch X itself.
array = np.array(X[:, -1], copy=True)
# Box plot of the raw values: a first look at spread and outliers.
plt.boxplot(array)
plt.show()

output_10_0

from sklearn.neighbors import KernelDensity

# Column-vector view of the samples; it is fed to both fit and score_samples.
samples = array[:, np.newaxis]
# Gaussian kernel density estimate of the feature's distribution.
kd = KernelDensity(kernel='gaussian', bandwidth=1.5, breadth_first=True)
kd.fit(samples)
# score_samples yields log-densities, hence the exponentiation.
array_density = np.exp(kd.score_samples(samples))
  • Calculate the area under the histogram
# Number of histogram bins, reused by every plot further down.
n_bins = 30
# Draw the histogram and keep the per-bin counts and the bin edges.
counts, bins, _patches = plt.hist(array, bins=n_bins, alpha=.3)
# Area under the histogram = sum of (bar height * bar width); it is used
# later to rescale the density curve onto the count axis.
bin_widths = np.diff(bins)
area = sum(counts * bin_widths)

output_13_0

# Piecewise-linear interpolant through the (sample, density) pairs.
f = interp1d(array, array_density, kind='linear')
# 1000 evenly spaced points spanning exactly the observed range, so the
# interpolant is only evaluated inside its own domain.
x_lo, x_hi = array.min(), array.max()
x = np.linspace(x_lo, x_hi, num=1000)
# Density curve sampled on that regular grid.
fx = f(x)

Standard scaler

$X_{\text{scaled}} = \dfrac{X - \mu}{\sigma}$

# Standardise the feature; the scaler is given a single-column 2-D array.
scaler = StandardScaler()
transformed = scaler.fit_transform(array[:, np.newaxis])

# 2x2 grid: raw data in the left column, scaled data in the right column.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))
(ax1, ax2), (ax3, ax4) = axes

# Top row: histograms. The density curve is multiplied by the histogram
# area so that it shares the count axis of the raw histogram.
ax1.hist(array, bins=n_bins)
ax1.plot(x, fx * area, color='r', linewidth=2)
ax2.hist(transformed, bins=n_bins)

# Bottom row: box plots of the same two series.
ax3.boxplot(array)
ax4.boxplot(transformed)
plt.show()

output_18_0

# Reproduce the standard scaling by hand: centre on the mean and divide by
# the standard deviation (array.std() uses NumPy's default ddof=0).
mean, std = array.mean(), array.std()
# Reshape to one column so it lines up with the scaler's 2-D output.
transformed1 = ((array - mean) / std)[:, np.newaxis]
  • Verify the manual computation against the scaler's output
# Compare within floating-point tolerance rather than with exact `==`:
# np.alltrue was removed in NumPy 2.0, and bit-for-bit equality between two
# differently ordered float computations is not guaranteed. This also
# matches the np.allclose check used in the MinMax section.
np.allclose(transformed, transformed1)
True

MinMax scaler

$X_{\text{scaled}} = \dfrac{X - \min(X)}{\max(X) - \min(X)}$

# Min-max scale the feature; the scaler is given a single-column 2-D array.
scaler = MinMaxScaler()
transformed = scaler.fit_transform(array[:, np.newaxis])

# 2x2 grid: raw data in the left column, scaled data in the right column.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))
(ax1, ax2), (ax3, ax4) = axes

# Top row: histograms. The density curve is multiplied by the histogram
# area so that it shares the count axis of the raw histogram.
ax1.hist(array, bins=n_bins)
ax1.plot(x, fx * area, color='r', linewidth=2)
ax2.hist(transformed, bins=n_bins)

# Bottom row: box plots of the same two series.
ax3.boxplot(array)
ax4.boxplot(transformed)
plt.show()

output_25_0

# Reproduce the min-max scaling by hand: shift by the minimum, then divide
# by the observed range.
Min, Max = array.min(), array.max()
diff = Max - Min
# Reshape to one column so it lines up with the scaler's 2-D output.
transformed1 = ((array - Min) / diff)[:, np.newaxis]
  • Verify the manual computation against the scaler's output
# Element-wise comparison within floating-point tolerance between the
# scaler's output and the hand-computed values; the notebook echoes the
# boolean result.
np.allclose(transformed, transformed1)
True

Robust scaler

$X_{\text{scaled}} = \dfrac{X - \tilde{X}}{\mathrm{IQR}} = \dfrac{X - Q_2}{Q_3 - Q_1}$

# Robust-scale the feature; the scaler is given a single-column 2-D array.
scaler = RobustScaler()
transformed = scaler.fit_transform(array[:, np.newaxis])

# 2x2 grid: raw data in the left column, scaled data in the right column.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))
(ax1, ax2), (ax3, ax4) = axes

# Top row: histograms. The density curve is multiplied by the histogram
# area so that it shares the count axis of the raw histogram.
ax1.hist(array, bins=n_bins)
ax1.plot(x, fx * area, color='r', linewidth=2)
ax2.hist(transformed, bins=n_bins)

# Bottom row: box plots of the same two series.
ax3.boxplot(array)
ax4.boxplot(transformed)
plt.show()

output_32_0

# Reproduce the robust scaling by hand: centre on the median (Q2) and
# divide by the interquartile range Q3 - Q1.
median = np.median(array)
q1, q3 = np.quantile(array, [.25, .75])
iqr = q3 - q1
# Reshape to one column so it lines up with the scaler's 2-D output.
transformed1 = ((array - median) / iqr)[:, np.newaxis]
  • Verify the manual computation against the scaler's output
# Compare within floating-point tolerance rather than with exact `==`:
# np.alltrue was removed in NumPy 2.0, and bit-for-bit equality between two
# differently ordered float computations is not guaranteed. This also
# matches the np.allclose check used in the MinMax section.
np.allclose(transformed, transformed1)
True

Leave a comment