Avoid Scaling Binary Columns in scikit-learn StandardScaler
Solution 1:
You should create a custom scaler which ignores the last two columns while scaling.
import numpy as np
from sklearn.base import TransformerMixin
from sklearn.preprocessing import StandardScaler
class CustomScaler(TransformerMixin):
    """Standard-scale every column of a 2-D array except the last two.

    The last two columns are passed through unchanged and remain the last
    two columns of the transformed array.
    """

    def __init__(self):
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        """Fit the wrapped StandardScaler on all but the last two columns.

        ``y`` is optional so that ``fit_transform(X)`` (inherited from
        TransformerMixin) works without a target.
        """
        self.scaler.fit(X[:, :-2], y)
        return self

    def transform(self, X):
        """Return ``X`` with the leading columns scaled and the last two untouched."""
        X_head = self.scaler.transform(X[:, :-2])
        # np.concatenate expects the arrays as a single sequence argument;
        # passing them as separate positional arguments raises a TypeError.
        return np.concatenate((X_head, X[:, -2:]), axis=1)
Solution 2:
I'm posting code that I adapted from @miindlek's response in case it is helpful to others. I encountered an error when I didn't include BaseEstimator. Thank you again @miindlek. Below, bin_vars_index is an array of column indexes for the binary variables, and cont_vars_index is the same for the continuous variables that you want to scale.
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standard-scale the continuous columns and pass the binary ones through.

    Note: returns the feature matrix with the binary columns ordered first,
    followed by the scaled continuous columns.

    Parameters
    ----------
    bin_vars_index : column indexes of the binary variables (left unscaled).
    cont_vars_index : column indexes of the continuous variables to scale.
    copy, with_mean, with_std : forwarded to ``StandardScaler``.
    """

    def __init__(self, bin_vars_index, cont_vars_index, copy=True,
                 with_mean=True, with_std=True):
        # BaseEstimator.get_params() (used by clone() and repr()) reads every
        # constructor argument back from an attribute of the same name, so
        # each one has to be stored on self.
        self.bin_vars_index = bin_vars_index
        self.cont_vars_index = cont_vars_index
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # StandardScaler's options must be passed by keyword.
        self.scaler = StandardScaler(
            copy=copy, with_mean=with_mean, with_std=with_std
        )

    def fit(self, X, y=None):
        """Fit the wrapped scaler on the continuous columns only."""
        # Re-sync the wrapped scaler in case set_params() changed an option
        # after construction.
        self.scaler.set_params(
            copy=self.copy, with_mean=self.with_mean, with_std=self.with_std
        )
        self.scaler.fit(X[:, self.cont_vars_index], y)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``[binary columns | scaled continuous columns]``.

        ``y`` is accepted for signature compatibility and ignored;
        ``copy`` is forwarded to ``StandardScaler.transform``.
        """
        # StandardScaler.transform takes no target argument, so only the
        # data and the copy flag are passed.
        X_tail = self.scaler.transform(X[:, self.cont_vars_index], copy=copy)
        return np.concatenate((X[:, self.bin_vars_index], X_tail), axis=1)
Solution 3:
Your pipeline should change into:
from sklearn.preprocessing import StandardScaler,FunctionTransformer
from sklearn.pipeline import Pipeline,FeatureUnion
# Split the feature matrix into two branches, recombine them side by side
# (categorical block first, then the scaled numeric block), and pass the
# result to the final Ridge estimator.
# NOTE(review): cat_indices, num_indices and Ridge are not defined in this
# snippet -- they are presumably provided by the surrounding code.
pipeline = Pipeline([
    ('feature_processing', FeatureUnion([
        # categorical: select the columns and leave them unscaled
        ('categorical', FunctionTransformer(lambda X: X[:, cat_indices])),
        # numeric: select the columns, then standardize them
        ('numeric', Pipeline([
            ('select', FunctionTransformer(lambda X: X[:, num_indices])),
            ('scale', StandardScaler()),
        ])),
    ])),
    ('clf', Ridge()),
])
Solution 4:
I have adapted @J_C's code a bit to work with a pandas DataFrame. You pass the names of the columns that you want to scale, and the result keeps the original column order.
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standard-scale selected columns of a pandas DataFrame.

    Columns not listed in ``columns`` are passed through unchanged, and the
    returned DataFrame keeps the input's column order and index.

    Parameters
    ----------
    columns : list of column names to scale.
    copy, with_mean, with_std : forwarded to ``StandardScaler``.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # BaseEstimator.get_params() (used by clone() and repr()) reads every
        # constructor argument back from an attribute of the same name, so
        # each one has to be stored on self.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # StandardScaler's options must be passed by keyword.
        self.scaler = StandardScaler(
            copy=copy, with_mean=with_mean, with_std=with_std
        )

    def fit(self, X, y=None):
        """Fit the wrapped scaler on the selected columns only."""
        # Re-sync the wrapped scaler in case set_params() changed an option
        # after construction.
        self.scaler.set_params(
            copy=self.copy, with_mean=self.with_mean, with_std=self.with_std
        )
        self.scaler.fit(X[self.columns], y)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a DataFrame with ``self.columns`` scaled, others untouched.

        ``y`` is accepted for signature compatibility and ignored;
        ``copy`` is forwarded to ``StandardScaler.transform``.
        """
        init_col_order = X.columns
        # Reuse X's index: pd.concat(axis=1) aligns rows on the index, so a
        # fresh 0..n-1 index would misalign rows (and introduce NaNs)
        # whenever X does not have the default RangeIndex.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns], copy=copy),
            columns=self.columns,
            index=X.index,
        )
        # .ix was removed from pandas; .loc with a boolean column mask
        # selects the same unscaled columns.
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]
Usage:
# Scale only the two named columns; the remaining columns are returned as-is.
scale = CustomScaler(columns=['duration', 'num_operations'])
# fit_transform is inherited from TransformerMixin: fit, then transform.
# NOTE(review): churn_d is not defined in this snippet -- presumably the
# caller's pandas DataFrame containing the columns named above.
scaled = scale.fit_transform(churn_d)
Solution 5:
I found that the concatenation in @Vitaliy Grabovets's DataFrame version doesn't work properly unless you specify the index for X_scaled. So the relevant line now reads:
# index=X.index keeps the scaled frame row-aligned with X, so the later
# pd.concat(..., axis=1) lines the rows up correctly.
X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns, index=X.index)
Post a Comment for "Avoid Scaling Binary Columns in scikit-learn StandardScaler"