├── .gitignore ├── LICENSE ├── README.md └── gauss_rank_scaler.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | .ipynb_checkpoints/ 3 | *.DS_Store 4 | __pycache__ 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, Youngmin Kim 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Gauss Rank Scaler 2 | 3 | A scikit-learn style transformer that scales numeric variables to normal distributions. 4 | 5 | Input normalization for neural networks is very important. Gauss Rank is an effective algorithm for converting numeric variable distributions to normals. It is based on rank transformation. The first step is to assign a spacing between -1 and 1 to the sorted features, then apply the inverse of error function `erfinv` to make it look like a Gaussian. 6 | 7 | ![](https://aldente0630.github.io/assets/gauss_rank_scaler3.png) 8 | 9 | This generally works much better than Standard or Min Max Scaler. 10 | 11 | ## Important Links 12 | 13 | * [Interview of the Kaggle competition winner (Michael Jahrer)](https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/discussion/44629#250927) 14 | * [Blog post introducing Gauss Rank's concept and simple implementation (Zygmunt Zając)](http://fastml.com/preparing-continuous-features-for-neural-networks-with-rankgauss) 15 | 16 | ## Usage 17 | 18 | Gauss Rank Scaler is a fully compatible sklearn transformer that can be used in pipelines or existing scripts. Supported input formats include numpy arrays and pandas dataframes. All columns passed to the transformer are properly scaled. 
import numpy as np
from joblib import Parallel, delayed
from scipy.interpolate import interp1d
from scipy.special import erf, erfinv
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted


class GaussRankScaler(BaseEstimator, TransformerMixin):
    """Transform features by scaling each feature to a normal distribution.

    For every feature the distinct training values are ranked, the ranks are
    spread evenly over ``[-bound, bound]`` (with ``bound = 1 - epsilon``) and
    an interpolation function from value to scaled rank is stored.
    ``transform`` evaluates that function, clips the result to
    ``[-bound, bound]`` and applies the inverse error function ``erfinv``.

    Parameters
    ----------
    epsilon : float, optional, default 1e-4
        A small amount added to the lower bound or subtracted
        from the upper bound. This value prevents infinite numbers
        from occurring when applying the inverse error function.
    copy : boolean, optional, default True
        If False, try to avoid a copy and do inplace scaling instead.
        This is not guaranteed to always work inplace; e.g. if the data is
        not a NumPy array, a copy may still be returned.
    n_jobs : int or None, optional, default None
        Number of jobs to run in parallel.
        ``None`` means 1 and ``-1`` means using all processors.
    interp_kind : str or int, optional, default 'linear'
        Specifies the kind of interpolation as a string
        ('linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic',
        'previous', 'next', where 'zero', 'slinear', 'quadratic' and 'cubic'
        refer to a spline interpolation of zeroth, first, second or third
        order; 'previous' and 'next' simply return the previous or next value
        of the point) or as an integer specifying the order of the spline
        interpolator to use.
    interp_copy : bool, optional, default False
        If True, the interpolation function makes internal copies of x and y.
        If False, references to `x` and `y` are used.

    Attributes
    ----------
    interp_func_ : list
        The interpolation function for each feature in the training set.
    bound : float
        ``1.0 - epsilon``; the largest absolute scaled rank passed to
        ``erfinv``. Derived from ``epsilon`` on every access.
    fill_value : str
        Passed to ``scipy.interpolate.interp1d``; always ``"extrapolate"``.

    Examples
    --------
    >>> import numpy as np
    >>> X_train = np.random.RandomState(0).lognormal(size=(250, 2))
    >>> X_test = np.random.RandomState(1).lognormal(size=(50, 2))
    >>> scaler = GaussRankScaler()
    >>> X_train_scaled = scaler.fit_transform(X_train)
    >>> X_test_scaled = scaler.transform(X_test)
    >>> X_train_restored = scaler.inverse_transform(X_train_scaled)
    """

    # Not a tunable parameter, so it lives on the class instead of being
    # assigned in __init__ (which should only store constructor arguments).
    fill_value = "extrapolate"

    def __init__(
        self,
        epsilon=1e-4,
        copy=True,
        n_jobs=None,
        interp_kind="linear",
        interp_copy=False,
    ):
        self.epsilon = epsilon
        self.copy = copy
        self.n_jobs = n_jobs
        self.interp_kind = interp_kind
        self.interp_copy = interp_copy

    @property
    def bound(self):
        """Largest absolute scaled rank, ``1.0 - epsilon``.

        Computed on access so that it follows ``set_params(epsilon=...)``
        instead of keeping the value seen at construction time.
        """
        return 1.0 - self.epsilon

    def fit(self, X, y=None):
        """Fit interpolation function to link rank with original data for future scaling

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The data used to fit interpolation function for later scaling along the features axis.
        y
            Ignored

        Returns
        -------
        self : GaussRankScaler
            The fitted scaler.

        Raises
        ------
        ValueError
            If a feature contains fewer than two distinct values.
        """
        # check_array rejects NaN/inf by default, so no explicit flag is needed.
        X = check_array(X, copy=self.copy, estimator=self, dtype=FLOAT_DTYPES)

        # One interpolation function per column.
        self.interp_func_ = Parallel(n_jobs=self.n_jobs)(
            delayed(self._fit)(x) for x in X.T
        )
        return self

    def _fit(self, x):
        """Build the value -> scaled-rank interpolation function for one feature."""
        # np.unique returns the distinct values in ascending order, so the
        # position of a value in `values` is its rank.
        values = np.unique(x)
        max_rank = values.size - 1
        if max_rank < 1:
            # A single distinct value has no rank range; scaling it would
            # divide zero by zero.
            raise ValueError(
                "GaussRankScaler requires at least two distinct values "
                "per feature."
            )

        bound = self.bound
        rank = np.arange(values.size, dtype=float)
        # Spread the ranks symmetrically and strictly increasingly over
        # [-bound, bound]: the lowest rank maps to -bound, the highest to
        # +bound.
        scaled_rank = bound * (2.0 * rank / max_rank - 1.0)
        # Defensive clip against floating-point rounding at the ends.
        scaled_rank = np.clip(scaled_rank, -bound, bound)
        return interp1d(
            values,
            scaled_rank,
            kind=self.interp_kind,
            copy=self.interp_copy,
            fill_value=self.fill_value,
        )

    def _check_feature_count(self, X):
        """Raise ValueError when X has a different column count than at fit time."""
        n_fitted = len(self.interp_func_)
        if X.shape[1] != n_fitted:
            raise ValueError(
                "X has %d features, but GaussRankScaler was fitted with %d "
                "features." % (X.shape[1], n_fitted)
            )

    def transform(self, X, copy=None):
        """Scale the data with the Gauss Rank algorithm

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The data used to scale along the features axis.
        copy : bool, optional (default: None)
            Copy the input X or not. ``None`` falls back to ``self.copy``.

        Returns
        -------
        X_new : ndarray, shape (n_samples, n_features)
            The scaled data.

        Raises
        ------
        ValueError
            If the number of features differs from the one seen in ``fit``.
        """
        check_is_fitted(self, "interp_func_")

        copy = self.copy if copy is None else copy
        X = check_array(X, copy=copy, estimator=self, dtype=FLOAT_DTYPES)
        self._check_feature_count(X)

        columns = Parallel(n_jobs=self.n_jobs)(
            delayed(self._transform)(i, x) for i, x in enumerate(X.T)
        )
        # The per-feature results are stacked as rows; transpose back to
        # (n_samples, n_features).
        return np.array(columns).T

    def _transform(self, i, x):
        """Scale the values of feature ``i``."""
        # Values outside the fitted range are extrapolated by the
        # interpolator; clipping keeps erfinv away from +/-1 (infinite output).
        clipped = np.clip(self.interp_func_[i](x), -self.bound, self.bound)
        return erfinv(clipped)

    def inverse_transform(self, X, copy=None):
        """Scale back the data to the original representation

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data used to scale along the features axis.
        copy : bool, optional (default: None)
            Copy the input X or not. ``None`` falls back to ``self.copy``.

        Returns
        -------
        X_orig : ndarray, shape (n_samples, n_features)
            The data mapped back through the fitted interpolation grids.

        Raises
        ------
        ValueError
            If the number of features differs from the one seen in ``fit``.
        """
        check_is_fitted(self, "interp_func_")

        copy = self.copy if copy is None else copy
        X = check_array(X, copy=copy, estimator=self, dtype=FLOAT_DTYPES)
        self._check_feature_count(X)

        columns = Parallel(n_jobs=self.n_jobs)(
            delayed(self._inverse_transform)(i, x) for i, x in enumerate(X.T)
        )
        return np.array(columns).T

    def _inverse_transform(self, i, x):
        """Map scaled values of feature ``i`` back to the original scale."""
        fitted = self.interp_func_[i]
        # Swap the fitted grid: interpolate from scaled rank back to value.
        inv_interp_func = interp1d(
            fitted.y,
            fitted.x,
            kind=self.interp_kind,
            copy=self.interp_copy,
            fill_value=self.fill_value,
        )
        # erf undoes the erfinv applied in _transform.
        return inv_interp_func(erf(x))

    @staticmethod
    def drop_duplicates(x):
        """Return ``x`` without repeated values, keeping first occurrences in order.

        Kept as a public helper; fitting itself uses ``np.unique`` directly.
        """
        is_unique = np.zeros_like(x, dtype=bool)
        # return_index gives the index of the first occurrence of each value.
        is_unique[np.unique(x, return_index=True)[1]] = True
        return x[is_unique]