├── .gitignore
├── LICENSE
├── README.md
└── gauss_rank_scaler.py


/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | .ipynb_checkpoints/
3 | *.DS_Store
4 | __pycache__
5 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2019, Youngmin Kim
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | 1. Redistributions of source code must retain the above copyright notice, this
10 |    list of conditions and the following disclaimer.
11 | 
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 |    this list of conditions and the following disclaimer in the documentation
14 |    and/or other materials provided with the distribution.
15 | 
16 | 3. Neither the name of the copyright holder nor the names of its
17 |    contributors may be used to endorse or promote products derived from
18 |    this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Gauss Rank Scaler
 2 |   
 3 | A scikit-learn style transformer that scales numeric variables to normal distributions. 
 4 | 
 5 | Input normalization is very important for neural networks. Gauss Rank is an effective algorithm for converting the distributions of numeric variables to normal distributions. It is based on a rank transformation: the first step is to assign evenly spaced values between -1 and 1 to the sorted feature values, and the second step is to apply the inverse error function `erfinv` so that the result looks like a Gaussian.
 6 |   
 7 | ![](https://aldente0630.github.io/assets/gauss_rank_scaler3.png)
 8 |   
 9 | This generally works much better than Standard or Min Max Scaler.
10 |   
11 | ## Important Links
12 |   
13 | * [Interview of the Kaggle competition winner (Michael Jahrer)](https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/discussion/44629#250927)  
14 | * [Blog post introducing Gauss Rank's concept and simple implementation (Zygmunt Zając)](http://fastml.com/preparing-continuous-features-for-neural-networks-with-rankgauss)
15 |   
16 | ## Usage
17 | 
18 | `GaussRankScaler` is a scikit-learn compatible transformer that can be used in pipelines or existing scripts. Supported input formats include NumPy arrays and pandas DataFrames; the transformed output is always a NumPy array. Every column passed to the transformer is scaled independently.
19 | 
20 | ## Example
21 | 
22 | ```python
23 | from gauss_rank_scaler.gauss_rank_scaler import GaussRankScaler
24 | import pandas as pd
25 | from sklearn.datasets import load_boston
26 | %matplotlib inline
27 | 
28 | # prepare some data
29 | bunch = load_boston()
30 | df_X_train = pd.DataFrame(bunch.data[:250], columns=bunch.feature_names)
31 | df_X_test = pd.DataFrame(bunch.data[250:], columns=bunch.feature_names)
32 | 
33 | # plot histograms of two numeric variables
34 | _ = df_X_train[['CRIM', 'DIS']].hist()
35 | ```
36 | ![](https://aldente0630.github.io/assets/gauss_rank_scaler1.png)
37 | ```python
38 | # scale the numeric variables with Gauss Rank Scaler
39 | scaler = GaussRankScaler()
40 | df_X_new_train = scaler.fit_transform(df_X_train[['CRIM', 'DIS']])
41 | 
42 | # plot histograms of the scaled variables
43 | _ = pd.DataFrame(df_X_new_train, columns=['CRIM', 'DIS']).hist()
44 | ```
45 | ![](https://aldente0630.github.io/assets/gauss_rank_scaler2.png)
46 | ```python
47 | # scale test dataset with the fitted scaler
48 | df_X_new_test = scaler.transform(df_X_test[['CRIM', 'DIS']])
49 | ```
50 | 


--------------------------------------------------------------------------------
/gauss_rank_scaler.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from joblib import Parallel, delayed
  3 | from scipy.interpolate import interp1d
  4 | from scipy.special import erf, erfinv
  5 | from sklearn.base import BaseEstimator, TransformerMixin
  6 | from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted
  7 | 
  8 | 
  9 | class GaussRankScaler(BaseEstimator, TransformerMixin):
 10 |     """Transform features by scaling each feature to a normal distribution.
 11 |     Parameters
 12 |         ----------
 13 |         epsilon : float, optional, default 1e-4
 14 |             A small amount added to the lower bound or subtracted
 15 |             from the upper bound. This value prevents infinite number
 16 |             from occurring when applying the inverse error function.
 17 |         copy : boolean, optional, default True
 18 |             If False, try to avoid a copy and do inplace scaling instead.
 19 |             This is not guaranteed to always work inplace; e.g. if the data is
 20 |             not a NumPy array, a copy may still be returned.
 21 |         n_jobs : int or None, optional, default None
 22 |             Number of jobs to run in parallel.
 23 |             ``None`` means 1 and ``-1`` means using all processors.
 24 |         interp_kind : str or int, optional, default 'linear'
 25 |            Specifies the kind of interpolation as a string
 26 |             ('linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic',
 27 |             'previous', 'next', where 'zero', 'slinear', 'quadratic' and 'cubic'
 28 |             refer to a spline interpolation of zeroth, first, second or third
 29 |             order; 'previous' and 'next' simply return the previous or next value
 30 |             of the point) or as an integer specifying the order of the spline
 31 |             interpolator to use.
 32 |         interp_copy : bool, optional, default False
 33 |             If True, the interpolation function makes internal copies of x and y.
 34 |             If False, references to `x` and `y` are used.
 35 |         Attributes
 36 |         ----------
 37 |         interp_func_ : list
 38 |             The interpolation function for each feature in the training set.
 39 |     """
 40 | 
 41 |     def __init__(
 42 |         self,
 43 |         epsilon=1e-4,
 44 |         copy=True,
 45 |         n_jobs=None,
 46 |         interp_kind="linear",
 47 |         interp_copy=False,
 48 |     ):
 49 |         self.epsilon = epsilon
 50 |         self.copy = copy
 51 |         self.interp_kind = interp_kind
 52 |         self.interp_copy = interp_copy
 53 |         self.fill_value = "extrapolate"
 54 |         self.n_jobs = n_jobs
 55 |         self.bound = 1.0 - self.epsilon
 56 | 
 57 |     def fit(self, X, y=None):
 58 |         """Fit interpolation function to link rank with original data for future scaling
 59 |         Parameters
 60 |         ----------
 61 |         X : array-like, shape (n_samples, n_features)
 62 |             The data used to fit interpolation function for later scaling along the features axis.
 63 |         y
 64 |             Ignored
 65 |         """
 66 |         X = check_array(
 67 |             X, copy=self.copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite=True
 68 |         )
 69 | 
 70 |         self.interp_func_ = Parallel(n_jobs=self.n_jobs)(
 71 |             delayed(self._fit)(x) for x in X.T
 72 |         )
 73 |         return self
 74 | 
 75 |     def _fit(self, x):
 76 |         x = self.drop_duplicates(x)
 77 |         rank = np.argsort(np.argsort(x))
 78 |         factor = np.max(rank) / 2.0 * self.bound
 79 |         scaled_rank = np.clip(rank / factor - self.bound, -self.bound, self.bound)
 80 |         return interp1d(
 81 |             x,
 82 |             scaled_rank,
 83 |             kind=self.interp_kind,
 84 |             copy=self.interp_copy,
 85 |             fill_value=self.fill_value,
 86 |         )
 87 | 
 88 |     def transform(self, X, copy=None):
 89 |         """Scale the data with the Gauss Rank algorithm
 90 |         Parameters
 91 |         ----------
 92 |         X : array-like, shape (n_samples, n_features)
 93 |             The data used to scale along the features axis.
 94 |         copy : bool, optional (default: None)
 95 |             Copy the input X or not.
 96 |         """
 97 |         check_is_fitted(self, "interp_func_")
 98 | 
 99 |         copy = copy if copy is not None else self.copy
100 |         X = check_array(
101 |             X, copy=copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite=True
102 |         )
103 | 
104 |         X = np.array(
105 |             Parallel(n_jobs=self.n_jobs)(
106 |                 delayed(self._transform)(i, x) for i, x in enumerate(X.T)
107 |             )
108 |         ).T
109 |         return X
110 | 
111 |     def _transform(self, i, x):
112 |         clipped = np.clip(self.interp_func_[i](x), -self.bound, self.bound)
113 |         return erfinv(clipped)
114 | 
115 |     def inverse_transform(self, X, copy=None):
116 |         """Scale back the data to the original representation
117 |         Parameters
118 |         ----------
119 |         X : array-like, shape [n_samples, n_features]
120 |             The data used to scale along the features axis.
121 |         copy : bool, optional (default: None)
122 |             Copy the input X or not.
123 |         """
124 |         check_is_fitted(self, "interp_func_")
125 | 
126 |         copy = copy if copy is not None else self.copy
127 |         X = check_array(
128 |             X, copy=copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite=True
129 |         )
130 | 
131 |         X = np.array(
132 |             Parallel(n_jobs=self.n_jobs)(
133 |                 delayed(self._inverse_transform)(i, x) for i, x in enumerate(X.T)
134 |             )
135 |         ).T
136 |         return X
137 | 
138 |     def _inverse_transform(self, i, x):
139 |         inv_interp_func = interp1d(
140 |             self.interp_func_[i].y,
141 |             self.interp_func_[i].x,
142 |             kind=self.interp_kind,
143 |             copy=self.interp_copy,
144 |             fill_value=self.fill_value,
145 |         )
146 |         return inv_interp_func(erf(x))
147 | 
148 |     @staticmethod
149 |     def drop_duplicates(x):
150 |         is_unique = np.zeros_like(x, dtype=bool)
151 |         is_unique[np.unique(x, return_index=True)[1]] = True
152 |         return x[is_unique]
153 | 


--------------------------------------------------------------------------------