├── .gitignore ├── MANIFEST.in ├── README.rst ├── pyquantregForest ├── __init__.py ├── pyquantregForest.py └── tests │ ├── __init__.py │ └── test_quantile.py ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | 4 | # User-specific files 5 | *.suo 6 | *.user 7 | *.userosscache 8 | *.sln.docstates 9 | 10 | # Compiled files 11 | *.pyd 12 | *.pyc 13 | 14 | # Visual Studio project files 15 | *.pyproj 16 | *.sln 17 | *.suo 18 | 19 | # Files from Google Drive 20 | *.ini 21 | 22 | # Setuptools files 23 | *.egg-info 24 | 25 | # User-specific files (MonoDevelop/Xamarin Studio) 26 | *.userprefs 27 | 28 | # Build results 29 | [Dd]ebug/ 30 | [Dd]ebugPublic/ 31 | [Rr]elease/ 32 | [Rr]eleases/ 33 | x64/ 34 | x86/ 35 | build/ 36 | bld/ 37 | [Bb]in/ 38 | [Oo]bj/ 39 | 40 | # Visual Studio 2015 cache/options directory 41 | .vs/ 42 | 43 | # MSTest test Results 44 | [Tt]est[Rr]esult*/ 45 | [Bb]uild[Ll]og.* 46 | 47 | # NUNIT 48 | *.VisualState.xml 49 | TestResult.xml 50 | 51 | # Build Results of an ATL Project 52 | [Dd]ebugPS/ 53 | [Rr]eleasePS/ 54 | dlldata.c 55 | 56 | # DNX 57 | project.lock.json 58 | artifacts/ 59 | 60 | *_i.c 61 | *_p.c 62 | *_i.h 63 | *.ilk 64 | *.meta 65 | *.obj 66 | *.pch 67 | *.pdb 68 | *.pgc 69 | *.pgd 70 | *.rsp 71 | *.sbr 72 | *.tlb 73 | *.tli 74 | *.tlh 75 | *.tmp 76 | *.tmp_proj 77 | *.log 78 | *.vspscc 79 | *.vssscc 80 | .builds 81 | *.pidb 82 | *.svclog 83 | *.scc 84 | 85 | # Chutzpah Test files 86 | _Chutzpah* 87 | 88 | # Visual C++ cache files 89 | ipch/ 90 | *.aps 91 | *.ncb 92 | *.opensdf 93 | *.sdf 94 | *.cachefile 95 | 96 | # Visual Studio profiler 97 | *.psess 98 | *.vsp 99 | *.vspx 100 | 101 | # TFS 2012 Local Workspace 102 | $tf/ 103 | 104 | # Guidance Automation Toolkit 105 | *.gpState 106 | 107 | # ReSharper is a .NET coding add-in 108 | _ReSharper*/ 109 
| *.[Rr]e[Ss]harper 110 | *.DotSettings.user 111 | 112 | # JustCode is a .NET coding add-in 113 | .JustCode 114 | 115 | # TeamCity is a build add-in 116 | _TeamCity* 117 | 118 | # DotCover is a Code Coverage Tool 119 | *.dotCover 120 | 121 | # NCrunch 122 | _NCrunch_* 123 | .*crunch*.local.xml 124 | 125 | # MightyMoose 126 | *.mm.* 127 | AutoTest.Net/ 128 | 129 | # Web workbench (sass) 130 | .sass-cache/ 131 | 132 | # Installshield output folder 133 | [Ee]xpress/ 134 | 135 | # DocProject is a documentation generator add-in 136 | DocProject/buildhelp/ 137 | DocProject/Help/*.HxT 138 | DocProject/Help/*.HxC 139 | DocProject/Help/*.hhc 140 | DocProject/Help/*.hhk 141 | DocProject/Help/*.hhp 142 | DocProject/Help/Html2 143 | DocProject/Help/html 144 | 145 | # Click-Once directory 146 | publish/ 147 | 148 | # Publish Web Output 149 | *.[Pp]ublish.xml 150 | *.azurePubxml 151 | ## TODO: Comment the next line if you want to checkin your 152 | ## web deploy settings but do note that will include unencrypted 153 | ## passwords 154 | #*.pubxml 155 | 156 | *.publishproj 157 | 158 | # NuGet Packages 159 | *.nupkg 160 | # The packages folder can be ignored because of Package Restore 161 | **/packages/* 162 | # except build/, which is used as an MSBuild target. 
163 | !**/packages/build/ 164 | # Uncomment if necessary however generally it will be regenerated when needed 165 | #!**/packages/repositories.config 166 | 167 | # Windows Azure Build Output 168 | csx/ 169 | *.build.csdef 170 | 171 | # Windows Store app package directory 172 | AppPackages/ 173 | 174 | # Visual Studio cache files 175 | # files ending in .cache can be ignored 176 | *.[Cc]ache 177 | # but keep track of directories ending in .cache 178 | !*.[Cc]ache/ 179 | 180 | # Others 181 | ClientBin/ 182 | [Ss]tyle[Cc]op.* 183 | ~$* 184 | *~ 185 | *.dbmdl 186 | *.dbproj.schemaview 187 | *.pfx 188 | *.publishsettings 189 | node_modules/ 190 | orleans.codegen.cs 191 | 192 | # RIA/Silverlight projects 193 | Generated_Code/ 194 | 195 | # Backup & report files from converting an old project file 196 | # to a newer Visual Studio version. Backup files are not needed, 197 | # because we have git ;-) 198 | _UpgradeReport_Files/ 199 | Backup*/ 200 | UpgradeLog*.XML 201 | UpgradeLog*.htm 202 | 203 | # SQL Server files 204 | *.mdf 205 | *.ldf 206 | 207 | # Business Intelligence projects 208 | *.rdl.data 209 | *.bim.layout 210 | *.bim_*.settings 211 | 212 | # Microsoft Fakes 213 | FakesAssemblies/ 214 | 215 | # Node.js Tools for Visual Studio 216 | .ntvs_analysis.dat 217 | 218 | # Visual Studio 6 build log 219 | *.plg 220 | 221 | # Visual Studio 6 workspace options file 222 | *.opt 223 | 224 | # LightSwitch generated files 225 | GeneratedArtifacts/ 226 | _Pvt_Extensions/ 227 | ModelManifest.xml 228 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # Include the license file 2 | include LICENSE.txt 3 | 4 | # Include the data files 5 | recursive-include data * 6 | 7 | # If using Python 2.6 or less, then have to include package data, even though 8 | # it's already declared in setup.py 9 | # include sample/*.dat 10 | 
-------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | A sample Python project 2 | ======================= 3 | 4 | A sample project that exists as an aid to the `Python Packaging User Guide 5 | <https://packaging.python.org>`_'s `Tutorial on Packaging and Distributing 6 | Projects <https://packaging.python.org/en/latest/distributing.html>`_. 7 | 8 | This project does not aim to cover best practices for Python project 9 | development as a whole. For example, it does not provide guidance or tool 10 | recommendations for version control, documentation, or testing. 11 | 12 | ---- 13 | 14 | This is the README file for the project. 15 | 16 | The file should use UTF-8 encoding and be written using reStructuredText. It 17 | will be used to generate the project webpage on PyPI and will be displayed as 18 | the project homepage on common code-hosting services, and should be written for 19 | that purpose. 20 | 21 | Typical contents for this file would include an overview of the project, basic 22 | usage examples, etc. Generally, including the project changelog in here is not 23 | a good idea, although a simple "What's New" section for the most recent version 24 | may be appropriate. 
# -*- coding: utf-8 -*-
"""Quantile regression forest built on scikit-learn's RandomForestRegressor."""
from sklearn.ensemble import RandomForestRegressor
try:
    # This private module path is not available in recent scikit-learn
    # releases. The names are not used below; the import is only kept (and
    # guarded) so the module namespace stays the same where it still exists.
    from sklearn.ensemble.forest import BaseForest, ForestRegressor
except ImportError:
    pass
import numpy as np
from scipy.optimize import fmin_cobyla, fmin_slsqp
from pathos.multiprocessing import ProcessingPool
from pandas import DataFrame, Series
import pylab as plt

__all__ = ["QuantileForest"]

# Finite-difference step used by the SQP optimiser when the starting point is
# exactly 0 (a relative step would then be 0 as well).
_DEFAULT_SQP_EPSILON = np.sqrt(np.finfo(float).eps)

_DIM_ERROR = ("X dimension is different from forest dimension : "
              "%d (X) != %d (forest)")


# =============================================================================
# Base regression forest
# =============================================================================

class QuantileForest(RandomForestRegressor):
    """Quantile regression random forest.

    The forest is built by scikit-learn. After fitting, the leaf reached by
    every training sample in every tree is stored, which lets the class weight
    the training outputs for a new point ``X`` and derive the conditional CDF
    and conditional quantiles of the output given ``X``.

    The class does not define its own constructor: all constructor parameters
    are those of ``sklearn.ensemble.RandomForestRegressor``. The training data
    is given to :meth:`fit`, and the optimisation method used to locate a
    quantile is the ``opt_method`` argument of :meth:`compute_quantile`
    (``"Cobyla"`` or ``"SQP"``).
    """

    def fit(self, X, y):
        """Fit the forest and store what is needed to compute quantiles.

        Parameters
        ----------
        X : array-like, shape (n_sample,) or (n_sample, input_dim)
            Input sample. A 1-D vector is treated as ``n_sample`` points of a
            one-dimensional input.

        y : array-like, shape (n_sample,)
            Output sample.

        Returns
        -------
        self : QuantileForest
        """
        X = np.asarray(X)
        if X.ndim == 1:
            # A plain vector is a one-dimensional input; the forest needs 2-D.
            X = X.reshape(-1, 1)
        self._n_sample, self._input_dim = X.shape

        # Bootstrapping is switched off: the original author noted that the
        # per-tree sample indices (needed for out-of-bag estimates) are not
        # exposed by scikit-learn. TODO: find a way to get OOB indices.
        # NOTE(review): without bootstrap, tree diversity depends on the
        # other forest parameters (e.g. max_features) -- confirm intended.
        self.bootstrap = False

        RandomForestRegressor.fit(self, X, y)

        # Training data, kept because the quantiles are weighted statistics
        # of the training outputs.
        self._input_sample = DataFrame(X)
        self._output_sample = Series(np.asarray(y).ravel())

        # Leaf index of each training sample in each tree,
        # shape (n_sample, n_tree).
        self._sample_nodes = DataFrame(self.apply(X))

        return self

    def _check_input(self, X):
        """Return ``X`` as a 2-D array compatible with the fitted forest.

        Parameters
        ----------
        X : scalar or array-like
            Either ``n`` points of a one-dimensional forest (scalar or 1-D),
            a single point given as a vector of ``input_dim`` values, or a
            2-D array of shape (n, input_dim).

        Returns
        -------
        X : ndarray, shape (n, input_dim)
            A reshaped view/copy; the caller's array is never resized.
        n : int
            Number of points.

        Raises
        ------
        ValueError
            If the shape of ``X`` cannot be matched to the forest dimension.
        """
        X = np.atleast_1d(np.asarray(X))
        dim = self._input_dim

        if X.ndim == 1:
            if dim == 1:
                X = X.reshape(-1, 1)
            elif X.size == dim:
                # One sample given as a flat vector.
                X = X.reshape(1, dim)
            else:
                raise ValueError(_DIM_ERROR % (1, dim))
        elif X.ndim == 2:
            if X.shape[1] != dim:
                if X.size == dim:
                    # One sample given as a column vector.
                    X = X.reshape(1, dim)
                else:
                    raise ValueError(_DIM_ERROR % (X.shape[1], dim))
        else:
            raise ValueError(_DIM_ERROR % (X.ndim, dim))

        return X, X.shape[0]

    def _compute_weight(self, X_nodes_k, i_tree):
        """Weight of every training sample for one query point.

        Parameters
        ----------
        X_nodes_k : ndarray
            Leaf index of the query point in each considered tree.

        i_tree : int
            Negative to use all the trees, otherwise the index of the single
            tree to use.

        Returns
        -------
        weight : ndarray, shape (n_sample,)
            For each tree a training sample weighs ``1 / (leaf size)`` when it
            shares the leaf of the query point and 0 otherwise; with all the
            trees the weights are averaged over the trees.
        """
        if i_tree < 0:
            sample_node = self._sample_nodes.values
        else:
            sample_node = self._sample_nodes.values[:, i_tree]
        same_leaf = (sample_node == X_nodes_k)

        # Number of training samples in the leaf of the query point
        leaf_size = same_leaf.sum(axis=0)
        weight = same_leaf.astype(float) / leaf_size

        if i_tree < 0:
            return weight.mean(axis=1)
        return weight

    def get_nodes(self, X, i_tree):
        """Leaf indices reached by the points ``X``.

        Parameters
        ----------
        X : scalar or array-like
            Query points (see :meth:`_check_input` for accepted shapes).

        i_tree : int
            Negative to use all the trees, otherwise the index of one tree.

        Returns
        -------
        X_nodes : ndarray, shape (n_tree, n_points) or (1, n_points)
            One column per query point.
        """
        X, n_points = self._check_input(X)

        if i_tree < 0:
            return self.apply(X).transpose()

        nodes = self.estimators_[i_tree].apply(X)
        return np.asarray(nodes).reshape(1, n_points)

    def compute_CDF(self, X, y, i_tree=-1):
        """Conditional CDF of the output, evaluated at ``y`` given ``X``.

        Parameters
        ----------
        X : scalar or array-like
            Points at which the output is conditioned.

        y : scalar or array-like, shape (n_y,)
            Values at which the CDF is evaluated.

        i_tree : int, optional (default=-1)
            Negative to use all the trees, otherwise the index of one tree.

        Returns
        -------
        CDFs : ndarray, shape (n_y, n_X)
            ``CDFs[j, k]`` is the weighted proportion of training outputs that
            are lower than or equal to ``y[j]`` for the k-th point of ``X``.
        """
        X, n_X = self._check_input(X)
        # Column vector so that the comparison below broadcasts against the
        # training outputs; reshape() leaves the caller's array untouched.
        y = np.atleast_1d(np.asarray(y)).reshape(-1, 1)
        outputs = self._output_sample.values

        CDFs = np.zeros((y.shape[0], n_X))
        X_nodes = self.get_nodes(X, i_tree)

        for k in range(n_X):
            weight = self._compute_weight(X_nodes[:, k], i_tree)
            support = weight > 0
            below = outputs[support] <= y
            CDFs[:, k] = (weight[support] * below).sum(axis=1)
        return CDFs

    def compute_quantile(self, X, alpha, do_optim=True, verbose=False,
                         doSaveCDF=False, i_tree=-1, opt_method="Cobyla"):
        """Compute the conditional alpha-quantile.

        Parameters
        ----------
        X : scalar or array-like
            Points at which the output is conditioned.

        alpha : float or array-like of float
            Probability level(s), between 0 and 1.

        do_optim : bool, optional (default=True)
            If True, each quantile minimises the weighted check function.
            If False, it is read from the weighted empirical CDF.

        verbose : bool, optional (default=False)
            Passed as ``disp`` to the optimiser.

        doSaveCDF : bool, optional (default=False)
            If True, the weighted empirical CDF of each point is stored in
            ``self._CDF`` (one column per point, rows ordered like the sorted
            training outputs).

        i_tree : int, optional (default=-1)
            Negative to use all the trees, otherwise the index of one tree.

        opt_method : {"Cobyla", "SQP"}, optional (default="Cobyla")
            Optimiser used when ``do_optim`` is True.

        Returns
        -------
        quantiles : float or ndarray
            A scalar for one point and one level, a 1-D array when there is
            either one point or one level, otherwise an array of shape
            (n_points, n_alphas).

        Raises
        ------
        ValueError
            If ``opt_method`` is unknown or ``X`` has an incompatible shape.
        """
        if do_optim and opt_method not in ("Cobyla", "SQP"):
            raise ValueError("Unknown optimisation method %s" % opt_method)

        alpha = np.atleast_1d(np.asarray(alpha, dtype=float)).ravel()
        X, n_quantiles = self._check_input(X)
        n_alphas = alpha.size

        quantiles = np.zeros((n_quantiles, n_alphas))

        need_cdf = doSaveCDF or not do_optim
        if need_cdf:
            self._prepare_CDF()
            below = self._infYY.values
            y_sorted = self._yCDF.values
        if doSaveCDF:
            self._CDF = np.empty((y_sorted.size, n_quantiles))

        outputs = self._output_sample.values
        X_nodes = self.get_nodes(X, i_tree)

        for k in range(n_quantiles):
            weight = self._compute_weight(X_nodes[:, k], i_tree)

            if need_cdf:
                cdf = below.dot(weight)
                if doSaveCDF:
                    self._CDF[:, k] = cdf

            if do_optim:
                # The starting points are the percentiles of the outputs
                # that have a non-zero weight.
                y0 = np.percentile(outputs[weight != 0], alpha * 100.)
                for i, alphai in enumerate(alpha):
                    quantiles[k, i] = self._optimise_quantile(
                        y0[i], weight, alphai, opt_method, verbose)
            else:
                for i, alphai in enumerate(alpha):
                    reached = np.flatnonzero(cdf >= alphai)
                    # Rounding can leave the last CDF value slightly under
                    # alpha (e.g. alpha = 1): fall back on the largest output.
                    if reached.size:
                        quantiles[k, i] = y_sorted[reached[0]]
                    else:
                        quantiles[k, i] = y_sorted[-1]

        if n_quantiles == 1 and n_alphas == 1:
            return quantiles[0][0]
        elif n_quantiles == 1 or n_alphas == 1:
            return quantiles.ravel()
        else:
            return quantiles

    def _optimise_quantile(self, y0, weight, alpha, opt_method, verbose):
        """Minimise the weighted check function from the start value ``y0``."""
        if opt_method == "Cobyla":
            result = fmin_cobyla(self._min_function, y0, [],
                                 args=(weight, alpha), disp=verbose)
        elif opt_method == "SQP":
            # Step relative to the start value, unless that value is 0.
            epsilon = 1.E-1 * abs(y0)
            if epsilon == 0.:
                epsilon = _DEFAULT_SQP_EPSILON
            result = fmin_slsqp(self._min_function, y0,
                                args=(weight, alpha), disp=verbose,
                                epsilon=epsilon)
        else:
            raise ValueError("Unknown optimisation method %s" % opt_method)
        # The optimisers hand back an array; keep its single value.
        return np.ravel(result)[0]

    def _min_function(self, yi, w, alpha):
        """Objective minimised to find a conditional quantile.

        Parameters
        ----------
        yi : float or ndarray of one value
            Current candidate for the quantile.

        w : ndarray, shape (n_sample,)
            Weight of each training output.

        alpha : float
            Probability level.

        Returns
        -------
        float
            Sum of the check function applied to the weighted residuals
            between the training outputs and ``yi``.
        """
        # TODO: think about using only the non-null weights to increase
        # performance.
        u = w * (self._output_sample.values - yi)
        return check_function(u, alpha).sum()

# ==============================================================================
# Setters
# ==============================================================================
    def _prepare_CDF(self):
        """Prepare the data used to read quantiles from the empirical CDF.

        Sets ``self._yCDF`` (the training outputs sorted increasingly) and
        ``self._infYY``, a boolean table whose entry (j, i) tells whether the
        i-th training output is lower than or equal to the j-th sorted value.
        """
        self._yCDF = self._output_sample.sort_values(inplace=False)

        outputs = self._output_sample.values.reshape(self._n_sample, 1)
        thresholds = self._yCDF.values.reshape(1, self._yCDF.size)
        self._infYY = DataFrame(outputs <= thresholds).T

    def _computeImportanceOfTree(self, alpha, i):
        """Permutation importance of each input for the tree ``i``.

        Parameters
        ----------
        alpha : float
            Probability level of the quantile used to measure the error.

        i : int
            Index of the tree.

        Returns
        -------
        ndarray, shape (input_dim,)
            Increase of the mean check-function error on the out-of-bag
            samples of the tree when each input is shuffled in turn.

        Raises
        ------
        AttributeError
            If no out-of-bag indices (``self._oobID``) are available, which is
            the case after :meth:`fit` since bootstrapping is disabled there.
        """
        oob_ids = getattr(self, "_oobID", None)
        if oob_ids is None:
            raise AttributeError("Out-of-bag indices (_oobID) are not "
                                 "available: fit() disables the bootstrap.")

        oob = oob_ids[i]
        X_oob = self._input_sample.values[oob, :]
        y_oob = self._output_sample.values[oob]
        q_oob = self.compute_quantile(X_oob, alpha, i_tree=i)
        base_error = check_function(y_oob - q_oob, alpha).mean()

        perm_error = np.empty(self._input_dim)
        for j in range(self._input_dim):
            X_perm = np.array(X_oob)  # copy, the shuffle is done in place
            np.random.shuffle(X_perm[:, j])
            q_perm = self.compute_quantile(X_perm, alpha, i_tree=i)
            perm_error[j] = check_function(y_oob - q_perm, alpha).mean()

        return perm_error - base_error

    def compute_importance(self, alpha):
        """Permutation importance of each input, averaged over the trees.

        Parameters
        ----------
        alpha : float
            Probability level of the quantile used to measure the error.

        Returns
        -------
        ndarray, shape (input_dim,)

        Raises
        ------
        AttributeError
            If no out-of-bag indices (``self._oobID``) are available.
        """
        if getattr(self, "_oobID", None) is None:
            raise AttributeError("Out-of-bag indices (_oobID) are not "
                                 "available: fit() disables the bootstrap.")

        n_trees = len(self.estimators_)
        n_jobs = self.n_jobs
        # n_jobs may be None or negative for scikit-learn; in that case let
        # the pool pick its own default size.
        if n_jobs is not None and n_jobs > 0:
            pool = ProcessingPool(n_jobs)
        else:
            pool = ProcessingPool()
        errors = pool.map(self._computeImportanceOfTree,
                          [alpha] * n_trees, range(n_trees))
        return np.array(errors).mean(axis=0)


def check_function(u, alpha):
    """Check (pinball) function used in quantile regression.

    Parameters
    ----------
    u : float or ndarray
        Residual(s).

    alpha : float
        Probability level.

    Returns
    -------
    float or ndarray
        ``u * alpha`` where ``u`` is non-negative and ``u * (alpha - 1)``
        where ``u`` is negative.
    """
    return u * (alpha - (u < 0.) * 1.)
"""Smoke script: fit a QuantileForest on noisy 1-D data, then compute and plot
a conditional quantile and a conditional CDF."""
import numpy as np
import matplotlib.pyplot as plt

from pyquantregForest import QuantileForest


def sin_func(X, c=1):
    """Return ``c * sin(X)``."""
    X = np.asarray(X)
    return c * np.sin(X)


def linear_func(X, c=1):
    """Return ``c * X``.

    This was previously a second definition of ``sin_func`` that silently
    replaced the sine version; it now has its own name.
    """
    X = np.asarray(X)
    return c * X


np.random.seed(0)

# Sample creation
dim = 1  # Dimension
n_sample = 200
xmin, xmax = 0., 5.
X = np.linspace(xmin, xmax, n_sample).reshape((n_sample, 1))
# The linear model is the one the script effectively used before.
y = linear_func(X).ravel() + np.random.randn(n_sample)

quantForest = QuantileForest().fit(X, y)

n_quantiles = 10
alpha = 0.9
# The quantiles are computed on the same grid as the one that is plotted
# below, so that both arrays have matching lengths.
x = np.linspace(xmin, xmax, n_quantiles)
quantiles = quantForest.compute_quantile(x, alpha, do_optim=True)
print(quantiles)

y_cdf = np.linspace(0., 30., 50)
CDFs = quantForest.compute_CDF(x, y_cdf)
print(CDFs.shape)

if dim == 1:
    plt.ion()
    fig, ax = plt.subplots()
    ax.plot(X, y, '.k')
    ax.plot(x, quantiles, 'ob')
    fig.tight_layout()
    plt.show()
#!/usr/bin/env python
"""A package based on sklearn Random Forest to compute
conditional quantiles.

See:
https://github.com/NazBen/pyquantregForest
"""

# Always prefer setuptools over distutils
from setuptools import setup, find_packages
# To use a consistent encoding
from codecs import open
from os import path

here = path.abspath(path.dirname(__file__))

# Get the long description from the README file
with open(path.join(here, 'README.rst'), encoding='utf-8') as f:
    long_description = f.read()

setup(
    name='pyquantregForest',

    version='0.1',

    description=('A package based on sklearn Random Forest to compute '
                 'conditional quantiles'),
    # The README is read above but not published as the long description.
    #long_description=long_description,

    url='https://github.com/NazBen/pyquantregForest',

    author='Nazih Benoumechiara',
    author_email='nazih.benoumechiara@gmail.com',

    license='MIT',

    keywords='sklearn randomforest quantile',

    packages=['pyquantregForest'],  # Python packages to install

    # pathos is imported by pyquantregForest/pyquantregForest.py at module
    # load, so the package cannot be imported without it.
    install_requires=['numpy', 'scipy', 'pandas', 'matplotlib',
                      'scikit-learn', 'pathos'],
)