"""A small gcForest demo: multi-grained scanning followed by a cascade forest.

Purpose
-------
Let everyone have a taste of gcForest before Professor Zhou's group releases
its own code.

Things you may want to know
---------------------------
* Because of limited computing power, the accompanying experiments restrict
  the training-set size and the number of scanners that are actually used.
* ``MultiGrainedScaner`` and ``CascadeForest`` try to follow the description
  in Professor Zhou's paper (https://arxiv.org/abs/1702.08835).  However, the
  class vectors are cross-validated predictions (or out-of-bag predictions)
  rather than averaged cross-validation results, which may cause some
  differences.

Thanks
------
Thanks to Professor Zhou for the deep forest model.  It performs excellently
on small-scale data; looking forward to its performance on large-scale data
and to future extensions.  Thanks to everyone who reads this code -- it would
be an honour if it helps you a little.
"""

import itertools
import operator
import random  # not used by the classes; retained from the notebook's import cell
from functools import reduce

import numpy as np


def _cross_val_predict(*args, **kwargs):
    """Call scikit-learn's ``cross_val_predict``, wherever it lives.

    The function moved from ``sklearn.cross_validation`` to
    ``sklearn.model_selection``; the new location is tried first and the old
    one is kept as a fallback.  The import is done lazily so that
    scikit-learn is only required when cross-validation is actually used.
    """
    try:
        from sklearn.model_selection import cross_val_predict
    except ImportError:
        from sklearn.cross_validation import cross_val_predict
    return cross_val_predict(*args, **kwargs)


def _with_oob_score(params_list):
    """Return copies of the parameter dicts with ``oob_score`` switched on.

    Copies are made so that the caller's dictionaries are never modified.
    """
    return [dict(params, oob_score=True) for params in params_list]


def _fit_and_predict_held_out(estimator, X, y, k_fold, n_classes):
    """Fit ``estimator`` on ``(X, y)`` and return held-out predictions for ``X``.

    The estimator itself is always fitted on the full data so that it can be
    used for prediction later.  The returned array comes from

    * ``cross_val_predict`` with ``k_fold`` folds when ``k_fold > 1``.  This
      calls the estimator's ``predict``, so ``predict`` must return class
      probabilities for the result to be usable by the callers.
    * ``estimator.oob_decision_function_`` otherwise.  NaN entries in it are
      replaced (in place) by the uniform probability ``1 / n_classes``.
    """
    estimator.fit(X, y)
    if k_fold > 1:
        return _cross_val_predict(estimator, X, y, cv=k_fold, n_jobs=-1)
    proba = estimator.oob_decision_function_
    proba[np.isnan(proba)] = 1. / n_classes
    return proba


class MultiGrainedScaner(object):
    """Turn raw samples into class vectors by sliding-window scanning.

    Every sample is cut into overlapping windows; each window becomes a
    training instance that inherits the label of its sample.  The per-window
    predictions of every estimator are concatenated into one feature vector
    per sample.

    Parameters
    ----------
    base_estimator : estimator instance
        Only its class is used; one estimator is created per entry of
        ``params_list``.  Its ``predict`` must return class probabilities
        (one column per class), because the outputs are reshaped to
        ``n_windows * n_classes`` values per sample.
    params_list : list of dict
        Keyword arguments for each estimator.  The dicts are not modified.
    sliding_ratio : float
        Window length along every axis as a fraction of that axis' length
        (at least one element).  Must satisfy ``0 < sliding_ratio <= 1``.
    k_fold : int
        ``> 1``: class vectors come from k-fold cross-validated predictions.
        Otherwise out-of-bag predictions are used and ``oob_score=True`` is
        added to every parameter set.

    Raises
    ------
    ValueError
        If ``sliding_ratio`` is outside ``(0, 1]``.
    """

    def __init__(self, base_estimator, params_list, sliding_ratio=0.25, k_fold=3):
        if not 0 < sliding_ratio <= 1:
            raise ValueError(
                'sliding_ratio must be in (0, 1], got {!r}'.format(sliding_ratio))
        if k_fold > 1:  # use cross-validation
            self.params_list = params_list
        else:  # use out-of-bag estimates
            self.params_list = _with_oob_score(params_list)
        self.sliding_ratio = sliding_ratio
        self.k_fold = k_fold
        self.base_estimator = base_estimator
        klass = self.base_estimator.__class__
        self.estimators = [klass(**params) for params in self.params_list]

    def _sample_slicer(self, X, y):
        """Cut every sample of ``X`` into flattened sliding windows.

        ``X`` must be non-empty and all samples are expected to share the
        shape of ``X[0]``; samples may have any number of dimensions.

        Returns
        -------
        newX : ndarray, shape (len(X) * scan_round_total, window_size)
            Flattened windows, grouped by sample; within a sample the window
            position varies fastest along the last axis.
        newy : ndarray
            Every label of ``y`` repeated ``scan_round_total`` times.
        scan_round_total : int
            Number of windows per sample.
        """
        data_shape = X[0].shape
        window_shape = [max(int(size * self.sliding_ratio), 1) for size in data_shape]
        scan_round_axis = [size - window + 1
                           for size, window in zip(data_shape, window_shape)]
        scan_round_total = reduce(operator.mul, scan_round_axis)
        # The window positions are the same for every sample: build them once.
        windows = [
            tuple(slice(beg, beg + width) for beg, width in zip(origin, window_shape))
            for origin in itertools.product(*(range(n) for n in scan_round_axis))
        ]
        newX = np.array([x[window].ravel() for x in X for window in windows])
        newy = np.repeat(y, scan_round_total)
        return newX, newy, scan_round_total

    def scan_fit(self, X, y):
        """Fit all scanning estimators and return the training class vectors.

        Returns an array with one row per sample of ``X`` and
        ``n_windows * n_classes`` columns per estimator, built from held-out
        (cross-validated or out-of-bag) predictions.
        """
        self.n_classes = len(np.unique(y))
        newX, newy, scan_round_total = self._sample_slicer(X, y)
        vector_length = scan_round_total * self.n_classes
        sample_vector_list = []
        for estimator in self.estimators:
            predict_ = _fit_and_predict_held_out(
                estimator, newX, newy, self.k_fold, self.n_classes)
            sample_vector_list.append(predict_.reshape((len(X), vector_length)))
        return np.hstack(sample_vector_list)

    def scan_predict(self, X):
        """Return class vectors for new samples using the fitted estimators.

        ``scan_fit`` must have been called first (it sets ``n_classes`` and
        fits the estimators).
        """
        # The labels are irrelevant here; zeros are only placeholders.
        newX, _, scan_round_total = self._sample_slicer(X, np.zeros(len(X)))
        vector_length = scan_round_total * self.n_classes
        sample_vector_list = [
            estimator.predict(newX).reshape((len(X), vector_length))
            for estimator in self.estimators
        ]
        return np.hstack(sample_vector_list)


class CascadeForest(object):
    """Cascade of estimator levels grown until held-out accuracy stops rising.

    Each level holds one estimator per entry of ``params_list``.  The class
    probabilities produced by a level are concatenated with the original
    features and fed to the next level.

    Parameters
    ----------
    base_estimator : estimator instance
        Only its class is used.  Its ``predict`` must return class
        probabilities (one column per class).
    params_list : list of dict
        Keyword arguments for the estimators of every level.  The dicts are
        not modified.
    k_fold : int
        ``> 1``: use k-fold cross-validated predictions; otherwise use
        out-of-bag predictions (``oob_score=True`` is added to the params).
    evaluate : callable(predicted_labels, true_labels) -> number
        Score used to decide whether a new level is kept; higher is better.
        Defaults to plain accuracy.
    """

    def __init__(self, base_estimator, params_list, k_fold=3,
                 evaluate=lambda pre, y: float(sum(pre == y)) / len(y)):
        if k_fold > 1:  # use cross-validation
            self.params_list = params_list
        else:  # use out-of-bag estimates
            self.params_list = _with_oob_score(params_list)
        self.k_fold = k_fold
        self.evaluate = evaluate
        self.base_estimator = base_estimator

    def _vote(self, predictions):
        """Average the estimators' probabilities and pick the best class."""
        mean_proba = np.mean(predictions, axis=0)
        return self.classes.take(np.argmax(mean_proba, axis=1), axis=0)

    def fit(self, X_train, y_train):
        """Grow the cascade on the training data.

        The first level is always kept.  Every further level is kept only if
        its held-out score is strictly greater than the best score so far;
        the first level that fails this test is discarded and growing stops.
        One progress line is printed for every level that is kept.

        Sets ``classes``, ``n_classes``, ``estimators_levels`` and
        ``max_accuracy``.  Returns ``self``.
        """
        self.classes = np.unique(y_train)
        self.n_classes = len(self.classes)
        self.estimators_levels = []
        klass = self.base_estimator.__class__

        X_level = X_train  # the first level sees the raw features only
        while True:
            estimators = [klass(**params) for params in self.params_list]
            predictions = [
                _fit_and_predict_held_out(
                    estimator, X_level, y_train, self.k_fold, self.n_classes)
                for estimator in estimators
            ]
            accuracy = self.evaluate(self._vote(predictions), y_train)
            if self.estimators_levels and not accuracy > self.max_accuracy:
                break  # no improvement: drop this level and stop growing
            self.estimators_levels.append(estimators)
            self.max_accuracy = accuracy
            print('level {}, CV accuracy: {}'.format(
                len(self.estimators_levels), self.max_accuracy))
            # The next level sees this level's class vectors plus the raw features.
            X_level = np.hstack((np.hstack(predictions), X_train))
        return self

    def predict_proba_staged(self, X):
        """Return the averaged class probabilities after every level.

        The result has shape ``(n_levels, n_samples, n_classes)`` and is also
        stored in ``self.proba_staged``.
        """
        self.proba_staged = np.zeros(
            (len(self.estimators_levels), len(X), self.n_classes))
        X_level = X
        for level, estimators in enumerate(self.estimators_levels):
            predictions = [estimator.predict(X_level) for estimator in estimators]
            # Average over the estimators of this level.
            self.proba_staged[level] = np.mean(predictions, axis=0)
            X_level = np.hstack((np.hstack(predictions), X))
        return self.proba_staged

    def predict_proba(self, X):
        """Return the class probabilities of the last level."""
        return self.predict_proba_staged(X)[-1]

    def predict_staged(self, X):
        """Return the predicted labels after every level, shape (n_levels, n_samples)."""
        proba_staged = self.predict_proba_staged(X)
        return self.classes.take(np.argmax(proba_staged, axis=2), axis=0)

    def predict(self, X):
        """Return the labels predicted by the last level."""
        proba = self.predict_proba(X)
        # The class whose averaged probability is largest wins.
        return self.classes.take(np.argmax(proba, axis=1), axis=0)
# ---------------------------------------------------------------------------
# Data: MNIST, split into 60000 training and 10000 test samples.
# ---------------------------------------------------------------------------
import random

import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier

# The notebook originally used fetch_mldata('MNIST original'), which is no
# longer available in current scikit-learn releases; OpenML hosts the same
# 70000 x 784 data set.
mnist = fetch_openml('mnist_784', version=1)
# Depending on the scikit-learn version the result holds arrays or pandas
# objects, and the labels arrive as strings: normalise both to ndarrays.
mnist_data = np.asarray(mnist.data)
mnist_target = np.asarray(mnist.target).astype(int)

# Truncate the data
n_train = 60000
n_test = 10000

# Define training and testing sets.
# NOTE(review): assumes the OpenML copy keeps the usual row order (first 60000
# rows = training set, last 10000 rows = test set) -- confirm.
train_idx = np.arange(n_train)
test_idx = np.arange(n_test) + n_train
random.shuffle(train_idx)  # only the training rows are shuffled

X_train, y_train = mnist_data[train_idx], mnist_target[train_idx]
X_test, y_test = mnist_data[test_idx], mnist_target[test_idx]

# ---------------------------------------------------------------------------
# Forest parameter sets.  Each pair combines forests that consider a single
# feature per split (max_features=1) with forests that consider sqrt(n)
# features per split.
# ---------------------------------------------------------------------------
scan_forest_params1 = RandomForestClassifier(
    n_estimators=30, min_samples_split=21, max_features=1, n_jobs=-1).get_params()
scan_forest_params2 = RandomForestClassifier(
    n_estimators=30, min_samples_split=21, max_features='sqrt', n_jobs=-1).get_params()

cascade_forest_params1 = RandomForestClassifier(
    n_estimators=1000, min_samples_split=11, max_features=1, n_jobs=-1).get_params()
cascade_forest_params2 = RandomForestClassifier(
    n_estimators=1000, min_samples_split=11, max_features='sqrt', n_jobs=-1).get_params()

scan_params_list = [scan_forest_params1, scan_forest_params2]
# Two forests of each kind per cascade level (four estimators per level).
cascade_params_list = [cascade_forest_params1, cascade_forest_params2] * 2
import numpy as np
from sklearn.ensemble import RandomForestClassifier


def calc_accuracy(pre, y):
    """Return the fraction of predictions in ``pre`` that equal ``y``."""
    return float(np.sum(np.asarray(pre) == np.asarray(y))) / len(y)


class ProbRandomForestClassifier(RandomForestClassifier):
    """Random forest whose ``predict`` returns class probabilities.

    ``MultiGrainedScaner`` and ``CascadeForest`` call ``predict`` (directly
    and through ``cross_val_predict``) and expect probabilities back, so
    ``predict`` is redirected to ``predict_proba``.
    """

    def predict(self, X):
        return RandomForestClassifier.predict_proba(self, X)


def report_staged_accuracy(y_pre_staged, y_true):
    """Print and return the test accuracy reached after every cascade level.

    ``y_pre_staged`` holds one row of predicted labels per level.
    """
    accuracies = np.apply_along_axis(
        lambda y_pre: calc_accuracy(y_pre, y_true), 1, y_pre_staged)
    print('\n'.join('level {}, test accuracy: {}'.format(level, accuracy)
                    for level, accuracy in enumerate(accuracies, 1)))
    return accuracies


# Only the first `train_size` training samples are used, to keep the run time
# manageable.
train_size = 1000

# ---------------------------------------------------------------------------
# gcForest
# Output recorded in the original notebook:
#   level 1, CV accuracy: 0.958     level 1, test accuracy: 0.9592
#   level 2, CV accuracy: 0.961     level 2, test accuracy: 0.9612
# ---------------------------------------------------------------------------

# Multi-Grained Scan Step
Scaner1 = MultiGrainedScaner(ProbRandomForestClassifier(), scan_params_list, sliding_ratio=1. / 4)
Scaner2 = MultiGrainedScaner(ProbRandomForestClassifier(), scan_params_list, sliding_ratio=1. / 9)
Scaner3 = MultiGrainedScaner(ProbRandomForestClassifier(), scan_params_list, sliding_ratio=1. / 16)

# Only the first scanner is actually used (the [:1] slice keeps the run small).
active_scaners = [Scaner1, Scaner2, Scaner3][:1]

X_train_scan = np.hstack([
    scaner.scan_fit(X_train[:train_size].reshape((train_size, 28, 28)),
                    y_train[:train_size])
    for scaner in active_scaners
])
X_test_scan = np.hstack([
    scaner.scan_predict(X_test.reshape((len(X_test), 28, 28)))
    for scaner in active_scaners
])

# Cascade RandomForest Step
CascadeRF = CascadeForest(ProbRandomForestClassifier(), cascade_params_list)
CascadeRF.fit(X_train_scan, y_train[:train_size])
y_pre_staged = CascadeRF.predict_staged(X_test_scan)
test_accuracy_staged = report_staged_accuracy(y_pre_staged, y_test)

# ---------------------------------------------------------------------------
# CascadeRF baseline: the cascade on raw pixels, without scanning.
# Output recorded in the original notebook:
#   level 1, CV accuracy: 0.884     level 1, test accuracy: 0.899
#   level 2, CV accuracy: 0.889     level 2, test accuracy: 0.9116
# ---------------------------------------------------------------------------
BaseCascadeRF = CascadeForest(ProbRandomForestClassifier(), cascade_params_list, k_fold=3)
BaseCascadeRF.fit(X_train[:train_size], y_train[:train_size])
y_pre_staged = BaseCascadeRF.predict_staged(X_test)
test_accuracy_staged = report_staged_accuracy(y_pre_staged, y_test)

# ---------------------------------------------------------------------------
# RF baseline: a single random forest on raw pixels.
# Output recorded in the original notebook: 0.904
# ---------------------------------------------------------------------------
RF = RandomForestClassifier(n_estimators=1000)
RF.fit(X_train[:train_size], y_train[:train_size])
y_pre = RF.predict(X_test)
print(calc_accuracy(y_pre, y_test))
"outputs": [], 398 | "source": [] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "metadata": { 404 | "collapsed": false 405 | }, 406 | "outputs": [], 407 | "source": [] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": null, 412 | "metadata": { 413 | "collapsed": false 414 | }, 415 | "outputs": [], 416 | "source": [] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": { 422 | "collapsed": true 423 | }, 424 | "outputs": [], 425 | "source": [] 426 | } 427 | ], 428 | "metadata": { 429 | "kernelspec": { 430 | "display_name": "Python 2", 431 | "language": "python", 432 | "name": "python2" 433 | }, 434 | "language_info": { 435 | "codemirror_mode": { 436 | "name": "ipython", 437 | "version": 2 438 | }, 439 | "file_extension": ".py", 440 | "mimetype": "text/x-python", 441 | "name": "python", 442 | "nbconvert_exporter": "python", 443 | "pygments_lexer": "ipython2", 444 | "version": "2.7.10" 445 | } 446 | }, 447 | "nbformat": 4, 448 | "nbformat_minor": 0 449 | } 450 | --------------------------------------------------------------------------------