├── README.md
└── my_gcForest.ipynb


/README.md:
--------------------------------------------------------------------------------
 1 | ### Purpose
 2 | This is for everyone to get a taste of gcForest before Professor Zhou releases their code.
 3 | 
 4 | ### Some things you may want to know
 5 | Because of my computer's limited performance, I restricted the training set size and the scanner size in the code.
 6 | 
 7 | I tried to implement MultiGrainedScaner and CascadeForest exactly as described in Professor Zhou's [paper](https://arxiv.org/abs/1702.08835). In practice, however, I used CV predictions (or OOB predictions) instead of averaged CV results. That may cause some differences.
 8 | 
 9 | ### Thanks
10 | Thanks to Professor Zhou for the deep forest model. It performs excellently on small-scale data, and I look forward to seeing its performance on large-scale data and its future extensions.
11 | 
12 | Thanks to everyone who reads this code. I would be very honored if it helps you even a little.


--------------------------------------------------------------------------------
/my_gcForest.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": false
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "from sklearn.ensemble import RandomForestClassifier\n",
 13 |     "from sklearn.cross_validation import cross_val_predict as cvp\n",
 14 |     "import random\n",
 15 |     "from functools import reduce"
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "code",
 20 |    "execution_count": 2,
 21 |    "metadata": {
 22 |     "collapsed": false
 23 |    },
 24 |    "outputs": [],
 25 |    "source": [
 26 |     "class MultiGrainedScaner():\n",
 27 |     "    def __init__(self, base_estimator, params_list, sliding_ratio = 0.25, k_fold = 3):\n",
 28 |     "        if k_fold > 1: #use cv\n",
 29 |     "            self.params_list = params_list\n",
 30 |     "        else:#use oob\n",
 31 |     "            self.params_list = [params.update({'oob_score':True}) or params for params in params_list]\n",
 32 |     "        self.sliding_ratio = sliding_ratio\n",
 33 |     "        self.k_fold = k_fold\n",
 34 |     "        self.base_estimator = base_estimator\n",
 35 |     "        klass = self.base_estimator.__class__\n",
 36 |     "        self.estimators = [klass(**params) for params in self.params_list]\n",
 37 |     "\n",
 38 |     "    #generate scaned samples, X is not None, X[0] is no more than 3d\n",
 39 |     "    def _sample_slicer(self,X,y):\n",
 40 |     "        data_shape = X[0].shape\n",
 41 |     "        window_shape = [max(int(data_size * self.sliding_ratio),1) for data_size in data_shape]\n",
 42 |     "        scan_round_axis = [data_shape[i]-window_shape[i]+1 for i in range(len(data_shape))]\n",
 43 |     "        scan_round_total = reduce(lambda acc,x: acc*x,scan_round_axis)\n",
 44 |     "        if len(data_shape) == 1:\n",
 45 |     "            newX = np.array([x[beg:beg+window_shape[0]]\n",
 46 |     "                                for x in X\n",
 47 |     "                                    for beg in range(scan_round_axis[0])])\n",
 48 |     "        elif len(data_shape) == 2:\n",
 49 |     "            newX = np.array([x[beg0:beg0+window_shape[0],beg1:beg1+window_shape[1]].ravel()\n",
 50 |     "                                for x in X\n",
 51 |     "                                    for beg0 in range(scan_round_axis[0])\n",
 52 |     "                                        for beg1 in range(scan_round_axis[1])])\n",
 53 |     "        elif len(data_shape) == 3:\n",
 54 |     "            newX = np.array([x[beg0:beg0+window_shape[0],beg1:beg1+window_shape[1],beg2:beg2+window_shape[2]].ravel()\n",
 55 |     "                                for x in X\n",
 56 |     "                                    for beg0 in range(scan_round_axis[0])\n",
 57 |     "                                        for beg1 in range(scan_round_axis[1])\n",
 58 |     "                                            for beg2 in range(scan_round_axis[2])])\n",
 59 |     "        newy = y.repeat(scan_round_total)\n",
 60 |     "        return newX,newy,scan_round_total\n",
 61 |     "\n",
 62 |     "    #generate new sample vectors\n",
 63 |     "    def scan_fit(self,X,y):\n",
 64 |     "        self.n_classes = len(np.unique(y))\n",
 65 |     "        newX,newy,scan_round_total = self._sample_slicer(X,y)\n",
 66 |     "        sample_vector_list = []\n",
 67 |     "        for estimator in self.estimators:\n",
 68 |     "            estimator.fit(newX, newy)\n",
 69 |     "            if self.k_fold > 1:# use cv\n",
 70 |     "                predict_ = cvp(estimator, newX, newy, cv=self.k_fold, n_jobs = -1)\n",
 71 |     "            else:#use oob\n",
 72 |     "                predict_ = estimator.oob_decision_function_\n",
 73 |     "                #fill default value if meet nan\n",
 74 |     "                inds = np.where(np.isnan(predict_))\n",
 75 |     "                predict_[inds] = 1./self.n_classes\n",
 76 |     "            sample_vector = predict_.reshape((len(X),scan_round_total*self.n_classes))\n",
 77 |     "            sample_vector_list.append(sample_vector)\n",
 78 |     "        return np.hstack(sample_vector_list)\n",
 79 |     "\n",
 80 |     "    def scan_predict(self,X):\n",
 81 |     "        newX,newy,scan_round_total = self._sample_slicer(X,np.zeros(len(X)))\n",
 82 |     "        sample_vector_list = []\n",
 83 |     "        for estimator in self.estimators:\n",
 84 |     "            predict_ = estimator.predict(newX)\n",
 85 |     "            sample_vector = predict_.reshape((len(X),scan_round_total*self.n_classes))\n",
 86 |     "            sample_vector_list.append(sample_vector)\n",
 87 |     "        return np.hstack(sample_vector_list)"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "code",
 92 |    "execution_count": 3,
 93 |    "metadata": {
 94 |     "collapsed": false
 95 |    },
 96 |    "outputs": [],
 97 |    "source": [
 98 |     "class CascadeForest():\n",
 99 |     "    def __init__(self, base_estimator, params_list, k_fold = 3, evaluate = lambda pre,y: float(sum(pre==y))/len(y)):\n",
100 |     "        if k_fold > 1: #use cv\n",
101 |     "            self.params_list = params_list\n",
102 |     "        else:#use oob\n",
103 |     "            self.params_list = [params.update({'oob_score':True}) or params for params in params_list]\n",
104 |     "        self.k_fold = k_fold\n",
105 |     "        self.evaluate = evaluate\n",
106 |     "        self.base_estimator = base_estimator\n",
107 |     "#         base_class = base_estimator.__class__\n",
108 |     "#         global prob_class\n",
109 |     "#         class prob_class(base_class): #to use cross_val_predict, estimator's predict method should be predict_prob\n",
110 |     "#             def predict(self, X):\n",
111 |     "#                 return base_class.predict_proba(self, X)\n",
112 |     "#         self.base_estimator = prob_class()\n",
113 |     "\n",
114 |     "    def fit(self,X_train,y_train):\n",
115 |     "        self.n_classes = len(np.unique(y_train))\n",
116 |     "        self.estimators_levels = []\n",
117 |     "        klass = self.base_estimator.__class__\n",
118 |     "        predictions_levels = []\n",
119 |     "        self.classes = np.unique(y_train)\n",
120 |     "\n",
121 |     "        #first level\n",
122 |     "        estimators = [klass(**params) for params in self.params_list]\n",
123 |     "        self.estimators_levels.append(estimators)\n",
124 |     "        predictions = []\n",
125 |     "        for estimator in estimators:\n",
126 |     "            estimator.fit(X_train, y_train)\n",
127 |     "            if self.k_fold > 1:# use cv\n",
128 |     "                predict_ = cvp(estimator, X_train, y_train, cv=self.k_fold, n_jobs = -1)\n",
129 |     "            else:#use oob\n",
130 |     "                predict_ = estimator.oob_decision_function_\n",
131 |     "                #fill default value if meet nan\n",
132 |     "                inds = np.where(np.isnan(predict_))\n",
133 |     "                predict_[inds] = 1./self.n_classes\n",
134 |     "            predictions.append(predict_)\n",
135 |     "        attr_to_next_level = np.hstack(predictions)\n",
136 |     "        y_pre = self.classes.take(np.argmax(np.array(predictions).mean(axis=0),axis=1),axis=0)\n",
137 |     "        self.max_accuracy = self.evaluate(y_pre,y_train)\n",
138 |     "\n",
139 |     "        #cascade step\n",
140 |     "        while True:\n",
141 |     "            print 'level {}, CV accuracy: {}'.format(len(self.estimators_levels),self.max_accuracy)\n",
142 |     "            estimators = [klass(**params) for params in self.params_list]\n",
143 |     "            self.estimators_levels.append(estimators)\n",
144 |     "            predictions = []\n",
145 |     "            X_train_step = np.hstack((attr_to_next_level,X_train))\n",
146 |     "            for estimator in estimators:\n",
147 |     "                estimator.fit(X_train_step, y_train)\n",
148 |     "                if self.k_fold > 1:# use cv\n",
149 |     "                    predict_ = cvp(estimator, X_train_step, y_train, cv=self.k_fold, n_jobs = -1)\n",
150 |     "                else:#use oob\n",
151 |     "                    predict_ = estimator.oob_decision_function_\n",
152 |     "                    #fill default value if meet nan\n",
153 |     "                    inds = np.where(np.isnan(predict_))\n",
154 |     "                    predict_[inds] = 1./self.n_classes\n",
155 |     "                predictions.append(predict_)\n",
156 |     "            attr_to_next_level = np.hstack(predictions)\n",
157 |     "            y_pre = self.classes.take(np.argmax(np.array(predictions).mean(axis=0),axis=1),axis=0)\n",
158 |     "            accuracy = self.evaluate(y_pre,y_train)\n",
159 |     "            if accuracy > self.max_accuracy:\n",
160 |     "                self.max_accuracy = accuracy\n",
161 |     "            else:\n",
162 |     "                self.estimators_levels.pop()\n",
163 |     "                break\n",
164 |     "\n",
165 |     "    def predict_proba_staged(self,X):\n",
166 |     "        #init ouput, shape = nlevel * nsample * nclass\n",
167 |     "        self.proba_staged = np.zeros((len(self.estimators_levels),len(X),self.n_classes))\n",
168 |     "\n",
169 |     "        #first level\n",
170 |     "        estimators = self.estimators_levels[0]\n",
171 |     "        predictions = []\n",
172 |     "        for estimator in estimators:\n",
173 |     "            predict_ = estimator.predict(X)\n",
174 |     "            predictions.append(predict_)\n",
175 |     "        attr_to_next_level = np.hstack(predictions)\n",
176 |     "        self.proba_staged[0] = np.array(predictions).mean(axis=0) #不同estimator求平均\n",
177 |     "\n",
178 |     "        #cascade step\n",
179 |     "        for i in range(1,len(self.estimators_levels)):\n",
180 |     "            estimators = self.estimators_levels[i]\n",
181 |     "            predictions = []\n",
182 |     "            X_step = np.hstack((attr_to_next_level,X))\n",
183 |     "            for estimator in estimators:\n",
184 |     "                predict_ = estimator.predict(X_step)\n",
185 |     "                predictions.append(predict_)\n",
186 |     "            attr_to_next_level = np.hstack(predictions)\n",
187 |     "            self.proba_staged[i] = np.array(predictions).mean(axis=0)\n",
188 |     "\n",
189 |     "        return self.proba_staged\n",
190 |     "    \n",
191 |     "    def predict_proba(self,X):\n",
192 |     "        return self.predict_proba_staged(X)[-1]\n",
193 |     "    \n",
194 |     "    def predict_staged(self,X):\n",
195 |     "        proba_staged = self.predict_proba_staged(X)\n",
196 |     "        predictions_staged = np.apply_along_axis(lambda proba: self.classes.take(np.argmax(proba),axis=0),\n",
197 |     "                                                 2, \n",
198 |     "                                                 proba_staged)\n",
199 |     "        return predictions_staged\n",
200 |     "\n",
201 |     "    def predict(self,X):\n",
202 |     "        proba = self.predict_proba(X)\n",
203 |     "        predictions = self.classes.take(np.argmax(proba,axis=1),axis=0) #平均值最大的index对应的class\n",
204 |     "        return predictions"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "code",
209 |    "execution_count": null,
210 |    "metadata": {
211 |     "collapsed": false
212 |    },
213 |    "outputs": [],
214 |    "source": []
215 |   },
216 |   {
217 |    "cell_type": "code",
218 |    "execution_count": 4,
219 |    "metadata": {
220 |     "collapsed": false
221 |    },
222 |    "outputs": [],
223 |    "source": [
224 |     "from sklearn.datasets import fetch_mldata\n",
225 |     "mnist = fetch_mldata('MNIST original')\n",
226 |     "\n",
227 |     "# Trunk the data\n",
228 |     "n_train = 60000\n",
229 |     "n_test = 10000\n",
230 |     "\n",
231 |     "# Define training and testing sets\n",
232 |     "train_idx = np.arange(n_train)\n",
233 |     "test_idx = np.arange(n_test)+n_train\n",
234 |     "random.shuffle(train_idx)\n",
235 |     "\n",
236 |     "X_train, y_train = mnist.data[train_idx], mnist.target[train_idx]\n",
237 |     "X_test, y_test = mnist.data[test_idx], mnist.target[test_idx]"
238 |    ]
239 |   },
240 |   {
241 |    "cell_type": "code",
242 |    "execution_count": null,
243 |    "metadata": {
244 |     "collapsed": false
245 |    },
246 |    "outputs": [],
247 |    "source": []
248 |   },
249 |   {
250 |    "cell_type": "code",
251 |    "execution_count": 5,
252 |    "metadata": {
253 |     "collapsed": false
254 |    },
255 |    "outputs": [],
256 |    "source": [
257 |     "scan_forest_params1 = RandomForestClassifier(n_estimators=30,min_samples_split=21,max_features=1,n_jobs=-1).get_params()\n",
258 |     "scan_forest_params2 = RandomForestClassifier(n_estimators=30,min_samples_split=21,max_features='sqrt',n_jobs=-1).get_params()\n",
259 |     "\n",
260 |     "cascade_forest_params1 = RandomForestClassifier(n_estimators=1000,min_samples_split=11,max_features=1,n_jobs=-1).get_params()\n",
261 |     "cascade_forest_params2 = RandomForestClassifier(n_estimators=1000,min_samples_split=11,max_features='sqrt',n_jobs=-1).get_params()\n",
262 |     "\n",
263 |     "scan_params_list = [scan_forest_params1,scan_forest_params2]\n",
264 |     "cascade_params_list = [cascade_forest_params1,cascade_forest_params2]*2"
265 |    ]
266 |   },
267 |   {
268 |    "cell_type": "code",
269 |    "execution_count": 6,
270 |    "metadata": {
271 |     "collapsed": false
272 |    },
273 |    "outputs": [],
274 |    "source": [
275 |     "def calc_accuracy(pre,y):\n",
276 |     "    return float(sum(pre==y))/len(y)\n",
277 |     "class ProbRandomForestClassifier(RandomForestClassifier):\n",
278 |     "    def predict(self, X):\n",
279 |     "        return RandomForestClassifier.predict_proba(self, X)"
280 |    ]
281 |   },
282 |   {
283 |    "cell_type": "code",
284 |    "execution_count": 7,
285 |    "metadata": {
286 |     "collapsed": true
287 |    },
288 |    "outputs": [],
289 |    "source": [
 290 |     "train_size = 1000 # number of leading rows of the shuffled training set used by the cells below"
291 |    ]
292 |   },
293 |   {
294 |    "cell_type": "code",
295 |    "execution_count": 8,
296 |    "metadata": {
297 |     "collapsed": false
298 |    },
299 |    "outputs": [
300 |     {
301 |      "name": "stdout",
302 |      "output_type": "stream",
303 |      "text": [
304 |       "level 1, CV accuracy: 0.958\n",
305 |       "level 2, CV accuracy: 0.961\n",
306 |       "level 1, test accuracy: 0.9592\n",
307 |       "level 2, test accuracy: 0.9612\n"
308 |      ]
309 |     }
310 |    ],
311 |    "source": [
312 |     "# gcForest \n",
313 |     "\n",
314 |     "# Multi-Grained Scan Step\n",
315 |     "Scaner1 = MultiGrainedScaner(ProbRandomForestClassifier(), scan_params_list, sliding_ratio = 1./4)\n",
316 |     "Scaner2 = MultiGrainedScaner(ProbRandomForestClassifier(), scan_params_list, sliding_ratio = 1./9)\n",
317 |     "Scaner3 = MultiGrainedScaner(ProbRandomForestClassifier(), scan_params_list, sliding_ratio = 1./16)\n",
318 |     "\n",
319 |     "X_train_scan =np.hstack([scaner.scan_fit(X_train[:train_size].reshape((train_size,28,28)), y_train[:train_size])\n",
320 |     "                             for scaner in [Scaner1,Scaner2,Scaner3][:1]])\n",
321 |     "X_test_scan = np.hstack([scaner.scan_predict(X_test.reshape((len(X_test),28,28)))\n",
322 |     "                             for scaner in [Scaner1,Scaner2,Scaner3][:1]])\n",
323 |     "\n",
324 |     "# Cascade RandomForest Step\n",
325 |     "CascadeRF = CascadeForest(ProbRandomForestClassifier(),cascade_params_list)\n",
326 |     "CascadeRF.fit(X_train_scan, y_train[:train_size])\n",
327 |     "y_pre_staged = CascadeRF.predict_staged(X_test_scan)\n",
328 |     "test_accuracy_staged = np.apply_along_axis(lambda y_pre: calc_accuracy(y_pre,y_test), 1, y_pre_staged)\n",
329 |     "print '\\n'.join('level {}, test accuracy: {}'.format(i+1,test_accuracy_staged[i]) for i in xrange(len(test_accuracy_staged)))"
330 |    ]
331 |   },
332 |   {
333 |    "cell_type": "code",
334 |    "execution_count": 9,
335 |    "metadata": {
336 |     "collapsed": false
337 |    },
338 |    "outputs": [
339 |     {
340 |      "name": "stdout",
341 |      "output_type": "stream",
342 |      "text": [
343 |       "level 1, CV accuracy: 0.884\n",
344 |       "level 2, CV accuracy: 0.889\n",
345 |       "level 1, test accuracy: 0.899\n",
346 |       "level 2, test accuracy: 0.9116\n"
347 |      ]
348 |     }
349 |    ],
350 |    "source": [
351 |     "# CascadeRF baseline\n",
352 |     "BaseCascadeRF = CascadeForest(ProbRandomForestClassifier(),cascade_params_list,k_fold=3)\n",
353 |     "BaseCascadeRF.fit(X_train[:train_size], y_train[:train_size])\n",
354 |     "y_pre_staged = BaseCascadeRF.predict_staged(X_test)\n",
355 |     "test_accuracy_staged = np.apply_along_axis(lambda y_pre: calc_accuracy(y_pre,y_test), 1, y_pre_staged)\n",
356 |     "print '\\n'.join('level {}, test accuracy: {}'.format(i+1,test_accuracy_staged[i]) for i in xrange(len(test_accuracy_staged)))"
357 |    ]
358 |   },
359 |   {
360 |    "cell_type": "code",
361 |    "execution_count": 10,
362 |    "metadata": {
363 |     "collapsed": false
364 |    },
365 |    "outputs": [
366 |     {
367 |      "name": "stdout",
368 |      "output_type": "stream",
369 |      "text": [
370 |       "0.904\n"
371 |      ]
372 |     }
373 |    ],
374 |    "source": [
375 |     "# RF baseline\n",
376 |     "RF = RandomForestClassifier(n_estimators=1000)\n",
377 |     "RF.fit(X_train[:train_size], y_train[:train_size])\n",
378 |     "y_pre = RF.predict(X_test)\n",
379 |     "print calc_accuracy(y_pre,y_test)"
380 |    ]
381 |   },
382 |   {
383 |    "cell_type": "code",
384 |    "execution_count": null,
385 |    "metadata": {
386 |     "collapsed": false
387 |    },
388 |    "outputs": [],
389 |    "source": []
390 |   },
391 |   {
392 |    "cell_type": "code",
393 |    "execution_count": null,
394 |    "metadata": {
395 |     "collapsed": false
396 |    },
397 |    "outputs": [],
398 |    "source": []
399 |   },
400 |   {
401 |    "cell_type": "code",
402 |    "execution_count": null,
403 |    "metadata": {
404 |     "collapsed": false
405 |    },
406 |    "outputs": [],
407 |    "source": []
408 |   },
409 |   {
410 |    "cell_type": "code",
411 |    "execution_count": null,
412 |    "metadata": {
413 |     "collapsed": false
414 |    },
415 |    "outputs": [],
416 |    "source": []
417 |   },
418 |   {
419 |    "cell_type": "code",
420 |    "execution_count": null,
421 |    "metadata": {
422 |     "collapsed": true
423 |    },
424 |    "outputs": [],
425 |    "source": []
426 |   }
427 |  ],
428 |  "metadata": {
429 |   "kernelspec": {
430 |    "display_name": "Python 2",
431 |    "language": "python",
432 |    "name": "python2"
433 |   },
434 |   "language_info": {
435 |    "codemirror_mode": {
436 |     "name": "ipython",
437 |     "version": 2
438 |    },
439 |    "file_extension": ".py",
440 |    "mimetype": "text/x-python",
441 |    "name": "python",
442 |    "nbconvert_exporter": "python",
443 |    "pygments_lexer": "ipython2",
444 |    "version": "2.7.10"
445 |   }
446 |  },
447 |  "nbformat": 4,
448 |  "nbformat_minor": 0
449 | }
450 | 


--------------------------------------------------------------------------------