├── .gitignore ├── .travis.yml ├── LICENSE ├── Readme.rst ├── setup.py └── stacked_generalization ├── __init__.py ├── example ├── cross_validation_for_iris.py ├── fwls_regression.py ├── joblibed_classification.py ├── kaggle_titanic.py └── simple_regression.py └── lib ├── __init__.py ├── joblibed.py ├── stacking.py ├── test └── test.py └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | *.pyc 3 | build/ 4 | dist/ 5 | stacked_generalization.egg-info/ 6 | *.pkl 7 | pip_release.bat 8 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - 2.7 4 | - 3.5 5 | # Setup anaconda 6 | before_install: 7 | - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh 8 | - chmod +x miniconda.sh 9 | - ./miniconda.sh -b 10 | - export PATH=/home/travis/miniconda2/bin:$PATH 11 | - conda update --yes conda 12 | # The next couple lines fix a crash with multiprocessing on Travis and are not specific to using Miniconda 13 | - sudo rm -rf /dev/shm 14 | - sudo ln -s /run/shm /dev/shm 15 | # Install packages 16 | install: 17 | # - conda install --yes python=$TRAVIS_PYTHON_VERSION atlas numpy scipy nose scikit-learn pandas 18 | - conda install --yes atlas numpy scipy nose scikit-learn pandas 19 | - python setup.py install 20 | script: 21 | - cd stacked_generalization/lib/test 22 | - python test.py 23 | - cd ../../example 24 | - python cross_validation_for_iris.py 25 | - python simple_regression.py 26 | - python joblibed_classification.py 27 | - python fwls_regression.py 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | 
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Readme.rst: -------------------------------------------------------------------------------- 1 | |Build Status| 2 | 3 | stacked\_generalization 4 | ======================= 5 | 6 | Implemented the machine learning ***stacking technique[1]*** as a handy library 7 | in Python. Feature weighted linear stacking is also available. (See 8 | https://github.com/fukatani/stacked\_generalization/tree/master/stacked\_generalization/example) 9 | 10 | Including a simple model cache system: Joblibed classifier and Joblibed 11 | Regressor. 12 | 13 | Feature 14 | ------- 15 | 16 | 1) Any scikit-learn model is available for Stage 0 and Stage 1 model. 17 | ''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''' 18 | 19 | And stacked model itself has the same interface as scikit-learn library. 20 | '''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''' 21 | 22 | You can replace a model such as *RandomForestClassifier* with a *stacked 23 | model* easily in your scripts. And multi stage stacking is also easy. 24 | 25 | ex. 26 | 27 | .. 
code:: python 28 | 29 | from stacked_generalization.lib.stacking import StackedClassifier 30 | from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier 31 | from sklearn.linear_model import LogisticRegression, RidgeClassifier 32 | from sklearn import datasets, metrics 33 | iris = datasets.load_iris() 34 | 35 | # Stage 1 model 36 | bclf = LogisticRegression(random_state=1) 37 | 38 | # Stage 0 models 39 | clfs = [RandomForestClassifier(n_estimators=40, criterion = 'gini', random_state=1), 40 | GradientBoostingClassifier(n_estimators=25, random_state=1), 41 | RidgeClassifier(random_state=1)] 42 | 43 | # same interface as scikit-learn 44 | sl = StackedClassifier(bclf, clfs) 45 | sl.fit(iris.data, iris.target) 46 | score = metrics.accuracy_score(iris.target, sl.predict(iris.data)) 47 | print("Accuracy: %f" % score) 48 | 49 | More detailed examples are here. 50 | https://github.com/fukatani/stacked\_generalization/blob/master/stacked\_generalization/example/cross\_validation\_for\_iris.py 51 | 52 | https://github.com/fukatani/stacked\_generalization/blob/master/stacked\_generalization/example/simple\_regression.py 53 | 54 | 2) Model evaluation by out-of-bag score. 55 | ''''''''''''''''''''''''''''''''''''''''' 56 | 57 | The stacking technique itself uses CV for stage 0. So if you use CV for the entire 58 | stacked model, ***each stage 0 model is fitted n\_folds squared 59 | times.*** Sometimes its computational cost can be significant, therefore 60 | we implemented CV only for stage 1[2]. 61 | 62 | For example, when we get 3 blends (stage0 prediction), 2 blends are used 63 | for stage 1 fitting. The remaining blend is used for the model test. 64 | Repeating this cycle for all 3 blends, and averaging scores, we can get 65 | the oob (out-of-bag) score ***with only n\_fold times stage0 fitting.*** 66 | 67 | ex. 68 | 69 | .. 
code:: python 70 | 71 | sl = StackedClassifier(bclf, clfs, oob_score_flag=True) 72 | sl.fit(iris.data, iris.target) 73 | print("Accuracy: %f" % sl.oob_score_) 74 | 75 | 3) Caching stage1 blend\_data and trained model. (optional) 76 | ''''''''''''''''''''''''''''''''''''''''''''''''''''''''''' 77 | 78 | If the cache exists, recalculation for stage 0 will be skipped. This 79 | function is useful for stage 1 tuning. 80 | 81 | .. code:: python 82 | 83 | sl = StackedClassifier(bclf, clfs, save_stage0=True, save_dir='stack_temp') 84 | 85 | Feature of Joblibed Classifier / Regressor 86 | ------------------------------------------ 87 | 88 | Joblibed Classifier / Regressor is a simple cache system for scikit-learn 89 | machine learning models. You can use it easily with minimum code 90 | modification. 91 | 92 | At the first fitting and prediction, model calculation is performed 93 | normally. At the same time, the model fitting result and prediction result 94 | are saved as *.pkl* and *.csv* respectively. 95 | 96 | **At the second fitting and prediction, if the cache exists, the model and 97 | prediction results will be loaded from the cache and nothing is recalculated.** 98 | 99 | e.g. 100 | 101 | .. code:: python 102 | 103 | from sklearn import datasets 104 | from sklearn.model_selection import StratifiedKFold 105 | from sklearn.ensemble import RandomForestClassifier 106 | from stacked_generalization.lib.joblibed import JoblibedClassifier 107 | 108 | # Load iris 109 | iris = datasets.load_iris() 110 | 111 | # Declaration of Joblibed model 112 | rf = RandomForestClassifier(n_estimators=40) 113 | clf = JoblibedClassifier(rf, "rf") 114 | 115 | train_idx, test_idx = list(StratifiedKFold(3).split(iris.data, iris.target))[0] 116 | 117 | xs_train = iris.data[train_idx] 118 | y_train = iris.target[train_idx] 119 | xs_test = iris.data[test_idx] 120 | y_test = iris.target[test_idx] 121 | 122 | # Need to indicate sample indices for discriminating cache existence. 
123 | clf.fit(xs_train, y_train, train_idx) 124 | score = clf.score(xs_test, y_test, test_idx) 125 | 126 | See also 127 | https://github.com/fukatani/stacked\_generalization/blob/master/stacked\_generalization/lib/joblibed.py 128 | 129 | Software Requirement 130 | -------------------- 131 | 132 | - Python (2.7 or 3.5 or later) 133 | - numpy 134 | - scikit-learn 135 | - pandas 136 | 137 | Installation 138 | ------------ 139 | 140 | :: 141 | 142 | pip install stacked_generalization 143 | 144 | License 145 | ------- 146 | 147 | Apache License 2.0. (http://www.apache.org/licenses/LICENSE-2.0) 148 | 149 | Copyright 150 | --------- 151 | 152 | Copyright (C) 2016, Ryosuke Fukatani 153 | 154 | Many parts of the implementation of stacking are based on the following. 155 | Thanks! 156 | https://github.com/log0/vertebral/blob/master/stacked\_generalization.py 157 | 158 | Other 159 | ----- 160 | 161 | Any contributions (implementation, documentation, tests or ideas...) are 162 | welcome. 163 | 164 | References 165 | ---------- 166 | 167 | [1] L. Breiman, "Stacked Regressions", Machine Learning, 24, 49-64 168 | (1996). [2] J. Sill et al., "Feature Weighted Linear Stacking", 169 | https://arxiv.org/abs/0911.0460, 2009. 170 | 171 | .. 
|Build Status| image:: https://travis-ci.org/fukatani/stacked_generalization.svg?branch=master 172 | :target: https://travis-ci.org/fukatani/stacked_generalization 173 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | from setuptools import setup, find_packages 4 | 5 | version = '0.0.6' 6 | 7 | install_requires = [ 8 | 'numpy', 9 | 'scikit-learn', 10 | 'pandas', 11 | ] 12 | 13 | CURRENT_DIR = os.path.abspath(os.path.dirname(__file__)) 14 | 15 | def read(filename): 16 | return io.open(os.path.join(CURRENT_DIR, filename), encoding='utf-8').read() 17 | 18 | setup(name='stacked_generalization', 19 | version=version, 20 | description='Machine Learning Stacking Util', 21 | keywords = 'Stacking, Machine Learning', 22 | author='Ryosuke Fukatani', 23 | author_email='nannyakannya@gmail.com', 24 | url='https://github.com/fukatani/stacked_generalization', 25 | license="Apache License 2.0", 26 | packages=find_packages(), 27 | package_data={ 'stacked_generalization' : ['Readme.md'], }, 28 | long_description='Readme.rst', 29 | install_requires=install_requires, 30 | ) 31 | 32 | -------------------------------------------------------------------------------- /stacked_generalization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fukatani/stacked_generalization/bd1e7aa7f090e6902cfbd389f9cd6500d7389954/stacked_generalization/__init__.py -------------------------------------------------------------------------------- /stacked_generalization/example/cross_validation_for_iris.py: -------------------------------------------------------------------------------- 1 | from sklearn import datasets 2 | from sklearn.utils.validation import check_random_state 3 | from stacked_generalization.lib.stacking import StackedClassifier 4 | from sklearn.ensemble import 
RandomForestClassifier 5 | from sklearn.ensemble import ExtraTreesClassifier 6 | from sklearn.ensemble import GradientBoostingClassifier 7 | from sklearn.neighbors import KNeighborsClassifier 8 | from sklearn.linear_model import LogisticRegression, RidgeClassifier 9 | from sklearn.linear_model import Ridge 10 | from sklearn.model_selection import StratifiedKFold 11 | from sklearn.manifold import TSNE 12 | 13 | iris = datasets.load_iris() 14 | rng = check_random_state(0) 15 | perm = rng.permutation(iris.target.size) 16 | iris.data = iris.data[perm] 17 | iris.target = iris.target[perm] 18 | 19 | # Stage 1 model 20 | bclf = LogisticRegression(random_state=1) 21 | 22 | # Stage 0 models 23 | clfs = [RandomForestClassifier(n_estimators=40, criterion = 'gini', random_state=1), 24 | ExtraTreesClassifier(n_estimators=30, criterion = 'gini', random_state=3), 25 | GradientBoostingClassifier(n_estimators=25, random_state=1), 26 | GradientBoostingClassifier(n_estimators=30, random_state=2), 27 | #GradientBoostingClassifier(n_estimators=30, random_state=3), 28 | KNeighborsClassifier(), 29 | RidgeClassifier(random_state=1), 30 | Ridge(), 31 | TSNE(n_components=2) 32 | ] 33 | 34 | sc = StackedClassifier(bclf, 35 | clfs, 36 | n_folds=3, 37 | verbose=0, 38 | stack_by_proba=True, 39 | oob_score_flag=True, 40 | ) 41 | 42 | gb = GradientBoostingClassifier(n_estimators=25, random_state=1) 43 | 44 | # cross validation 45 | sc_score = 0 46 | gb_score = 0 47 | n_folds = 3 48 | for train_idx, test_idx in StratifiedKFold(n_folds).split(iris.data, iris.target): 49 | xs_train = iris.data[train_idx] 50 | y_train = iris.target[train_idx] 51 | xs_test = iris.data[test_idx] 52 | y_test = iris.target[test_idx] 53 | 54 | sc.fit(xs_train, y_train) 55 | print('oob_score: {0}'.format(sc.oob_score_)) 56 | sc_score += sc.score(xs_test, y_test) 57 | gb.fit(xs_train, y_train) 58 | gb_score += gb.score(xs_test, y_test) 59 | 60 | sc_score /= n_folds 61 | print('Stacked Classfier score: {0}'.format(sc_score)) 
62 | gb_score /= n_folds 63 | print('Gradient Boosting Classfier score: {0}'.format(gb_score)) 64 | -------------------------------------------------------------------------------- /stacked_generalization/example/fwls_regression.py: -------------------------------------------------------------------------------- 1 | from sklearn import datasets, metrics, preprocessing 2 | from stacked_generalization.lib.stacking import FWLSRegressor 3 | from sklearn.ensemble import RandomForestRegressor 4 | from sklearn.ensemble import GradientBoostingRegressor 5 | from sklearn.ensemble import ExtraTreesRegressor 6 | from sklearn.linear_model import LinearRegression, Ridge 7 | import numpy as np 8 | 9 | 10 | boston = datasets.load_boston() 11 | X = preprocessing.StandardScaler().fit_transform(boston.data) 12 | Y = boston.target 13 | 14 | X_train = X[:200] 15 | Y_train = Y[:200] 16 | X_test = X[200:] 17 | Y_test = Y[200:] 18 | 19 | breg = LinearRegression() 20 | regs = [RandomForestRegressor(n_estimators=50, random_state=1), 21 | GradientBoostingRegressor(n_estimators=25, random_state=1), 22 | Ridge(), 23 | ExtraTreesRegressor(n_estimators=50), 24 | ] 25 | feature_func = lambda x: np.c_[np.ones((x.shape[0], 1)), 26 | x[:, 1].reshape((x.shape[0], 1)), 27 | x[:, 6].reshape((x.shape[0], 1)),] 28 | 29 | sr = FWLSRegressor(breg, 30 | regs, 31 | feature_func, 32 | n_folds=3, 33 | verbose=0, 34 | oob_score_flag=False) 35 | 36 | sr.fit(X_train, Y_train) 37 | score = metrics.mean_squared_error(sr.predict(X_test), Y_test) 38 | print ("MSE of stacked regressor: %f" % score) 39 | -------------------------------------------------------------------------------- /stacked_generalization/example/joblibed_classification.py: -------------------------------------------------------------------------------- 1 | from sklearn import datasets 2 | from sklearn.model_selection import StratifiedKFold 3 | from sklearn.ensemble import RandomForestClassifier 4 | from sklearn.utils.validation import 
check_random_state 5 | from stacked_generalization.lib.joblibed import JoblibedClassifier 6 | 7 | 8 | iris = datasets.load_iris() 9 | rng = check_random_state(0) 10 | perm = rng.permutation(iris.target.size) 11 | iris.data = iris.data[perm] 12 | iris.target = iris.target[perm] 13 | 14 | # Joblibed model 15 | rf = RandomForestClassifier(n_estimators=40, 16 | criterion='gini', 17 | random_state=1) 18 | clf = JoblibedClassifier(rf, "rf") 19 | 20 | 21 | train_idx, test_idx = list(StratifiedKFold(3).split(iris.data, iris.target))[0] 22 | 23 | xs_train = iris.data[train_idx] 24 | y_train = iris.target[train_idx] 25 | xs_test = iris.data[test_idx] 26 | y_test = iris.target[test_idx] 27 | 28 | 29 | print("First fit and prediction (not cached).") 30 | clf.fit(xs_train, y_train, train_idx) 31 | score = clf.score(xs_test, y_test, test_idx) 32 | print('Classfier score: {0}'.format(score)) 33 | 34 | print("Second fit and prediction (load cache).") 35 | clf.fit(xs_train, y_train, train_idx) 36 | score = clf.score(xs_test, y_test, test_idx) 37 | print('Classfier score: {0}'.format(score)) 38 | -------------------------------------------------------------------------------- /stacked_generalization/example/kaggle_titanic.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import StratifiedKFold 2 | from sklearn import preprocessing 3 | from sklearn.linear_model import LogisticRegression, Ridge 4 | from sklearn.preprocessing import LabelBinarizer 5 | from stacked_generalization.lib.stacking import StackedClassifier 6 | from sklearn.ensemble import RandomForestClassifier 7 | from sklearn.ensemble import ExtraTreesClassifier 8 | from sklearn.ensemble import GradientBoostingClassifier 9 | from sklearn.neighbors import KNeighborsClassifier 10 | 11 | import pandas as pd 12 | import numpy as np 13 | import re 14 | 15 | class DataReader(object): 16 | def __init__(self, file_name): 17 | self.file_name = file_name 18 | 19 | 
def disp_hist(self, data, label, bins): 20 | temp = [i[label].dropna() for i in data] 21 | plt.hist(temp, histtype='barstacked', bins=bins) 22 | plt.show() 23 | 24 | def pre_process(self, drop=True, title_to_onehot=True, norm_fare=True): 25 | def get_title(name): 26 | title_search = re.search(' ([A-Za-z]+)\.', name) 27 | if title_search: 28 | return title_search.group(1) 29 | return "" 30 | 31 | def normalize_fare(data): 32 | new_data = None 33 | for embarked in (0, 1, 2): 34 | temp = data[data.Embarked == embarked] 35 | temp['Fare'] /= temp['Fare'].values.mean() 36 | if new_data is None: 37 | new_data = temp 38 | else: 39 | new_data = pd.concat([new_data, temp]) 40 | new_data = new_data.sort('PassengerId') 41 | return new_data 42 | 43 | data = pd.read_csv(self.file_name).replace('male',0).replace('female',1) 44 | data['Age'].fillna(data.Age.median(), inplace=True) 45 | data['Fare'].fillna(data.Fare.median(), inplace=True) 46 | data['FamilySize'] = data['SibSp'] + data['Parch'] + 1 47 | data['Embarked'] = data['Embarked'].replace('S',0).replace('C',1).replace('Q',2) 48 | data['Embarked'].fillna(0, inplace=True) 49 | if norm_fare: 50 | data = normalize_fare(data) 51 | 52 | # Get all the titles and print how often each one occurs. 53 | titles = data["Name"].apply(get_title) 54 | print(pd.value_counts(titles)) 55 | 56 | # Map each title to an integer. Some titles are very rare, and are compressed into the same codes as other titles. 57 | title_mapping = {"Dona": 1, "Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2} 58 | for k,v in title_mapping.items(): 59 | titles[titles == k] = v 60 | 61 | # Add in the title column. 
62 | data['Title'] = titles 63 | data['Title'].fillna(1, inplace=True) 64 | #data['Pos'] = data["Title"] + data['Pclass'] 65 | if drop: 66 | #data = data.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Pclass', 'Cabin', 'Embarked'], axis=1) 67 | data = data.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked'], axis=1) 68 | #data = data.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked', 'Pclass', 'Title'], axis=1) 69 | print(data.keys()) 70 | if title_to_onehot: 71 | self.encode(data, 'Title', [i for i in range(1, 11)]) 72 | data = data.drop(['Title'], axis=1) 73 | return data 74 | 75 | def encode(self, data, label, value_set=None): 76 | le =LabelBinarizer() 77 | if value_set is None: 78 | encoded = le.fit_transform(data[label]) 79 | else: 80 | le.fit(value_set) 81 | encoded = le.transform(data[label]) 82 | for i in range(encoded.shape[1]): 83 | new_label = '{0}_is_{1}'.format(label, i) 84 | data[new_label] = encoded[:,i] 85 | 86 | def split_by_label(self, data, label='Survived'): 87 | split_data = [] 88 | for element in set(data[label]): 89 | split_data.append(data[data[label]==element]) 90 | return split_data 91 | 92 | def get_sample(self, N=600, scale=False): 93 | all_data = self.pre_process(self.file_name) 94 | #print('data_type: ' + str(all_data.dtypes)) 95 | all_data = all_data.values 96 | xs = all_data[:, 2:] 97 | y = all_data[:, 1] 98 | if scale: 99 | xs = preprocessing.scale(xs) 100 | if N != -1: 101 | perm = np.random.permutation(xs.shape[0]) 102 | xs = xs[perm] 103 | y = y[perm] 104 | xs_train, xs_test = np.split(xs, [N]) 105 | y_train, y_test = np.split(y, [N]) 106 | return xs_train, xs_test, y_train, y_test 107 | else: 108 | return xs, y 109 | 110 | def summarize_about_same_ticket(self, data): 111 | data = data.drop(['PassengerId', 'Name', 'SibSp', 'Parch', 'Cabin', 'FamilySize'], axis=1) 112 | for num in data[data.Age <= 5.0]['Ticket']: 113 | print('num:' + num) 114 | print(data[data.Ticket == num]) 115 | 116 | 117 | class 
TestDataReader(DataReader): 118 | def get_sample(self, N=-1): 119 | all_data = self.pre_process(self.file_name) 120 | all_data = all_data.values 121 | xs = all_data[:, 1:] 122 | pid = all_data[:, 0] 123 | return pid, xs 124 | 125 | def write_result(pid, output, suffix=''): 126 | import csv 127 | import datetime 128 | suffix += datetime.datetime.today().strftime("%Y-%m-%d-%H-%M-%S") 129 | with open("predict_result_data_{0}.csv".format(suffix), "w") as f: 130 | writer = csv.writer(f, lineterminator='\n') 131 | writer.writerow(["PassengerId", "Survived"]) 132 | for pid, survived in zip(pid.astype(int), output.astype(int)): 133 | writer.writerow([pid, survived]) 134 | 135 | if __name__ == '__main__': 136 | import os 137 | if not os.path.isfile('train.csv'): 138 | raise Exception('This example is data analysis for Kaggle Titanic Competition.' + 139 | 'For trying this example, you should download "train.csv" from https://www.kaggle.com/c/titanic.') 140 | 141 | train = True 142 | full_cv = True 143 | test = False 144 | 145 | train_dr = DataReader('train.csv') 146 | bclf = LogisticRegression(random_state=1) 147 | clfs = [ 148 | RandomForestClassifier(n_estimators=50, criterion = 'gini', random_state=1), 149 | ExtraTreesClassifier(n_estimators=50, criterion = 'gini', random_state=1), 150 | ExtraTreesClassifier(n_estimators=50, criterion = 'gini', random_state=2), 151 | GradientBoostingClassifier(n_estimators=25, random_state=1), 152 | GradientBoostingClassifier(n_estimators=40, random_state=1), 153 | Ridge(random_state=1), 154 | #KNeighborsClassifier(n_neighbors=4) 155 | #LogisticRegression(random_state=1) 156 | ] 157 | sl = StackedClassifier(bclf, clfs, n_folds=3, verbose=2) 158 | #fsl = FWSLClassifier(bclf, clfs, feature=xs_train[:, 0]) 159 | if train:# evalute by hold-out and out-of-bugs 160 | sl = StackedClassifier(bclf, clfs, n_folds=3, verbose=2, oob_score_flag=True) 161 | xs_train, xs_test, y_train, y_test = train_dr.get_sample() 162 | sl.fit(xs_train, y_train) 163 | 
score = sl.score(xs_test, y_test) 164 | print('score: {0}'.format(score)) 165 | print('oob_score: {0}'.format(sl.oob_score_)) 166 | if full_cv: #cross validation 167 | sl = StackedClassifier(bclf, clfs, oob_score_flag=False,verbose=2) 168 | xs_train, y_train = train_dr.get_sample(-1) 169 | score = [] 170 | for train_index, test_index in StratifiedKFold(3).split(xs_train, y_train): 171 | sl.fit(xs_train[train_index], y_train[train_index]) 172 | score.append(sl.score(xs_train[test_index], y_train[test_index])) 173 | print('full-cv score: {0}'.format(score)) 174 | if test: #to make pb leader board data. 175 | xs_train, y_train = train_dr.get_sample(-1) 176 | sl.fit(xs_train, y_train) 177 | test_dr = TestDataReader('test.csv') 178 | pid, xs_test = test_dr.get_sample(-1) 179 | output = sl.predict(xs_test) 180 | write_result(pid, output, sl.tostr()) -------------------------------------------------------------------------------- /stacked_generalization/example/simple_regression.py: -------------------------------------------------------------------------------- 1 | from sklearn import datasets, metrics, preprocessing 2 | from stacked_generalization.lib.stacking import StackedRegressor 3 | from sklearn.ensemble import RandomForestRegressor 4 | from sklearn.ensemble import GradientBoostingRegressor 5 | from sklearn.ensemble import ExtraTreesRegressor 6 | from sklearn.linear_model import LinearRegression, Ridge 7 | from sklearn.manifold import TSNE 8 | 9 | 10 | boston = datasets.load_boston() 11 | X = preprocessing.StandardScaler().fit_transform(boston.data) 12 | Y = boston.target 13 | 14 | X_train = X[:200] 15 | Y_train = Y[:200] 16 | X_test = X[200:] 17 | Y_test = Y[200:] 18 | 19 | breg = LinearRegression() 20 | regs = [RandomForestRegressor(n_estimators=50, random_state=1), 21 | GradientBoostingRegressor(n_estimators=25, random_state=1), 22 | GradientBoostingRegressor(n_estimators=30, random_state=2), 23 | Ridge(), 24 | ExtraTreesRegressor(n_estimators=50), 25 | 
TSNE(n_components=2) 26 | ] 27 | 28 | sr = StackedRegressor(breg, 29 | regs, 30 | n_folds=3, 31 | verbose=0, 32 | oob_score_flag=False) 33 | sr.fit(X_train, Y_train) 34 | score = metrics.mean_squared_error(sr.predict(X_test), Y_test) 35 | print ("MSE of stacked regressor: %f" % score) 36 | #print ("OOB of stacked regressor: %f" % sr.oob_score_) 37 | 38 | gb = GradientBoostingRegressor(n_estimators=25, random_state=1) 39 | gb.fit(X_train, Y_train) 40 | score = metrics.mean_squared_error(gb.predict(X_test), Y_test) 41 | print ("MSE of gradient boosting regressor: %f" % score) -------------------------------------------------------------------------------- /stacked_generalization/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fukatani/stacked_generalization/bd1e7aa7f090e6902cfbd389f9cd6500d7389954/stacked_generalization/lib/__init__.py -------------------------------------------------------------------------------- /stacked_generalization/lib/joblibed.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.base import BaseEstimator, ClassifierMixin, clone, RegressorMixin 3 | from sklearn.externals import joblib 4 | import stacked_generalization.lib.util as util 5 | import os 6 | 7 | 8 | class BaseJoblibed(BaseEstimator): 9 | """Base class for joblibed learner. 10 | 11 | Warning: This class should not be used directly. Use derived classes 12 | instead. 
13 | """ 14 | def __init__(self, 15 | estimator, 16 | prefix, 17 | skip_refit=True, 18 | cache_dir='temp/'): 19 | self.estimator = estimator 20 | self.prefix = prefix 21 | self.estimator.id = 'j' + prefix 22 | self.skip_refit = skip_refit 23 | self.cache_dir = cache_dir 24 | if self.cache_dir and not os.path.isdir(self.cache_dir): 25 | os.mkdir(self.cache_dir) 26 | 27 | def fit(self, xs_train, y_train, index=None): 28 | dump_file = "" 29 | if index is not None: 30 | dump_file = util.get_cache_file(self.estimator.id, 31 | index, 32 | cache_dir=self.cache_dir, 33 | suffix='pkl') 34 | if self.skip_refit and os.path.isfile(dump_file): 35 | if index is not None: 36 | self.estimator = joblib.load(dump_file) 37 | else: 38 | self.estimator.fit(xs_train, y_train) 39 | if index is not None: 40 | joblib.dump(self.estimator, dump_file, compress=True) 41 | return self 42 | 43 | 44 | class JoblibedClassifier(BaseJoblibed, ClassifierMixin): 45 | """A joblibed classifier. 46 | 47 | Parameters 48 | ---------- 49 | estimator : cache target model. 50 | prefix : file prefix. 51 | 52 | """ 53 | def predict_proba(self, xs_test, index=None): 54 | """Predict class probabilities for X. 55 | 56 | The predicted class probabilities of an input sample is computed. 57 | 58 | Parameters 59 | ---------- 60 | X : array-like or sparse matrix of shape = [n_samples, n_features] 61 | The input samples. 62 | 63 | Returns 64 | ------- 65 | p : array of shape = [n_samples, n_classes]. 66 | The class probabilities of the input samples. 67 | """ 68 | return util.saving_predict_proba(self.estimator, 69 | xs_test, 70 | index, 71 | self.cache_dir) 72 | 73 | def predict(self, X, index=None): 74 | """Predict class for X. 75 | 76 | The predicted class of an input sample is a vote by the JoblibedClassifier. 77 | 78 | Parameters 79 | ---------- 80 | X : array-like or sparse matrix of shape = [n_samples, n_features] 81 | The input samples. 
Internally, it will be converted to 82 | ``dtype=np.float32`` and if a sparse matrix is provided 83 | to a sparse ``csr_matrix``. 84 | 85 | Returns 86 | ------- 87 | y : array of shape = [n_samples] 88 | The predicted classes. 89 | """ 90 | proba = self.predict_proba(X, index) 91 | return np.argmax(proba, axis=1) 92 | 93 | def score(self, X, y, index=None, sample_weight=None): 94 | from sklearn.metrics import accuracy_score 95 | return accuracy_score(y, 96 | self.predict(X, index), 97 | sample_weight=sample_weight) 98 | 99 | 100 | class JoblibedRegressor(BaseJoblibed, RegressorMixin): 101 | """A joblibed regressor. 102 | 103 | Parameters 104 | ---------- 105 | estimator : cache target model. 106 | prefix : file prefix. 107 | 108 | """ 109 | def predict(self, xs_test, index=None): 110 | return util.saving_predict(self.estimator, 111 | xs_test, 112 | index, 113 | self.cache_dir) 114 | -------------------------------------------------------------------------------- /stacked_generalization/lib/stacking.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.model_selection import StratifiedKFold, KFold 3 | from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, clone 4 | from stacked_generalization.lib.util import numpy_c_concatenate 5 | from stacked_generalization.lib.util import multiple_feature_weight 6 | from sklearn.metrics import accuracy_score 7 | from sklearn.metrics import mean_squared_error 8 | from collections import OrderedDict 9 | from sklearn.preprocessing import LabelBinarizer 10 | from sklearn.externals import joblib 11 | from stacked_generalization.lib import util 12 | import os 13 | 14 | 15 | class BaseStacked(BaseEstimator): 16 | """Base class for joblibed learner. 17 | 18 | Warning: This class should not be used directly. Use derived classes 19 | instead. 
20 | """ 21 | def _fit_child(self, skf, xs_train, y_train): 22 | """Build stage0 models from the training set (xs_train, y_train). 23 | 24 | Parameters 25 | ---------- 26 | skf: StratifiedKFold-like iterator 27 | Use for cross validation blending. 28 | 29 | xs_train : array-like or sparse matrix of shape = [n_samples, n_features] 30 | The training input samples. 31 | 32 | y_train : array-like, shape = [n_samples] 33 | The target values (class labels in classification). 34 | 35 | Returns 36 | ------- 37 | blend_train : array-like, shape = [n_samples] 38 | For stage1 model training. 39 | blend_test : array-like, shape = [n_samples] 40 | If you use TwoStageKFold, blended sample for test will be prepared. 41 | """ 42 | blend_train = None 43 | blend_test = None 44 | for j, clf in enumerate(self.clfs): 45 | self._out_to_console('Training classifier [{0}]'.format(j), 0) 46 | all_learner_key = str(type(clf)) + str(j) 47 | self.all_learner[all_learner_key] = [] 48 | blend_train_j = None 49 | for i, (train_index, cv_index) in enumerate(skf): 50 | now_learner = clone(clf) 51 | self.all_learner[all_learner_key].append(now_learner) 52 | xs_now_train = xs_train[train_index] 53 | y_now_train = y_train[train_index] 54 | xs_cv = xs_train[cv_index] 55 | #y_cv = y_train[cv_index] no use 56 | 57 | if not hasattr(now_learner, 'id'): 58 | now_learner.id = self.get_stage0_id(now_learner) 59 | 60 | dump_file = util.get_cache_file(now_learner.id, 61 | cv_index, 62 | suffix='pkl') 63 | if self.save_stage0 and self._is_saved(now_learner, cv_index): 64 | print('Prediction cache exists: skip fitting.') 65 | now_learner = joblib.load(dump_file) 66 | self.all_learner[all_learner_key][-1] = now_learner 67 | else: 68 | self._out_to_console('Fold [{0}]'.format(i), 0) 69 | now_learner.fit(xs_now_train, y_now_train) 70 | if self.save_stage0: 71 | joblib.dump(now_learner, dump_file, compress=True) 72 | 73 | if blend_train_j is None: 74 | blend_train_j = self._get_blend_init(y_train, now_learner) 75 | 
blend_train_j[cv_index] = self._get_child_predict(now_learner, xs_cv, cv_index) 76 | blend_train = numpy_c_concatenate(blend_train, blend_train_j) 77 | return blend_train, blend_test 78 | 79 | def fit(self, xs_train, y_train): 80 | """Build a stacked classfier from the training set (xs_train, y_train). 81 | 82 | Parameters 83 | ---------- 84 | xs_train : array-like or sparse matrix of shape = [n_samples, n_features] 85 | The training input samples. 86 | 87 | y_train : array-like, shape = [n_samples] 88 | The target values (class labels in classification). 89 | 90 | Returns 91 | ------- 92 | self : object 93 | Returns self. 94 | """ 95 | self.n_classes_ = np.unique(y_train).shape[0] 96 | 97 | # Ready for cross validation 98 | skf = self._make_kfold(xs_train, y_train) 99 | self._out_to_console('xs_train.shape = {0}'.format(xs_train.shape), 1) 100 | 101 | #fit stage0 models. 102 | blend_train, _ = self._fit_child(skf, xs_train, y_train) 103 | blend_train = self._pre_propcess(blend_train, xs_train) 104 | 105 | #calc out of bugs score 106 | if self.oob_score_flag: 107 | self.calc_oob_score(blend_train, y_train, skf) 108 | 109 | # blending 110 | self._out_to_csv('blend_train', blend_train, 2) 111 | self._out_to_csv('y_train', y_train, 2) 112 | self.bclf.fit(blend_train, y_train) 113 | 114 | self._out_to_console('xs_train.shape = {0}'.format(xs_train.shape), 1) 115 | self._out_to_console('blend_train.shape = {0}'.format(blend_train.shape), 1) 116 | 117 | return self 118 | 119 | def _is_saved(self, model, index): 120 | model_id = self.get_stage0_id(model) 121 | return os.path.isfile(util.get_cache_file(model_id, index)) 122 | 123 | def _make_blend_test(self, xs_test, index=None): 124 | """Make blend sample for test. 125 | 126 | Parameters 127 | ---------- 128 | xs_test : array-like or sparse matrix of shape = [n_samples, n_features] 129 | The input samples. 130 | 131 | Returns 132 | ------- 133 | blend_test : array of shape = [n_samples, n_stage0_models]. 
134 | Calc as the mean of the predictions of the cross validation set. 135 | """ 136 | blend_test = None 137 | for clfs in self.all_learner.values(): 138 | blend_test_j = None 139 | for clf in clfs: 140 | blend_test_j_temp = self._get_child_predict(clf, xs_test, index) 141 | if blend_test_j is None: 142 | blend_test_j = blend_test_j_temp 143 | else: 144 | blend_test_j += blend_test_j_temp 145 | blend_test_j = blend_test_j / len(clfs) #convert to mean 146 | blend_test = numpy_c_concatenate(blend_test, blend_test_j) 147 | return blend_test 148 | 149 | def _get_child_predict(self, clf, X, index=None): 150 | if self.stack_by_proba and hasattr(clf, 'predict_proba'): 151 | if self.save_stage0 and index is not None: 152 | proba = util.saving_predict_proba(clf, X, index) 153 | else: 154 | proba = clf.predict_proba(X) 155 | return proba[:, 1:] 156 | elif hasattr(clf, 'predict'): 157 | predict_result = clf.predict(X) 158 | if isinstance(clf, ClassifierMixin): 159 | lb = LabelBinarizer() 160 | lb.fit(predict_result) 161 | return lb.fit_transform(predict_result) 162 | else: 163 | return predict_result.reshape((predict_result.size, 1)) 164 | else: 165 | return clf.fit_transform(X) 166 | 167 | def _get_blend_init(self, y_train, clf): 168 | if self.stack_by_proba and hasattr(clf, 'predict_proba'): 169 | width = self.n_classes_ - 1 170 | elif hasattr(clf, 'predict') and isinstance(clf, ClassifierMixin): 171 | width = self.n_classes_ 172 | elif hasattr(clf, 'predict'): 173 | width = 1 174 | elif hasattr(clf, 'n_components'): 175 | width = clf.n_components 176 | else: 177 | raise Exception('Unimplemented for {0}'.format(type(clf))) 178 | return np.zeros((y_train.size, width)) 179 | 180 | 181 | def _out_to_console(self, message, limit_verbose): 182 | if self.verbose > limit_verbose: 183 | print(message) 184 | 185 | def _out_to_csv(self, file_name, data, limit_verbose): 186 | """write_out numpy array to csv""" 187 | import os 188 | file_name = 'data/{0}.csv'.format(file_name) 189 | if 
self.verbose > limit_verbose: 190 | while True: 191 | if os.path.isfile(file_name): 192 | file_name = file_name.replace('.csv', '_.csv') 193 | else: 194 | break 195 | np.savetxt(file_name, data, delimiter=",") 196 | 197 | def _pre_propcess(self, blend, X): 198 | return numpy_c_concatenate(blend, X) 199 | 200 | def get_stage0_id(self, model): 201 | return self.save_dir + util.get_model_id(model) 202 | 203 | def calc_oob_score(self, blend_train, y_train, skf): 204 | """Compute out-of-bag score""" 205 | if self.oob_metrics.__name__ == 'log_loss': 206 | y_predict = np.zeros((y_train.size, self.n_classes_)) 207 | else: 208 | y_predict = np.zeros(y_train.shape) 209 | for train_index, cv_index in skf: 210 | self.bclf.fit(blend_train[train_index], y_train[train_index]) 211 | if self.oob_metrics.__name__ == 'log_loss': 212 | y_predict[cv_index] = self.bclf.predict_proba(blend_train[cv_index]) 213 | else: 214 | y_predict[cv_index] = self.bclf.predict(blend_train[cv_index]) 215 | self.oob_score_ = self.oob_metrics(y_train, y_predict) 216 | self._out_to_console('oob_score: {0}'.format(self.oob_score_), 0) 217 | 218 | 219 | class StackedClassifier(BaseStacked, ClassifierMixin): 220 | """A stacking classifier. 221 | 222 | Parameters 223 | ---------- 224 | bclf : stage1 model for stacking. 225 | 226 | clfs : list of stage0 machine learning models. 227 | 228 | n_folds : integer 229 | Number of folds at stage0 blending. 230 | 231 | Kfold: scikit-learn KFold like 232 | If Any Kfold is assigned, it will be used in blending. 233 | 234 | stack_by_proba : boolean 235 | If True and stage0 machine learning model has 'predict_proba', 236 | result of it is used in blending. 237 | If not, result of 'predict' is used in blending. 238 | 239 | oob_score_flag : boolean 240 | If True, stacked clssfier calc out-of-bugs score after fitting. 241 | You can evaluate model by this score (with out CV). 242 | 243 | oob_metrics : metrics for evaluation oob. 
244 | 245 | verbose : int, optional (default=0) 246 | Controls the verbosity of the tree building process. 247 | 248 | .. [1] L. Breiman, "Stacked Regressions", Machine Learning, 24, 49-64 (1996). 249 | """ 250 | def __init__(self, 251 | bclf, 252 | clfs, 253 | n_folds=3, 254 | stack_by_proba=True, 255 | oob_score_flag=False, 256 | oob_metrics=accuracy_score, 257 | Kfold=None, 258 | verbose=0, 259 | save_stage0=False, 260 | save_dir=''): 261 | self.n_folds = n_folds 262 | self.clfs = clfs 263 | self.bclf = bclf 264 | self.stack_by_proba = stack_by_proba 265 | self.all_learner = OrderedDict() 266 | self.oob_score_flag = oob_score_flag 267 | self.oob_metrics = oob_metrics 268 | self.verbose = verbose 269 | self.MyKfold = Kfold 270 | self.save_stage0 = save_stage0 271 | self.save_dir = save_dir 272 | for clf in clfs: 273 | if not hasattr(clf, 'id'): 274 | clf.id = self.save_dir + util.get_model_id(clf) 275 | 276 | def predict_proba(self, xs_test, index=None): 277 | """Predict class probabilities for X. 278 | 279 | The predicted class probabilities of an input sample is computed. 280 | 281 | Parameters 282 | ---------- 283 | X : array-like or sparse matrix of shape = [n_samples, n_features] 284 | The input samples. 285 | 286 | Returns 287 | ------- 288 | p : array of shape = [n_samples, n_classes]. 289 | The class probabilities of the input samples. 290 | """ 291 | blend_test = self._make_blend_test(xs_test, index) 292 | blend_test = self._pre_propcess(blend_test, xs_test) 293 | return self.bclf.predict_proba(blend_test) 294 | 295 | def _make_kfold(self, X, Y): 296 | if self.MyKfold is not None: 297 | return self.MyKfold 298 | else: 299 | return list(StratifiedKFold(self.n_folds).split(X, Y)) 300 | 301 | def predict(self, X, index=None): 302 | """Predict class for X. 303 | 304 | The predicted class of an input sample is a vote by the StackedClassifier. 
305 | 306 | Parameters 307 | ---------- 308 | X : array-like or sparse matrix of shape = [n_samples, n_features] 309 | The input samples. Internally, it will be converted to 310 | ``dtype=np.float32`` and if a sparse matrix is provided 311 | to a sparse ``csr_matrix``. 312 | 313 | Returns 314 | ------- 315 | y : array of shape = [n_samples] 316 | The predicted classes. 317 | """ 318 | proba = self.predict_proba(X, index) 319 | return np.argmax(proba, axis=1) 320 | 321 | 322 | class StackedRegressor(BaseStacked, RegressorMixin): 323 | def __init__(self, 324 | bclf, 325 | clfs, 326 | n_folds=3, 327 | oob_score_flag=False, 328 | oob_metrics=mean_squared_error, 329 | Kfold=None, 330 | verbose=0, 331 | save_stage0=False, 332 | save_dir=''): 333 | self.n_folds = n_folds 334 | self.clfs = clfs 335 | self.bclf = bclf 336 | self.all_learner = OrderedDict() 337 | self.oob_score_flag = oob_score_flag 338 | self.oob_metrics = oob_metrics 339 | self.verbose = verbose 340 | self.stack_by_proba = False 341 | self.save_stage0 = save_stage0 342 | self.save_dir = save_dir 343 | self.MyKfold = Kfold 344 | 345 | def predict(self, X, index=None): 346 | """ 347 | The predicted value of an input sample is a vote by the StackedRegressor. 348 | 349 | Parameters 350 | ---------- 351 | X : array-like or sparse matrix of shape = [n_samples, n_features] 352 | The input samples. Internally, it will be converted to 353 | ``dtype=np.float32`` and if a sparse matrix is provided 354 | to a sparse ``csr_matrix``. 355 | 356 | Returns 357 | ------- 358 | y : array of shape = [n_samples] 359 | The predicted values. 
360 | """ 361 | blend_test = self._make_blend_test(X, index) 362 | blend_test = self._pre_propcess(blend_test, X) 363 | return self.bclf.predict(blend_test) 364 | 365 | def _make_kfold(self, X, Y): 366 | if self.MyKfold is not None: 367 | return self.MyKfold 368 | else: 369 | return list(KFold(self.n_folds).split(X, Y)) 370 | 371 | def _get_blend_init(self, y_train, clf): 372 | if hasattr(clf, 'predict'): 373 | width = 1 374 | elif hasattr(clf, 'n_components'): 375 | width = clf.n_components 376 | return np.zeros((y_train.size, width)) 377 | 378 | def _get_child_predict(self, clf, X, index=None): 379 | if hasattr(clf, 'predict'): 380 | if self.save_stage0 and index is not None: 381 | predict_result = util.saving_predict(clf, X, index) 382 | else: 383 | predict_result = clf.predict(X) 384 | return predict_result.reshape(predict_result.size, 1) 385 | else: 386 | return clf.fit_transform(X) 387 | 388 | 389 | class FWLSClassifier(StackedClassifier): 390 | """ 391 | Feature Weighted Linear Stacking Classfier. 392 | References 393 | ---------- 394 | 395 | .. [1] J. Sill1 et al, "Feature Weighted Linear Stacking", https://arxiv.org/abs/0911.0460, 2009. 396 | """ 397 | def __init__(self, 398 | bclf, 399 | clfs, 400 | feature_func, 401 | n_folds=3, 402 | stack_by_proba=True, 403 | oob_score_flag=False, 404 | oob_metrics=accuracy_score, 405 | Kfold=None, 406 | verbose=0, 407 | save_stage0=False, 408 | save_dir=''): 409 | super(FWLSClassifier, self).__init__(bclf, 410 | clfs, 411 | n_folds, 412 | stack_by_proba, 413 | oob_score_flag, 414 | oob_metrics, 415 | Kfold, 416 | verbose, 417 | save_stage0, 418 | save_dir) 419 | self.feature_func = feature_func 420 | 421 | def _pre_propcess(self, blend, X): 422 | X = multiple_feature_weight(blend, self.feature_func(X)) 423 | return X 424 | 425 | class FWLSRegressor(StackedRegressor): 426 | """ 427 | Feature Weighted Linear Stacking Regressor. 428 | References 429 | ---------- 430 | 431 | .. [1] J. 
Sill1 et al, "Feature Weighted Linear Stacking", https://arxiv.org/abs/0911.0460, 2009. 432 | """ 433 | def __init__(self, 434 | bclf, 435 | clfs, 436 | feature_func, 437 | n_folds=3, 438 | oob_score_flag=False, 439 | oob_metrics=mean_squared_error, 440 | Kfold=None, 441 | verbose=0, 442 | save_stage0=False, 443 | save_dir=''): 444 | super(FWLSRegressor, self).__init__(bclf, 445 | clfs, 446 | n_folds, 447 | oob_score_flag, 448 | oob_metrics, 449 | Kfold, 450 | verbose, 451 | save_stage0, 452 | save_dir) 453 | 454 | self.feature_func = feature_func 455 | 456 | def _pre_propcess(self, blend, X): 457 | X = multiple_feature_weight(blend, self.feature_func(X)) 458 | return X 459 | -------------------------------------------------------------------------------- /stacked_generalization/lib/test/test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | 4 | from sklearn import datasets 5 | from sklearn.utils.validation import check_random_state 6 | from stacked_generalization.lib.stacking import StackedClassifier, FWLSClassifier 7 | from stacked_generalization.lib.stacking import StackedRegressor, FWLSRegressor 8 | from stacked_generalization.lib.joblibed import JoblibedClassifier, JoblibedRegressor 9 | from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor 10 | from sklearn.ensemble import ExtraTreesClassifier 11 | from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor 12 | from sklearn.linear_model import LogisticRegression 13 | from sklearn.linear_model import RidgeClassifier 14 | from sklearn.linear_model import Ridge 15 | from sklearn.linear_model import LinearRegression 16 | from sklearn.metrics import mean_squared_error, log_loss, accuracy_score 17 | from sklearn.utils.testing import assert_less 18 | import numpy as np 19 | from stacked_generalization.lib.util import numpy_c_concatenate 20 | from stacked_generalization.lib.util import 
saving_predict_proba 21 | from stacked_generalization.lib.util import get_model_id 22 | from stacked_generalization.lib.util import multiple_feature_weight 23 | from sklearn.model_selection import StratifiedKFold 24 | from numpy.testing import assert_allclose 25 | import glob 26 | 27 | 28 | class TestStackedClassfier(unittest.TestCase): 29 | def setUp(self): 30 | iris = datasets.load_iris() 31 | rng = check_random_state(0) 32 | perm = rng.permutation(iris.target.size) 33 | iris.data = iris.data[perm] 34 | iris.target = iris.target[perm] 35 | self.iris = iris 36 | 37 | def test_stacked_classfier_extkfold(self): 38 | bclf = LogisticRegression(random_state=1) 39 | clfs = [RandomForestClassifier(n_estimators=40, criterion = 'gini', random_state=1), 40 | RidgeClassifier(random_state=1), 41 | ] 42 | sl = StackedClassifier(bclf, 43 | clfs, 44 | n_folds=3, 45 | verbose=0, 46 | Kfold=list(StratifiedKFold(3).split(self.iris.data, self.iris.target)), 47 | stack_by_proba=False, 48 | oob_score_flag=True, 49 | oob_metrics=log_loss) 50 | sl.fit(self.iris.data, self.iris.target) 51 | score = sl.score(self.iris.data, self.iris.target) 52 | self.assertGreater(score, 0.9, "Failed with score = {0}".format(score)) 53 | 54 | def test_stacked_classfier(self): 55 | bclf = LogisticRegression(random_state=1) 56 | clfs = [RandomForestClassifier(n_estimators=40, criterion = 'gini', random_state=1), 57 | ExtraTreesClassifier(n_estimators=30, criterion = 'gini', random_state=3), 58 | GradientBoostingClassifier(n_estimators=25, random_state=1), 59 | RidgeClassifier(random_state=1), 60 | ] 61 | 62 | for n_folds, stack_by_proba in self.iter_for_stack_param(): 63 | sl = StackedClassifier(bclf, 64 | clfs, 65 | n_folds=n_folds, 66 | verbose=0, 67 | stack_by_proba=stack_by_proba, 68 | oob_score_flag=True) 69 | sl.fit(self.iris.data, self.iris.target) 70 | score = sl.score(self.iris.data, self.iris.target) 71 | self.assertGreater(score, 0.8, "Failed with score = {0}".format(score)) 72 | 
self.assertGreater(score, 0.8, "Failed with score = {0}".format(sl.oob_score_)) 73 | print('oob_score: {0} @n_folds={1}, stack_by_proba={2}' 74 | .format(sl.oob_score_, sl.n_folds, sl.stack_by_proba)) 75 | 76 | for csv_file in glob.glob("*.csv"): 77 | os.remove(csv_file) 78 | for csv_file in glob.glob("*.pkl"): 79 | os.remove(csv_file) 80 | 81 | sl = StackedClassifier(bclf, 82 | clfs, 83 | oob_score_flag=True, 84 | save_stage0=True) 85 | sl.fit(self.iris.data, self.iris.target) 86 | sl.score(self.iris.data, self.iris.target) 87 | self.assertGreater(score, 0.8, "Failed with score = {0}".format(score)) 88 | sl.fit(self.iris.data, self.iris.target) 89 | sl.score(self.iris.data, self.iris.target) 90 | self.assertGreater(score, 0.8, "Failed with score = {0}".format(score)) 91 | 92 | self.assertTrue(glob.glob('ExtraTreesClassifier_*.csv')) 93 | for csv_file in glob.glob("*.csv"): 94 | os.remove(csv_file) 95 | for csv_file in glob.glob("*.pkl"): 96 | os.remove(csv_file) 97 | 98 | def iter_for_stack_param(self): 99 | yield 2, True 100 | yield 4, True 101 | yield 2, False 102 | yield 3, False 103 | 104 | def test_stacked_regressor(self): 105 | bclf = LinearRegression() 106 | clfs = [RandomForestRegressor(n_estimators=50, random_state=1), 107 | GradientBoostingRegressor(n_estimators=25, random_state=1), 108 | Ridge(random_state=1)] 109 | 110 | # Friedman1 111 | X, y = datasets.make_friedman1(n_samples=1200, 112 | random_state=1, 113 | noise=1.0) 114 | X_train, y_train = X[:200], y[:200] 115 | X_test, y_test = X[200:], y[200:] 116 | 117 | sr = StackedRegressor(bclf, 118 | clfs, 119 | n_folds=3, 120 | verbose=0, 121 | oob_score_flag=True) 122 | sr.fit(X_train, y_train) 123 | mse = mean_squared_error(y_test, sr.predict(X_test)) 124 | assert_less(mse, 6.0) 125 | 126 | def test_concatenate(self): 127 | A = None 128 | B = np.array([[1,2],[3,4]]) 129 | np.testing.assert_equal(numpy_c_concatenate(A, B), B) 130 | A = np.array([[0], [1]]) 131 | 
np.testing.assert_equal(numpy_c_concatenate(A, B), [[0,1,2], [1,3,4]]) 132 | 133 | def test_save_prediction(self): 134 | model = RandomForestClassifier() 135 | model.id = get_model_id(model) 136 | model.fit(self.iris.data, self.iris.target) 137 | indexes = np.fromfunction(lambda x: x, (self.iris.data.shape[0], ), dtype=np.int32) 138 | saving_predict_proba(model, self.iris.data, indexes) 139 | any_file_removed = False 140 | for filename in os.listdir('.'): 141 | if filename.startswith('RandomForestClassifier'): 142 | os.remove(filename) 143 | any_file_removed = True 144 | self.assertTrue(any_file_removed) 145 | 146 | 147 | def test_fwls_classfier(self): 148 | feature_func = lambda x: np.ones(x.shape) 149 | bclf = LogisticRegression(random_state=1) 150 | clfs = [RandomForestClassifier(n_estimators=40, criterion = 'gini', random_state=1), 151 | RidgeClassifier(random_state=1), 152 | ] 153 | sl = FWLSClassifier(bclf, 154 | clfs, 155 | feature_func=feature_func, 156 | n_folds=3, 157 | verbose=0, 158 | Kfold=list(StratifiedKFold(3).split(self.iris.data, self.iris.target)), 159 | stack_by_proba=False) 160 | sl.fit(self.iris.data, self.iris.target) 161 | score = sl.score(self.iris.data, self.iris.target) 162 | self.assertGreater(score, 0.9, "Failed with score = {0}".format(score)) 163 | 164 | def test_fwls_regressor(self): 165 | feature_func = lambda x: np.ones(x.shape) 166 | bclf = LinearRegression() 167 | clfs = [RandomForestRegressor(n_estimators=50, random_state=1), 168 | GradientBoostingRegressor(n_estimators=25, random_state=1), 169 | Ridge(random_state=1)] 170 | 171 | # Friedman1 172 | X, y = datasets.make_friedman1(n_samples=1200, 173 | random_state=1, 174 | noise=1.0) 175 | X_train, y_train = X[:200], y[:200] 176 | X_test, y_test = X[200:], y[200:] 177 | 178 | sr = FWLSRegressor(bclf, 179 | clfs, 180 | feature_func, 181 | n_folds=3, 182 | verbose=0, 183 | oob_score_flag=True) 184 | sr.fit(X_train, y_train) 185 | mse = mean_squared_error(y_test, sr.predict(X_test)) 
186 | assert_less(mse, 6.0) 187 | 188 | def test_multiple_feature_weight(self): 189 | A = np.array([[1,2],[3,4],[5,6]]) 190 | B = np.array([[1],[1],[1]]) 191 | C = multiple_feature_weight(A, B) 192 | np.testing.assert_equal(C, A) 193 | B = np.array([[2],[2],[2]]) 194 | C = multiple_feature_weight(A, B) 195 | np.testing.assert_equal(C, np.array([[2,4],[6,8],[10,12]])) 196 | B = np.array([[1,2],[2,1],[1,2]]) 197 | C = multiple_feature_weight(A, B) 198 | np.testing.assert_equal(C, np.array([[ 1, 2, 2, 4], 199 | [ 6, 3, 8, 4], 200 | [ 5, 10, 6, 12]])) 201 | 202 | class TestJoblibedClassfier(unittest.TestCase): 203 | def setUp(self): 204 | iris = datasets.load_iris() 205 | rng = check_random_state(0) 206 | iris.data = iris.data 207 | iris.target = iris.target 208 | self.iris = iris 209 | for csv_file in glob.glob("*.csv"): 210 | os.remove(csv_file) 211 | 212 | def test_classifier(self): 213 | index = [i for i in range(len(self.iris.data))] 214 | 215 | rf = RandomForestClassifier() 216 | jrf = JoblibedClassifier(rf, "rf", cache_dir='') 217 | jrf.fit(self.iris.data, self.iris.target, index) 218 | prediction = jrf.predict(self.iris.data, index) 219 | score = accuracy_score(self.iris.target, prediction) 220 | self.assertGreater(score, 0.9, "Failed with score = {0}".format(score)) 221 | 222 | rf = RandomForestClassifier(n_estimators=20) 223 | jrf = JoblibedClassifier(rf, "rf", cache_dir='') 224 | jrf.fit(self.iris.data, self.iris.target) 225 | index = [i for i in range(len(self.iris.data))] 226 | prediction2 = jrf.predict(self.iris.data, index) 227 | self.assertTrue((prediction == prediction2).all()) 228 | 229 | def test_regressor(self): 230 | X, y = datasets.make_friedman1(n_samples=1200, 231 | random_state=1, 232 | noise=1.0) 233 | X_train, y_train = X[:200], y[:200] 234 | index = [i for i in range(200)] 235 | 236 | rf = RandomForestRegressor() 237 | jrf = JoblibedRegressor(rf, "rfr", cache_dir='') 238 | jrf.fit(X_train, y_train, index) 239 | prediction = 
jrf.predict(X_train, index) 240 | mse = mean_squared_error(y_train, prediction) 241 | assert_less(mse, 6.0) 242 | 243 | rf = RandomForestRegressor(n_estimators=20) 244 | jrf = JoblibedRegressor(rf, "rfr", cache_dir='') 245 | jrf.fit(X_train, y_train, index) 246 | prediction2 = jrf.predict(X_train, index) 247 | assert_allclose(prediction, prediction2) 248 | 249 | 250 | if __name__ == '__main__': 251 | unittest.main() 252 | -------------------------------------------------------------------------------- /stacked_generalization/lib/util.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import StratifiedKFold 2 | import numpy as np 3 | from sklearn.externals import joblib 4 | import pandas as pd 5 | 6 | def multiple_feature_weight(blend, X): 7 | result = None 8 | for a_vec in blend.T: 9 | for b_vec in X.T: 10 | if result is None: 11 | result = a_vec * b_vec 12 | else: 13 | result = np.c_[result, a_vec * b_vec] 14 | return result 15 | 16 | def numpy_c_concatenate(A, B): 17 | if A is None: 18 | return B 19 | else: 20 | return np.c_[A, B] 21 | 22 | def saving_predict_proba(model, X, index, cache_dir=''): 23 | try: 24 | csv_file = get_cache_file(model.id, index, cache_dir) 25 | df = pd.read_csv(csv_file) 26 | proba = df.values[:, 1:] 27 | print("**** prediction is loaded from {0} ****".format(csv_file)) 28 | except IOError: 29 | proba = model.predict_proba(X) 30 | df = pd.DataFrame({'index': index}) 31 | for i in range(proba.shape[1]): 32 | df["prediction" + str(i)] = proba[:, i] 33 | #print(df) 34 | df.to_csv(csv_file, index=False) 35 | return proba 36 | 37 | def saving_predict(model, X, index, cache_dir=''): 38 | csv_file = get_cache_file(model.id, index,cache_dir) 39 | try: 40 | df = pd.read_csv(csv_file) 41 | prediction = df.values[:, 1:] 42 | prediction = prediction.reshape([prediction.size,]) 43 | print("**** prediction is loaded from {0} ****".format(csv_file)) 44 | except IOError: 45 | prediction = 
model.predict(X) 46 | df = pd.DataFrame({'index': index}) 47 | prediction.reshape([prediction.shape[-1],]) 48 | df["prediction"] = prediction 49 | #print(df) 50 | df.to_csv(csv_file, index=False) 51 | return prediction 52 | 53 | def get_model_id(model): 54 | model_type = str(type(model)) 55 | model_type = model_type[model_type.rfind(".")+1: model_type.rfind("'")] 56 | param_dict = model.get_params() 57 | ignore_list = ('n_jobs', 'oob_score', 'verbose', 'warm_start') 58 | new_param_dict = {} 59 | for key, value in sorted(param_dict.items(), key=lambda x: x[0]): 60 | i = 0 61 | if key in ignore_list: 62 | continue 63 | while True: 64 | new_key = key[0] + str(i) 65 | if not new_key in new_param_dict: 66 | new_param_dict[new_key] = value 67 | break 68 | i += 1 69 | model_type += str(new_param_dict) 70 | replace_dict = {'{': '_', 71 | '}': '', 72 | "'": "", 73 | '.': 'p', 74 | ',': '__', 75 | ':': '_', 76 | ' ': '', 77 | 'True': '1', 78 | 'False': '0', 79 | 'None': 'N', 80 | '=': '_', 81 | '(': '_', 82 | ')': '_', 83 | '\n': '_'} 84 | for key, value in replace_dict.items(): 85 | model_type = model_type.replace(key, value) 86 | if len(model_type) > 150: 87 | model_type = model_type[:150] 88 | return model_type 89 | 90 | def get_cache_file(model_id, index, cache_dir='', suffix='csv'): 91 | # Identify index trick. 92 | # If sum of first 20 index, recognize as the same index. 
93 | if index is None: 94 | raise IOError 95 | if len(index) < 20: 96 | sum_index = sum(index) 97 | else: 98 | sum_index = sum(index[:20]) 99 | return "{0}{1}_{2}.{3}".format(cache_dir, 100 | model_id, 101 | sum_index, 102 | suffix) 103 | 104 | ##def saving_fit(learner, X, y, index): 105 | ## import os 106 | ## pkl_file = "{0}_{1}_{2}.pkl".format(learner.id, min(index), max(index)) 107 | ## try: 108 | ## learner = joblib.load(pkl_file) 109 | ## print("**** learner is loaded from {0} ****".format(pkl_file)) 110 | ## except IOError: 111 | ## learner.fit(X, y) 112 | ## joblib.dump(learner, pkl_file) 113 | ## return learner 114 | 115 | if __name__ == '__main__': 116 | temp = {'index': [0, 1], 'value': [2, 3]} 117 | df = pd.DataFrame(temp) 118 | print(df) 119 | df.to_csv('dum.csv', index=False) 120 | --------------------------------------------------------------------------------