├── .gitignore ├── .travis.yml ├── LICENSE ├── Readme.rst ├── setup.py └── stacked_generalization ├── __init__.py ├── example ├── cross_validation_for_iris.py ├── fwls_regression.py ├── joblibed_classification.py ├── kaggle_titanic.py └── simple_regression.py └── lib ├── __init__.py ├── joblibed.py ├── stacking.py ├── test └── test.py └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | *.pyc 3 | build/ 4 | dist/ 5 | stacked_generalization.egg-info/ 6 | *.pkl 7 | pip_release.bat 8 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - 2.7 4 | - 3.5 5 | # Setup anaconda 6 | before_install: 7 | - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh 8 | - chmod +x miniconda.sh 9 | - ./miniconda.sh -b 10 | - export PATH=/home/travis/miniconda2/bin:$PATH 11 | - conda update --yes conda 12 | # The next couple lines fix a crash with multiprocessing on Travis and are not specific to using Miniconda 13 | - sudo rm -rf /dev/shm 14 | - sudo ln -s /run/shm /dev/shm 15 | # Install packages 16 | install: 17 | # - conda install --yes python=$TRAVIS_PYTHON_VERSION atlas numpy scipy nose scikit-learn pandas 18 | - conda install --yes atlas numpy scipy nose scikit-learn pandas 19 | - python setup.py install 20 | script: 21 | - cd stacked_generalization/lib/test 22 | - python test.py 23 | - cd ../../example 24 | - python cross_validation_for_iris.py 25 | - python simple_regression.py 26 | - python joblibed_classification.py 27 | - python fwls_regression.py 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | 
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Readme.rst: -------------------------------------------------------------------------------- 1 | |Build Status| 2 | 3 | stacked\_generalization 4 | ======================= 5 | 6 | Implemented the machine learning ***stacking technique[1]*** as a handy library 7 | in Python. Feature weighted linear stacking is also available. (See 8 | https://github.com/fukatani/stacked\_generalization/tree/master/stacked\_generalization/example) 9 | 10 | Including a simple model cache system: Joblibed classifier and Joblibed 11 | Regressor. 12 | 13 | Feature 14 | ------- 15 | 16 | 1) Any scikit-learn model is available for Stage 0 and Stage 1 model. 17 | ''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''' 18 | 19 | And stacked model itself has the same interface as scikit-learn library. 20 | '''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''' 21 | 22 | You can replace a model such as *RandomForestClassifier* with a *stacked 23 | model* easily in your scripts. And multi stage stacking is also easy. 24 | 25 | ex. 26 | 27 | .. 
code:: python 28 | 29 | from stacked_generalization.lib.stacking import StackedClassifier 30 | from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier 31 | from sklearn.linear_model import LogisticRegression, RidgeClassifier 32 | from sklearn import datasets, metrics 33 | iris = datasets.load_iris() 34 | 35 | # Stage 1 model 36 | bclf = LogisticRegression(random_state=1) 37 | 38 | # Stage 0 models 39 | clfs = [RandomForestClassifier(n_estimators=40, criterion = 'gini', random_state=1), 40 | GradientBoostingClassifier(n_estimators=25, random_state=1), 41 | RidgeClassifier(random_state=1)] 42 | 43 | # same interface as scikit-learn 44 | sl = StackedClassifier(bclf, clfs) 45 | sl.fit(iris.data, iris.target) 46 | score = metrics.accuracy_score(iris.target, sl.predict(iris.data)) 47 | print("Accuracy: %f" % score) 48 | 49 | More detailed examples are here. 50 | https://github.com/fukatani/stacked\_generalization/blob/master/stacked\_generalization/example/cross\_validation\_for\_iris.py 51 | 52 | https://github.com/fukatani/stacked\_generalization/blob/master/stacked\_generalization/example/simple\_regression.py 53 | 54 | 2) Model evaluation by out-of-bag score. 55 | ''''''''''''''''''''''''''''''''''''''''' 56 | 57 | The stacking technique itself uses CV for stage 0. So if you use CV for the entire 58 | stacked model, ***each stage 0 model is fitted n\_folds squared 59 | times.*** Sometimes its computational cost can be significant, therefore 60 | we implemented CV only for stage 1[2]. 61 | 62 | For example, when we get 3 blends (stage0 prediction), 2 blends are used 63 | for stage 1 fitting. The remaining blend is used for the model test. 64 | Repeating this cycle for all 3 blends, and averaging scores, we can get 65 | the oob (out-of-bag) score ***with only n\_fold times stage0 fitting.*** 66 | 67 | ex. 68 | 69 | .. 
code:: python 70 | 71 | sl = StackedClassifier(bclf, clfs, oob_score_flag=True) 72 | sl.fit(iris.data, iris.target) 73 | print("Accuracy: %f" % sl.oob_score_) 74 | 75 | 3) Caching stage1 blend\_data and trained model. (optional) 76 | ''''''''''''''''''''''''''''''''''''''''''''''''''''''''''' 77 | 78 | If the cache exists, recalculation for stage 0 will be skipped. This 79 | function is useful for stage 1 tuning. 80 | 81 | .. code:: python 82 | 83 | sl = StackedClassifier(bclf, clfs, save_stage0=True, save_dir='stack_temp') 84 | 85 | Feature of Joblibed Classifier / Regressor 86 | ------------------------------------------ 87 | 88 | Joblibed Classifier / Regressor is a simple cache system for scikit-learn 89 | machine learning models. You can use it easily with minimum code 90 | modification. 91 | 92 | At the first fitting and prediction, model calculation is performed 93 | normally. At the same time, the model fitting result and prediction result 94 | are saved as *.pkl* and *.csv* respectively. 95 | 96 | **At the second fitting and prediction, if the cache exists, the model and 97 | prediction results will be loaded from the cache and nothing is recalculated.** 98 | 99 | e.g. 100 | 101 | .. code:: python 102 | 103 | from sklearn import datasets 104 | from sklearn.model_selection import StratifiedKFold 105 | from sklearn.ensemble import RandomForestClassifier 106 | from stacked_generalization.lib.joblibed import JoblibedClassifier 107 | 108 | # Load iris 109 | iris = datasets.load_iris() 110 | 111 | # Declaration of Joblibed model 112 | rf = RandomForestClassifier(n_estimators=40) 113 | clf = JoblibedClassifier(rf, "rf") 114 | 115 | train_idx, test_idx = list(StratifiedKFold(3).split(iris.data, iris.target))[0] 116 | 117 | xs_train = iris.data[train_idx] 118 | y_train = iris.target[train_idx] 119 | xs_test = iris.data[test_idx] 120 | y_test = iris.target[test_idx] 121 | 122 | # Need to indicate sample indices for discriminating cache existence. 
123 | clf.fit(xs_train, y_train, train_idx) 124 | score = clf.score(xs_test, y_test, test_idx) 125 | 126 | See also 127 | https://github.com/fukatani/stacked\_generalization/blob/master/stacked\_generalization/lib/joblibed.py 128 | 129 | Software Requirement 130 | -------------------- 131 | 132 | - Python (2.7 or 3.5 or later) 133 | - numpy 134 | - scikit-learn 135 | - pandas 136 | 137 | Installation 138 | ------------ 139 | 140 | :: 141 | 142 | pip install stacked_generalization 143 | 144 | License 145 | ------- 146 | 147 | Apache License 2.0. (http://www.apache.org/licenses/LICENSE-2.0) 148 | 149 | Copyright 150 | --------- 151 | 152 | Copyright (C) 2016, Ryosuke Fukatani 153 | 154 | Many parts of the implementation of stacking are based on the following. 155 | Thanks! 156 | https://github.com/log0/vertebral/blob/master/stacked\_generalization.py 157 | 158 | Other 159 | ----- 160 | 161 | Any contributions (implementation, documentation, tests or ideas...) are 162 | welcome. 163 | 164 | References 165 | ---------- 166 | 167 | [1] L. Breiman, "Stacked Regressions", Machine Learning, 24, 49-64 168 | (1996). [2] J. Sill et al., "Feature Weighted Linear Stacking", 169 | https://arxiv.org/abs/0911.0460, 2009. 170 | 171 | .. 
|Build Status| image:: https://travis-ci.org/fukatani/stacked_generalization.svg?branch=master 172 | :target: https://travis-ci.org/fukatani/stacked_generalization 173 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | from setuptools import setup, find_packages 4 | 5 | version = '0.0.6' 6 | 7 | install_requires = [ 8 | 'numpy', 9 | 'scikit-learn', 10 | 'pandas', 11 | ] 12 | 13 | CURRENT_DIR = os.path.abspath(os.path.dirname(__file__)) 14 | 15 | def read(filename): 16 | return io.open(os.path.join(CURRENT_DIR, filename), encoding='utf-8').read() 17 | 18 | setup(name='stacked_generalization', 19 | version=version, 20 | description='Machine Learning Stacking Util', 21 | keywords = 'Stacking, Machine Learning', 22 | author='Ryosuke Fukatani', 23 | author_email='nannyakannya@gmail.com', 24 | url='https://github.com/fukatani/stacked_generalization', 25 | license="Apache License 2.0", 26 | packages=find_packages(), 27 | package_data={ 'stacked_generalization' : ['Readme.md'], }, 28 | long_description='Readme.rst', 29 | install_requires=install_requires, 30 | ) 31 | 32 | -------------------------------------------------------------------------------- /stacked_generalization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fukatani/stacked_generalization/bd1e7aa7f090e6902cfbd389f9cd6500d7389954/stacked_generalization/__init__.py -------------------------------------------------------------------------------- /stacked_generalization/example/cross_validation_for_iris.py: -------------------------------------------------------------------------------- 1 | from sklearn import datasets 2 | from sklearn.utils.validation import check_random_state 3 | from stacked_generalization.lib.stacking import StackedClassifier 4 | from sklearn.ensemble import 
RandomForestClassifier 5 | from sklearn.ensemble import ExtraTreesClassifier 6 | from sklearn.ensemble import GradientBoostingClassifier 7 | from sklearn.neighbors import KNeighborsClassifier 8 | from sklearn.linear_model import LogisticRegression, RidgeClassifier 9 | from sklearn.linear_model import Ridge 10 | from sklearn.model_selection import StratifiedKFold 11 | from sklearn.manifold import TSNE 12 | 13 | iris = datasets.load_iris() 14 | rng = check_random_state(0) 15 | perm = rng.permutation(iris.target.size) 16 | iris.data = iris.data[perm] 17 | iris.target = iris.target[perm] 18 | 19 | # Stage 1 model 20 | bclf = LogisticRegression(random_state=1) 21 | 22 | # Stage 0 models 23 | clfs = [RandomForestClassifier(n_estimators=40, criterion = 'gini', random_state=1), 24 | ExtraTreesClassifier(n_estimators=30, criterion = 'gini', random_state=3), 25 | GradientBoostingClassifier(n_estimators=25, random_state=1), 26 | GradientBoostingClassifier(n_estimators=30, random_state=2), 27 | #GradientBoostingClassifier(n_estimators=30, random_state=3), 28 | KNeighborsClassifier(), 29 | RidgeClassifier(random_state=1), 30 | Ridge(), 31 | TSNE(n_components=2) 32 | ] 33 | 34 | sc = StackedClassifier(bclf, 35 | clfs, 36 | n_folds=3, 37 | verbose=0, 38 | stack_by_proba=True, 39 | oob_score_flag=True, 40 | ) 41 | 42 | gb = GradientBoostingClassifier(n_estimators=25, random_state=1) 43 | 44 | # cross validation 45 | sc_score = 0 46 | gb_score = 0 47 | n_folds = 3 48 | for train_idx, test_idx in StratifiedKFold(n_folds).split(iris.data, iris.target): 49 | xs_train = iris.data[train_idx] 50 | y_train = iris.target[train_idx] 51 | xs_test = iris.data[test_idx] 52 | y_test = iris.target[test_idx] 53 | 54 | sc.fit(xs_train, y_train) 55 | print('oob_score: {0}'.format(sc.oob_score_)) 56 | sc_score += sc.score(xs_test, y_test) 57 | gb.fit(xs_train, y_train) 58 | gb_score += gb.score(xs_test, y_test) 59 | 60 | sc_score /= n_folds 61 | print('Stacked Classfier score: {0}'.format(sc_score)) 
62 | gb_score /= n_folds 63 | print('Gradient Boosting Classfier score: {0}'.format(gb_score)) 64 | -------------------------------------------------------------------------------- /stacked_generalization/example/fwls_regression.py: -------------------------------------------------------------------------------- 1 | from sklearn import datasets, metrics, preprocessing 2 | from stacked_generalization.lib.stacking import FWLSRegressor 3 | from sklearn.ensemble import RandomForestRegressor 4 | from sklearn.ensemble import GradientBoostingRegressor 5 | from sklearn.ensemble import ExtraTreesRegressor 6 | from sklearn.linear_model import LinearRegression, Ridge 7 | import numpy as np 8 | 9 | 10 | boston = datasets.load_boston() 11 | X = preprocessing.StandardScaler().fit_transform(boston.data) 12 | Y = boston.target 13 | 14 | X_train = X[:200] 15 | Y_train = Y[:200] 16 | X_test = X[200:] 17 | Y_test = Y[200:] 18 | 19 | breg = LinearRegression() 20 | regs = [RandomForestRegressor(n_estimators=50, random_state=1), 21 | GradientBoostingRegressor(n_estimators=25, random_state=1), 22 | Ridge(), 23 | ExtraTreesRegressor(n_estimators=50), 24 | ] 25 | feature_func = lambda x: np.c_[np.ones((x.shape[0], 1)), 26 | x[:, 1].reshape((x.shape[0], 1)), 27 | x[:, 6].reshape((x.shape[0], 1)),] 28 | 29 | sr = FWLSRegressor(breg, 30 | regs, 31 | feature_func, 32 | n_folds=3, 33 | verbose=0, 34 | oob_score_flag=False) 35 | 36 | sr.fit(X_train, Y_train) 37 | score = metrics.mean_squared_error(sr.predict(X_test), Y_test) 38 | print ("MSE of stacked regressor: %f" % score) 39 | -------------------------------------------------------------------------------- /stacked_generalization/example/joblibed_classification.py: -------------------------------------------------------------------------------- 1 | from sklearn import datasets 2 | from sklearn.model_selection import StratifiedKFold 3 | from sklearn.ensemble import RandomForestClassifier 4 | from sklearn.utils.validation import 
check_random_state 5 | from stacked_generalization.lib.joblibed import JoblibedClassifier 6 | 7 | 8 | iris = datasets.load_iris() 9 | rng = check_random_state(0) 10 | perm = rng.permutation(iris.target.size) 11 | iris.data = iris.data[perm] 12 | iris.target = iris.target[perm] 13 | 14 | # Joblibed model 15 | rf = RandomForestClassifier(n_estimators=40, 16 | criterion='gini', 17 | random_state=1) 18 | clf = JoblibedClassifier(rf, "rf") 19 | 20 | 21 | train_idx, test_idx = list(StratifiedKFold(3).split(iris.data, iris.target))[0] 22 | 23 | xs_train = iris.data[train_idx] 24 | y_train = iris.target[train_idx] 25 | xs_test = iris.data[test_idx] 26 | y_test = iris.target[test_idx] 27 | 28 | 29 | print("First fit and prediction (not cached).") 30 | clf.fit(xs_train, y_train, train_idx) 31 | score = clf.score(xs_test, y_test, test_idx) 32 | print('Classfier score: {0}'.format(score)) 33 | 34 | print("Second fit and prediction (load cache).") 35 | clf.fit(xs_train, y_train, train_idx) 36 | score = clf.score(xs_test, y_test, test_idx) 37 | print('Classfier score: {0}'.format(score)) 38 | -------------------------------------------------------------------------------- /stacked_generalization/example/kaggle_titanic.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import StratifiedKFold 2 | from sklearn import preprocessing 3 | from sklearn.linear_model import LogisticRegression, Ridge 4 | from sklearn.preprocessing import LabelBinarizer 5 | from stacked_generalization.lib.stacking import StackedClassifier 6 | from sklearn.ensemble import RandomForestClassifier 7 | from sklearn.ensemble import ExtraTreesClassifier 8 | from sklearn.ensemble import GradientBoostingClassifier 9 | from sklearn.neighbors import KNeighborsClassifier 10 | 11 | import pandas as pd 12 | import numpy as np 13 | import re 14 | 15 | class DataReader(object): 16 | def __init__(self, file_name): 17 | self.file_name = file_name 18 | 19 | 
def disp_hist(self, data, label, bins): 20 | temp = [i[label].dropna() for i in data] 21 | plt.hist(temp, histtype='barstacked', bins=bins) 22 | plt.show() 23 | 24 | def pre_process(self, drop=True, title_to_onehot=True, norm_fare=True): 25 | def get_title(name): 26 | title_search = re.search(' ([A-Za-z]+)\.', name) 27 | if title_search: 28 | return title_search.group(1) 29 | return "" 30 | 31 | def normalize_fare(data): 32 | new_data = None 33 | for embarked in (0, 1, 2): 34 | temp = data[data.Embarked == embarked] 35 | temp['Fare'] /= temp['Fare'].values.mean() 36 | if new_data is None: 37 | new_data = temp 38 | else: 39 | new_data = pd.concat([new_data, temp]) 40 | new_data = new_data.sort('PassengerId') 41 | return new_data 42 | 43 | data = pd.read_csv(self.file_name).replace('male',0).replace('female',1) 44 | data['Age'].fillna(data.Age.median(), inplace=True) 45 | data['Fare'].fillna(data.Fare.median(), inplace=True) 46 | data['FamilySize'] = data['SibSp'] + data['Parch'] + 1 47 | data['Embarked'] = data['Embarked'].replace('S',0).replace('C',1).replace('Q',2) 48 | data['Embarked'].fillna(0, inplace=True) 49 | if norm_fare: 50 | data = normalize_fare(data) 51 | 52 | # Get all the titles and print how often each one occurs. 53 | titles = data["Name"].apply(get_title) 54 | print(pd.value_counts(titles)) 55 | 56 | # Map each title to an integer. Some titles are very rare, and are compressed into the same codes as other titles. 57 | title_mapping = {"Dona": 1, "Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2} 58 | for k,v in title_mapping.items(): 59 | titles[titles == k] = v 60 | 61 | # Add in the title column. 
62 | data['Title'] = titles 63 | data['Title'].fillna(1, inplace=True) 64 | #data['Pos'] = data["Title"] + data['Pclass'] 65 | if drop: 66 | #data = data.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Pclass', 'Cabin', 'Embarked'], axis=1) 67 | data = data.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked'], axis=1) 68 | #data = data.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked', 'Pclass', 'Title'], axis=1) 69 | print(data.keys()) 70 | if title_to_onehot: 71 | self.encode(data, 'Title', [i for i in range(1, 11)]) 72 | data = data.drop(['Title'], axis=1) 73 | return data 74 | 75 | def encode(self, data, label, value_set=None): 76 | le =LabelBinarizer() 77 | if value_set is None: 78 | encoded = le.fit_transform(data[label]) 79 | else: 80 | le.fit(value_set) 81 | encoded = le.transform(data[label]) 82 | for i in range(encoded.shape[1]): 83 | new_label = '{0}_is_{1}'.format(label, i) 84 | data[new_label] = encoded[:,i] 85 | 86 | def split_by_label(self, data, label='Survived'): 87 | split_data = [] 88 | for element in set(data[label]): 89 | split_data.append(data[data[label]==element]) 90 | return split_data 91 | 92 | def get_sample(self, N=600, scale=False): 93 | all_data = self.pre_process(self.file_name) 94 | #print('data_type: ' + str(all_data.dtypes)) 95 | all_data = all_data.values 96 | xs = all_data[:, 2:] 97 | y = all_data[:, 1] 98 | if scale: 99 | xs = preprocessing.scale(xs) 100 | if N != -1: 101 | perm = np.random.permutation(xs.shape[0]) 102 | xs = xs[perm] 103 | y = y[perm] 104 | xs_train, xs_test = np.split(xs, [N]) 105 | y_train, y_test = np.split(y, [N]) 106 | return xs_train, xs_test, y_train, y_test 107 | else: 108 | return xs, y 109 | 110 | def summarize_about_same_ticket(self, data): 111 | data = data.drop(['PassengerId', 'Name', 'SibSp', 'Parch', 'Cabin', 'FamilySize'], axis=1) 112 | for num in data[data.Age <= 5.0]['Ticket']: 113 | print('num:' + num) 114 | print(data[data.Ticket == num]) 115 | 116 | 117 | class 
TestDataReader(DataReader): 118 | def get_sample(self, N=-1): 119 | all_data = self.pre_process(self.file_name) 120 | all_data = all_data.values 121 | xs = all_data[:, 1:] 122 | pid = all_data[:, 0] 123 | return pid, xs 124 | 125 | def write_result(pid, output, suffix=''): 126 | import csv 127 | import datetime 128 | suffix += datetime.datetime.today().strftime("%Y-%m-%d-%H-%M-%S") 129 | with open("predict_result_data_{0}.csv".format(suffix), "w") as f: 130 | writer = csv.writer(f, lineterminator='\n') 131 | writer.writerow(["PassengerId", "Survived"]) 132 | for pid, survived in zip(pid.astype(int), output.astype(int)): 133 | writer.writerow([pid, survived]) 134 | 135 | if __name__ == '__main__': 136 | import os 137 | if not os.path.isfile('train.csv'): 138 | raise Exception('This example is data analysis for Kaggle Titanic Competition.' + 139 | 'For trying this example, you should download "train.csv" from https://www.kaggle.com/c/titanic.') 140 | 141 | train = True 142 | full_cv = True 143 | test = False 144 | 145 | train_dr = DataReader('train.csv') 146 | bclf = LogisticRegression(random_state=1) 147 | clfs = [ 148 | RandomForestClassifier(n_estimators=50, criterion = 'gini', random_state=1), 149 | ExtraTreesClassifier(n_estimators=50, criterion = 'gini', random_state=1), 150 | ExtraTreesClassifier(n_estimators=50, criterion = 'gini', random_state=2), 151 | GradientBoostingClassifier(n_estimators=25, random_state=1), 152 | GradientBoostingClassifier(n_estimators=40, random_state=1), 153 | Ridge(random_state=1), 154 | #KNeighborsClassifier(n_neighbors=4) 155 | #LogisticRegression(random_state=1) 156 | ] 157 | sl = StackedClassifier(bclf, clfs, n_folds=3, verbose=2) 158 | #fsl = FWSLClassifier(bclf, clfs, feature=xs_train[:, 0]) 159 | if train:# evalute by hold-out and out-of-bugs 160 | sl = StackedClassifier(bclf, clfs, n_folds=3, verbose=2, oob_score_flag=True) 161 | xs_train, xs_test, y_train, y_test = train_dr.get_sample() 162 | sl.fit(xs_train, y_train) 163 | 
score = sl.score(xs_test, y_test) 164 | print('score: {0}'.format(score)) 165 | print('oob_score: {0}'.format(sl.oob_score_)) 166 | if full_cv: #cross validation 167 | sl = StackedClassifier(bclf, clfs, oob_score_flag=False,verbose=2) 168 | xs_train, y_train = train_dr.get_sample(-1) 169 | score = [] 170 | for train_index, test_index in StratifiedKFold(3).split(xs_train, y_train): 171 | sl.fit(xs_train[train_index], y_train[train_index]) 172 | score.append(sl.score(xs_train[test_index], y_train[test_index])) 173 | print('full-cv score: {0}'.format(score)) 174 | if test: #to make pb leader board data. 175 | xs_train, y_train = train_dr.get_sample(-1) 176 | sl.fit(xs_train, y_train) 177 | test_dr = TestDataReader('test.csv') 178 | pid, xs_test = test_dr.get_sample(-1) 179 | output = sl.predict(xs_test) 180 | write_result(pid, output, sl.tostr()) -------------------------------------------------------------------------------- /stacked_generalization/example/simple_regression.py: -------------------------------------------------------------------------------- 1 | from sklearn import datasets, metrics, preprocessing 2 | from stacked_generalization.lib.stacking import StackedRegressor 3 | from sklearn.ensemble import RandomForestRegressor 4 | from sklearn.ensemble import GradientBoostingRegressor 5 | from sklearn.ensemble import ExtraTreesRegressor 6 | from sklearn.linear_model import LinearRegression, Ridge 7 | from sklearn.manifold import TSNE 8 | 9 | 10 | boston = datasets.load_boston() 11 | X = preprocessing.StandardScaler().fit_transform(boston.data) 12 | Y = boston.target 13 | 14 | X_train = X[:200] 15 | Y_train = Y[:200] 16 | X_test = X[200:] 17 | Y_test = Y[200:] 18 | 19 | breg = LinearRegression() 20 | regs = [RandomForestRegressor(n_estimators=50, random_state=1), 21 | GradientBoostingRegressor(n_estimators=25, random_state=1), 22 | GradientBoostingRegressor(n_estimators=30, random_state=2), 23 | Ridge(), 24 | ExtraTreesRegressor(n_estimators=50), 25 | 
TSNE(n_components=2) 26 | ] 27 | 28 | sr = StackedRegressor(breg, 29 | regs, 30 | n_folds=3, 31 | verbose=0, 32 | oob_score_flag=False) 33 | sr.fit(X_train, Y_train) 34 | score = metrics.mean_squared_error(sr.predict(X_test), Y_test) 35 | print ("MSE of stacked regressor: %f" % score) 36 | #print ("OOB of stacked regressor: %f" % sr.oob_score_) 37 | 38 | gb = GradientBoostingRegressor(n_estimators=25, random_state=1) 39 | gb.fit(X_train, Y_train) 40 | score = metrics.mean_squared_error(gb.predict(X_test), Y_test) 41 | print ("MSE of gradient boosting regressor: %f" % score) -------------------------------------------------------------------------------- /stacked_generalization/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fukatani/stacked_generalization/bd1e7aa7f090e6902cfbd389f9cd6500d7389954/stacked_generalization/lib/__init__.py -------------------------------------------------------------------------------- /stacked_generalization/lib/joblibed.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.base import BaseEstimator, ClassifierMixin, clone, RegressorMixin 3 | from sklearn.externals import joblib 4 | import stacked_generalization.lib.util as util 5 | import os 6 | 7 | 8 | class BaseJoblibed(BaseEstimator): 9 | """Base class for joblibed learner. 10 | 11 | Warning: This class should not be used directly. Use derived classes 12 | instead. 
13 | """ 14 | def __init__(self, 15 | estimator, 16 | prefix, 17 | skip_refit=True, 18 | cache_dir='temp/'): 19 | self.estimator = estimator 20 | self.prefix = prefix 21 | self.estimator.id = 'j' + prefix 22 | self.skip_refit = skip_refit 23 | self.cache_dir = cache_dir 24 | if self.cache_dir and not os.path.isdir(self.cache_dir): 25 | os.mkdir(self.cache_dir) 26 | 27 | def fit(self, xs_train, y_train, index=None): 28 | dump_file = "" 29 | if index is not None: 30 | dump_file = util.get_cache_file(self.estimator.id, 31 | index, 32 | cache_dir=self.cache_dir, 33 | suffix='pkl') 34 | if self.skip_refit and os.path.isfile(dump_file): 35 | if index is not None: 36 | self.estimator = joblib.load(dump_file) 37 | else: 38 | self.estimator.fit(xs_train, y_train) 39 | if index is not None: 40 | joblib.dump(self.estimator, dump_file, compress=True) 41 | return self 42 | 43 | 44 | class JoblibedClassifier(BaseJoblibed, ClassifierMixin): 45 | """A joblibed classifier. 46 | 47 | Parameters 48 | ---------- 49 | estimator : cache target model. 50 | prefix : file prefix. 51 | 52 | """ 53 | def predict_proba(self, xs_test, index=None): 54 | """Predict class probabilities for X. 55 | 56 | The predicted class probabilities of an input sample is computed. 57 | 58 | Parameters 59 | ---------- 60 | X : array-like or sparse matrix of shape = [n_samples, n_features] 61 | The input samples. 62 | 63 | Returns 64 | ------- 65 | p : array of shape = [n_samples, n_classes]. 66 | The class probabilities of the input samples. 67 | """ 68 | return util.saving_predict_proba(self.estimator, 69 | xs_test, 70 | index, 71 | self.cache_dir) 72 | 73 | def predict(self, X, index=None): 74 | """Predict class for X. 75 | 76 | The predicted class of an input sample is a vote by the JoblibedClassifier. 77 | 78 | Parameters 79 | ---------- 80 | X : array-like or sparse matrix of shape = [n_samples, n_features] 81 | The input samples. 
Internally, it will be converted to 82 | ``dtype=np.float32`` and if a sparse matrix is provided 83 | to a sparse ``csr_matrix``. 84 | 85 | Returns 86 | ------- 87 | y : array of shape = [n_samples] 88 | The predicted classes. 89 | """ 90 | proba = self.predict_proba(X, index) 91 | return np.argmax(proba, axis=1) 92 | 93 | def score(self, X, y, index=None, sample_weight=None): 94 | from sklearn.metrics import accuracy_score 95 | return accuracy_score(y, 96 | self.predict(X, index), 97 | sample_weight=sample_weight) 98 | 99 | 100 | class JoblibedRegressor(BaseJoblibed, RegressorMixin): 101 | """A joblibed regressor. 102 | 103 | Parameters 104 | ---------- 105 | estimator : cache target model. 106 | prefix : file prefix. 107 | 108 | """ 109 | def predict(self, xs_test, index=None): 110 | return util.saving_predict(self.estimator, 111 | xs_test, 112 | index, 113 | self.cache_dir) 114 | -------------------------------------------------------------------------------- /stacked_generalization/lib/stacking.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.model_selection import StratifiedKFold, KFold 3 | from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, clone 4 | from stacked_generalization.lib.util import numpy_c_concatenate 5 | from stacked_generalization.lib.util import multiple_feature_weight 6 | from sklearn.metrics import accuracy_score 7 | from sklearn.metrics import mean_squared_error 8 | from collections import OrderedDict 9 | from sklearn.preprocessing import LabelBinarizer 10 | from sklearn.externals import joblib 11 | from stacked_generalization.lib import util 12 | import os 13 | 14 | 15 | class BaseStacked(BaseEstimator): 16 | """Base class for joblibed learner. 17 | 18 | Warning: This class should not be used directly. Use derived classes 19 | instead. 
20 | """ 21 | def _fit_child(self, skf, xs_train, y_train): 22 | """Build stage0 models from the training set (xs_train, y_train). 23 | 24 | Parameters 25 | ---------- 26 | skf: StratifiedKFold-like iterator 27 | Use for cross validation blending. 28 | 29 | xs_train : array-like or sparse matrix of shape = [n_samples, n_features] 30 | The training input samples. 31 | 32 | y_train : array-like, shape = [n_samples] 33 | The target values (class labels in classification). 34 | 35 | Returns 36 | ------- 37 | blend_train : array-like, shape = [n_samples] 38 | For stage1 model training. 39 | blend_test : array-like, shape = [n_samples] 40 | If you use TwoStageKFold, blended sample for test will be prepared. 41 | """ 42 | blend_train = None 43 | blend_test = None 44 | for j, clf in enumerate(self.clfs): 45 | self._out_to_console('Training classifier [{0}]'.format(j), 0) 46 | all_learner_key = str(type(clf)) + str(j) 47 | self.all_learner[all_learner_key] = [] 48 | blend_train_j = None 49 | for i, (train_index, cv_index) in enumerate(skf): 50 | now_learner = clone(clf) 51 | self.all_learner[all_learner_key].append(now_learner) 52 | xs_now_train = xs_train[train_index] 53 | y_now_train = y_train[train_index] 54 | xs_cv = xs_train[cv_index] 55 | #y_cv = y_train[cv_index] no use 56 | 57 | if not hasattr(now_learner, 'id'): 58 | now_learner.id = self.get_stage0_id(now_learner) 59 | 60 | dump_file = util.get_cache_file(now_learner.id, 61 | cv_index, 62 | suffix='pkl') 63 | if self.save_stage0 and self._is_saved(now_learner, cv_index): 64 | print('Prediction cache exists: skip fitting.') 65 | now_learner = joblib.load(dump_file) 66 | self.all_learner[all_learner_key][-1] = now_learner 67 | else: 68 | self._out_to_console('Fold [{0}]'.format(i), 0) 69 | now_learner.fit(xs_now_train, y_now_train) 70 | if self.save_stage0: 71 | joblib.dump(now_learner, dump_file, compress=True) 72 | 73 | if blend_train_j is None: 74 | blend_train_j = self._get_blend_init(y_train, now_learner) 75 | 
blend_train_j[cv_index] = self._get_child_predict(now_learner, xs_cv, cv_index) 76 | blend_train = numpy_c_concatenate(blend_train, blend_train_j) 77 | return blend_train, blend_test 78 | 79 | def fit(self, xs_train, y_train): 80 | """Build a stacked classfier from the training set (xs_train, y_train). 81 | 82 | Parameters 83 | ---------- 84 | xs_train : array-like or sparse matrix of shape = [n_samples, n_features] 85 | The training input samples. 86 | 87 | y_train : array-like, shape = [n_samples] 88 | The target values (class labels in classification). 89 | 90 | Returns 91 | ------- 92 | self : object 93 | Returns self. 94 | """ 95 | self.n_classes_ = np.unique(y_train).shape[0] 96 | 97 | # Ready for cross validation 98 | skf = self._make_kfold(xs_train, y_train) 99 | self._out_to_console('xs_train.shape = {0}'.format(xs_train.shape), 1) 100 | 101 | #fit stage0 models. 102 | blend_train, _ = self._fit_child(skf, xs_train, y_train) 103 | blend_train = self._pre_propcess(blend_train, xs_train) 104 | 105 | #calc out of bugs score 106 | if self.oob_score_flag: 107 | self.calc_oob_score(blend_train, y_train, skf) 108 | 109 | # blending 110 | self._out_to_csv('blend_train', blend_train, 2) 111 | self._out_to_csv('y_train', y_train, 2) 112 | self.bclf.fit(blend_train, y_train) 113 | 114 | self._out_to_console('xs_train.shape = {0}'.format(xs_train.shape), 1) 115 | self._out_to_console('blend_train.shape = {0}'.format(blend_train.shape), 1) 116 | 117 | return self 118 | 119 | def _is_saved(self, model, index): 120 | model_id = self.get_stage0_id(model) 121 | return os.path.isfile(util.get_cache_file(model_id, index)) 122 | 123 | def _make_blend_test(self, xs_test, index=None): 124 | """Make blend sample for test. 125 | 126 | Parameters 127 | ---------- 128 | xs_test : array-like or sparse matrix of shape = [n_samples, n_features] 129 | The input samples. 130 | 131 | Returns 132 | ------- 133 | blend_test : array of shape = [n_samples, n_stage0_models]. 
134 | Calc as the mean of the predictions of the cross validation set. 135 | """ 136 | blend_test = None 137 | for clfs in self.all_learner.values(): 138 | blend_test_j = None 139 | for clf in clfs: 140 | blend_test_j_temp = self._get_child_predict(clf, xs_test, index) 141 | if blend_test_j is None: 142 | blend_test_j = blend_test_j_temp 143 | else: 144 | blend_test_j += blend_test_j_temp 145 | blend_test_j = blend_test_j / len(clfs) #convert to mean 146 | blend_test = numpy_c_concatenate(blend_test, blend_test_j) 147 | return blend_test 148 | 149 | def _get_child_predict(self, clf, X, index=None): 150 | if self.stack_by_proba and hasattr(clf, 'predict_proba'): 151 | if self.save_stage0 and index is not None: 152 | proba = util.saving_predict_proba(clf, X, index) 153 | else: 154 | proba = clf.predict_proba(X) 155 | return proba[:, 1:] 156 | elif hasattr(clf, 'predict'): 157 | predict_result = clf.predict(X) 158 | if isinstance(clf, ClassifierMixin): 159 | lb = LabelBinarizer() 160 | lb.fit(predict_result) 161 | return lb.fit_transform(predict_result) 162 | else: 163 | return predict_result.reshape((predict_result.size, 1)) 164 | else: 165 | return clf.fit_transform(X) 166 | 167 | def _get_blend_init(self, y_train, clf): 168 | if self.stack_by_proba and hasattr(clf, 'predict_proba'): 169 | width = self.n_classes_ - 1 170 | elif hasattr(clf, 'predict') and isinstance(clf, ClassifierMixin): 171 | width = self.n_classes_ 172 | elif hasattr(clf, 'predict'): 173 | width = 1 174 | elif hasattr(clf, 'n_components'): 175 | width = clf.n_components 176 | else: 177 | raise Exception('Unimplemented for {0}'.format(type(clf))) 178 | return np.zeros((y_train.size, width)) 179 | 180 | 181 | def _out_to_console(self, message, limit_verbose): 182 | if self.verbose > limit_verbose: 183 | print(message) 184 | 185 | def _out_to_csv(self, file_name, data, limit_verbose): 186 | """write_out numpy array to csv""" 187 | import os 188 | file_name = 'data/{0}.csv'.format(file_name) 189 | if 
self.verbose > limit_verbose: 190 | while True: 191 | if os.path.isfile(file_name): 192 | file_name = file_name.replace('.csv', '_.csv') 193 | else: 194 | break 195 | np.savetxt(file_name, data, delimiter=",") 196 | 197 | def _pre_propcess(self, blend, X): 198 | return numpy_c_concatenate(blend, X) 199 | 200 | def get_stage0_id(self, model): 201 | return self.save_dir + util.get_model_id(model) 202 | 203 | def calc_oob_score(self, blend_train, y_train, skf): 204 | """Compute out-of-bag score""" 205 | if self.oob_metrics.__name__ == 'log_loss': 206 | y_predict = np.zeros((y_train.size, self.n_classes_)) 207 | else: 208 | y_predict = np.zeros(y_train.shape) 209 | for train_index, cv_index in skf: 210 | self.bclf.fit(blend_train[train_index], y_train[train_index]) 211 | if self.oob_metrics.__name__ == 'log_loss': 212 | y_predict[cv_index] = self.bclf.predict_proba(blend_train[cv_index]) 213 | else: 214 | y_predict[cv_index] = self.bclf.predict(blend_train[cv_index]) 215 | self.oob_score_ = self.oob_metrics(y_train, y_predict) 216 | self._out_to_console('oob_score: {0}'.format(self.oob_score_), 0) 217 | 218 | 219 | class StackedClassifier(BaseStacked, ClassifierMixin): 220 | """A stacking classifier. 221 | 222 | Parameters 223 | ---------- 224 | bclf : stage1 model for stacking. 225 | 226 | clfs : list of stage0 machine learning models. 227 | 228 | n_folds : integer 229 | Number of folds at stage0 blending. 230 | 231 | Kfold: scikit-learn KFold like 232 | If Any Kfold is assigned, it will be used in blending. 233 | 234 | stack_by_proba : boolean 235 | If True and stage0 machine learning model has 'predict_proba', 236 | result of it is used in blending. 237 | If not, result of 'predict' is used in blending. 238 | 239 | oob_score_flag : boolean 240 | If True, stacked clssfier calc out-of-bugs score after fitting. 241 | You can evaluate model by this score (with out CV). 242 | 243 | oob_metrics : metrics for evaluation oob. 
244 | 245 | verbose : int, optional (default=0) 246 | Controls the verbosity of the tree building process. 247 | 248 | .. [1] L. Breiman, "Stacked Regressions", Machine Learning, 24, 49-64 (1996). 249 | """ 250 | def __init__(self, 251 | bclf, 252 | clfs, 253 | n_folds=3, 254 | stack_by_proba=True, 255 | oob_score_flag=False, 256 | oob_metrics=accuracy_score, 257 | Kfold=None, 258 | verbose=0, 259 | save_stage0=False, 260 | save_dir=''): 261 | self.n_folds = n_folds 262 | self.clfs = clfs 263 | self.bclf = bclf 264 | self.stack_by_proba = stack_by_proba 265 | self.all_learner = OrderedDict() 266 | self.oob_score_flag = oob_score_flag 267 | self.oob_metrics = oob_metrics 268 | self.verbose = verbose 269 | self.MyKfold = Kfold 270 | self.save_stage0 = save_stage0 271 | self.save_dir = save_dir 272 | for clf in clfs: 273 | if not hasattr(clf, 'id'): 274 | clf.id = self.save_dir + util.get_model_id(clf) 275 | 276 | def predict_proba(self, xs_test, index=None): 277 | """Predict class probabilities for X. 278 | 279 | The predicted class probabilities of an input sample is computed. 280 | 281 | Parameters 282 | ---------- 283 | X : array-like or sparse matrix of shape = [n_samples, n_features] 284 | The input samples. 285 | 286 | Returns 287 | ------- 288 | p : array of shape = [n_samples, n_classes]. 289 | The class probabilities of the input samples. 290 | """ 291 | blend_test = self._make_blend_test(xs_test, index) 292 | blend_test = self._pre_propcess(blend_test, xs_test) 293 | return self.bclf.predict_proba(blend_test) 294 | 295 | def _make_kfold(self, X, Y): 296 | if self.MyKfold is not None: 297 | return self.MyKfold 298 | else: 299 | return list(StratifiedKFold(self.n_folds).split(X, Y)) 300 | 301 | def predict(self, X, index=None): 302 | """Predict class for X. 303 | 304 | The predicted class of an input sample is a vote by the StackedClassifier. 
305 | 306 | Parameters 307 | ---------- 308 | X : array-like or sparse matrix of shape = [n_samples, n_features] 309 | The input samples. Internally, it will be converted to 310 | ``dtype=np.float32`` and if a sparse matrix is provided 311 | to a sparse ``csr_matrix``. 312 | 313 | Returns 314 | ------- 315 | y : array of shape = [n_samples] 316 | The predicted classes. 317 | """ 318 | proba = self.predict_proba(X, index) 319 | return np.argmax(proba, axis=1) 320 | 321 | 322 | class StackedRegressor(BaseStacked, RegressorMixin): 323 | def __init__(self, 324 | bclf, 325 | clfs, 326 | n_folds=3, 327 | oob_score_flag=False, 328 | oob_metrics=mean_squared_error, 329 | Kfold=None, 330 | verbose=0, 331 | save_stage0=False, 332 | save_dir=''): 333 | self.n_folds = n_folds 334 | self.clfs = clfs 335 | self.bclf = bclf 336 | self.all_learner = OrderedDict() 337 | self.oob_score_flag = oob_score_flag 338 | self.oob_metrics = oob_metrics 339 | self.verbose = verbose 340 | self.stack_by_proba = False 341 | self.save_stage0 = save_stage0 342 | self.save_dir = save_dir 343 | self.MyKfold = Kfold 344 | 345 | def predict(self, X, index=None): 346 | """ 347 | The predicted value of an input sample is a vote by the StackedRegressor. 348 | 349 | Parameters 350 | ---------- 351 | X : array-like or sparse matrix of shape = [n_samples, n_features] 352 | The input samples. Internally, it will be converted to 353 | ``dtype=np.float32`` and if a sparse matrix is provided 354 | to a sparse ``csr_matrix``. 355 | 356 | Returns 357 | ------- 358 | y : array of shape = [n_samples] 359 | The predicted values. 
360 | """ 361 | blend_test = self._make_blend_test(X, index) 362 | blend_test = self._pre_propcess(blend_test, X) 363 | return self.bclf.predict(blend_test) 364 | 365 | def _make_kfold(self, X, Y): 366 | if self.MyKfold is not None: 367 | return self.MyKfold 368 | else: 369 | return list(KFold(self.n_folds).split(X, Y)) 370 | 371 | def _get_blend_init(self, y_train, clf): 372 | if hasattr(clf, 'predict'): 373 | width = 1 374 | elif hasattr(clf, 'n_components'): 375 | width = clf.n_components 376 | return np.zeros((y_train.size, width)) 377 | 378 | def _get_child_predict(self, clf, X, index=None): 379 | if hasattr(clf, 'predict'): 380 | if self.save_stage0 and index is not None: 381 | predict_result = util.saving_predict(clf, X, index) 382 | else: 383 | predict_result = clf.predict(X) 384 | return predict_result.reshape(predict_result.size, 1) 385 | else: 386 | return clf.fit_transform(X) 387 | 388 | 389 | class FWLSClassifier(StackedClassifier): 390 | """ 391 | Feature Weighted Linear Stacking Classfier. 392 | References 393 | ---------- 394 | 395 | .. [1] J. Sill1 et al, "Feature Weighted Linear Stacking", https://arxiv.org/abs/0911.0460, 2009. 396 | """ 397 | def __init__(self, 398 | bclf, 399 | clfs, 400 | feature_func, 401 | n_folds=3, 402 | stack_by_proba=True, 403 | oob_score_flag=False, 404 | oob_metrics=accuracy_score, 405 | Kfold=None, 406 | verbose=0, 407 | save_stage0=False, 408 | save_dir=''): 409 | super(FWLSClassifier, self).__init__(bclf, 410 | clfs, 411 | n_folds, 412 | stack_by_proba, 413 | oob_score_flag, 414 | oob_metrics, 415 | Kfold, 416 | verbose, 417 | save_stage0, 418 | save_dir) 419 | self.feature_func = feature_func 420 | 421 | def _pre_propcess(self, blend, X): 422 | X = multiple_feature_weight(blend, self.feature_func(X)) 423 | return X 424 | 425 | class FWLSRegressor(StackedRegressor): 426 | """ 427 | Feature Weighted Linear Stacking Regressor. 428 | References 429 | ---------- 430 | 431 | .. [1] J. 
Sill1 et al, "Feature Weighted Linear Stacking", https://arxiv.org/abs/0911.0460, 2009. 432 | """ 433 | def __init__(self, 434 | bclf, 435 | clfs, 436 | feature_func, 437 | n_folds=3, 438 | oob_score_flag=False, 439 | oob_metrics=mean_squared_error, 440 | Kfold=None, 441 | verbose=0, 442 | save_stage0=False, 443 | save_dir=''): 444 | super(FWLSRegressor, self).__init__(bclf, 445 | clfs, 446 | n_folds, 447 | oob_score_flag, 448 | oob_metrics, 449 | Kfold, 450 | verbose, 451 | save_stage0, 452 | save_dir) 453 | 454 | self.feature_func = feature_func 455 | 456 | def _pre_propcess(self, blend, X): 457 | X = multiple_feature_weight(blend, self.feature_func(X)) 458 | return X 459 | -------------------------------------------------------------------------------- /stacked_generalization/lib/test/test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | 4 | from sklearn import datasets 5 | from sklearn.utils.validation import check_random_state 6 | from stacked_generalization.lib.stacking import StackedClassifier, FWLSClassifier 7 | from stacked_generalization.lib.stacking import StackedRegressor, FWLSRegressor 8 | from stacked_generalization.lib.joblibed import JoblibedClassifier, JoblibedRegressor 9 | from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor 10 | from sklearn.ensemble import ExtraTreesClassifier 11 | from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor 12 | from sklearn.linear_model import LogisticRegression 13 | from sklearn.linear_model import RidgeClassifier 14 | from sklearn.linear_model import Ridge 15 | from sklearn.linear_model import LinearRegression 16 | from sklearn.metrics import mean_squared_error, log_loss, accuracy_score 17 | from sklearn.utils.testing import assert_less 18 | import numpy as np 19 | from stacked_generalization.lib.util import numpy_c_concatenate 20 | from stacked_generalization.lib.util import 
saving_predict_proba 21 | from stacked_generalization.lib.util import get_model_id 22 | from stacked_generalization.lib.util import multiple_feature_weight 23 | from sklearn.model_selection import StratifiedKFold 24 | from numpy.testing import assert_allclose 25 | import glob 26 | 27 | 28 | class TestStackedClassfier(unittest.TestCase): 29 | def setUp(self): 30 | iris = datasets.load_iris() 31 | rng = check_random_state(0) 32 | perm = rng.permutation(iris.target.size) 33 | iris.data = iris.data[perm] 34 | iris.target = iris.target[perm] 35 | self.iris = iris 36 | 37 | def test_stacked_classfier_extkfold(self): 38 | bclf = LogisticRegression(random_state=1) 39 | clfs = [RandomForestClassifier(n_estimators=40, criterion = 'gini', random_state=1), 40 | RidgeClassifier(random_state=1), 41 | ] 42 | sl = StackedClassifier(bclf, 43 | clfs, 44 | n_folds=3, 45 | verbose=0, 46 | Kfold=list(StratifiedKFold(3).split(self.iris.data, self.iris.target)), 47 | stack_by_proba=False, 48 | oob_score_flag=True, 49 | oob_metrics=log_loss) 50 | sl.fit(self.iris.data, self.iris.target) 51 | score = sl.score(self.iris.data, self.iris.target) 52 | self.assertGreater(score, 0.9, "Failed with score = {0}".format(score)) 53 | 54 | def test_stacked_classfier(self): 55 | bclf = LogisticRegression(random_state=1) 56 | clfs = [RandomForestClassifier(n_estimators=40, criterion = 'gini', random_state=1), 57 | ExtraTreesClassifier(n_estimators=30, criterion = 'gini', random_state=3), 58 | GradientBoostingClassifier(n_estimators=25, random_state=1), 59 | RidgeClassifier(random_state=1), 60 | ] 61 | 62 | for n_folds, stack_by_proba in self.iter_for_stack_param(): 63 | sl = StackedClassifier(bclf, 64 | clfs, 65 | n_folds=n_folds, 66 | verbose=0, 67 | stack_by_proba=stack_by_proba, 68 | oob_score_flag=True) 69 | sl.fit(self.iris.data, self.iris.target) 70 | score = sl.score(self.iris.data, self.iris.target) 71 | self.assertGreater(score, 0.8, "Failed with score = {0}".format(score)) 72 | 
self.assertGreater(score, 0.8, "Failed with score = {0}".format(sl.oob_score_)) 73 | print('oob_score: {0} @n_folds={1}, stack_by_proba={2}' 74 | .format(sl.oob_score_, sl.n_folds, sl.stack_by_proba)) 75 | 76 | for csv_file in glob.glob("*.csv"): 77 | os.remove(csv_file) 78 | for csv_file in glob.glob("*.pkl"): 79 | os.remove(csv_file) 80 | 81 | sl = StackedClassifier(bclf, 82 | clfs, 83 | oob_score_flag=True, 84 | save_stage0=True) 85 | sl.fit(self.iris.data, self.iris.target) 86 | sl.score(self.iris.data, self.iris.target) 87 | self.assertGreater(score, 0.8, "Failed with score = {0}".format(score)) 88 | sl.fit(self.iris.data, self.iris.target) 89 | sl.score(self.iris.data, self.iris.target) 90 | self.assertGreater(score, 0.8, "Failed with score = {0}".format(score)) 91 | 92 | self.assertTrue(glob.glob('ExtraTreesClassifier_*.csv')) 93 | for csv_file in glob.glob("*.csv"): 94 | os.remove(csv_file) 95 | for csv_file in glob.glob("*.pkl"): 96 | os.remove(csv_file) 97 | 98 | def iter_for_stack_param(self): 99 | yield 2, True 100 | yield 4, True 101 | yield 2, False 102 | yield 3, False 103 | 104 | def test_stacked_regressor(self): 105 | bclf = LinearRegression() 106 | clfs = [RandomForestRegressor(n_estimators=50, random_state=1), 107 | GradientBoostingRegressor(n_estimators=25, random_state=1), 108 | Ridge(random_state=1)] 109 | 110 | # Friedman1 111 | X, y = datasets.make_friedman1(n_samples=1200, 112 | random_state=1, 113 | noise=1.0) 114 | X_train, y_train = X[:200], y[:200] 115 | X_test, y_test = X[200:], y[200:] 116 | 117 | sr = StackedRegressor(bclf, 118 | clfs, 119 | n_folds=3, 120 | verbose=0, 121 | oob_score_flag=True) 122 | sr.fit(X_train, y_train) 123 | mse = mean_squared_error(y_test, sr.predict(X_test)) 124 | assert_less(mse, 6.0) 125 | 126 | def test_concatenate(self): 127 | A = None 128 | B = np.array([[1,2],[3,4]]) 129 | np.testing.assert_equal(numpy_c_concatenate(A, B), B) 130 | A = np.array([[0], [1]]) 131 | 
np.testing.assert_equal(numpy_c_concatenate(A, B), [[0,1,2], [1,3,4]]) 132 | 133 | def test_save_prediction(self): 134 | model = RandomForestClassifier() 135 | model.id = get_model_id(model) 136 | model.fit(self.iris.data, self.iris.target) 137 | indexes = np.fromfunction(lambda x: x, (self.iris.data.shape[0], ), dtype=np.int32) 138 | saving_predict_proba(model, self.iris.data, indexes) 139 | any_file_removed = False 140 | for filename in os.listdir('.'): 141 | if filename.startswith('RandomForestClassifier'): 142 | os.remove(filename) 143 | any_file_removed = True 144 | self.assertTrue(any_file_removed) 145 | 146 | 147 | def test_fwls_classfier(self): 148 | feature_func = lambda x: np.ones(x.shape) 149 | bclf = LogisticRegression(random_state=1) 150 | clfs = [RandomForestClassifier(n_estimators=40, criterion = 'gini', random_state=1), 151 | RidgeClassifier(random_state=1), 152 | ] 153 | sl = FWLSClassifier(bclf, 154 | clfs, 155 | feature_func=feature_func, 156 | n_folds=3, 157 | verbose=0, 158 | Kfold=list(StratifiedKFold(3).split(self.iris.data, self.iris.target)), 159 | stack_by_proba=False) 160 | sl.fit(self.iris.data, self.iris.target) 161 | score = sl.score(self.iris.data, self.iris.target) 162 | self.assertGreater(score, 0.9, "Failed with score = {0}".format(score)) 163 | 164 | def test_fwls_regressor(self): 165 | feature_func = lambda x: np.ones(x.shape) 166 | bclf = LinearRegression() 167 | clfs = [RandomForestRegressor(n_estimators=50, random_state=1), 168 | GradientBoostingRegressor(n_estimators=25, random_state=1), 169 | Ridge(random_state=1)] 170 | 171 | # Friedman1 172 | X, y = datasets.make_friedman1(n_samples=1200, 173 | random_state=1, 174 | noise=1.0) 175 | X_train, y_train = X[:200], y[:200] 176 | X_test, y_test = X[200:], y[200:] 177 | 178 | sr = FWLSRegressor(bclf, 179 | clfs, 180 | feature_func, 181 | n_folds=3, 182 | verbose=0, 183 | oob_score_flag=True) 184 | sr.fit(X_train, y_train) 185 | mse = mean_squared_error(y_test, sr.predict(X_test)) 
186 | assert_less(mse, 6.0) 187 | 188 | def test_multiple_feature_weight(self): 189 | A = np.array([[1,2],[3,4],[5,6]]) 190 | B = np.array([[1],[1],[1]]) 191 | C = multiple_feature_weight(A, B) 192 | np.testing.assert_equal(C, A) 193 | B = np.array([[2],[2],[2]]) 194 | C = multiple_feature_weight(A, B) 195 | np.testing.assert_equal(C, np.array([[2,4],[6,8],[10,12]])) 196 | B = np.array([[1,2],[2,1],[1,2]]) 197 | C = multiple_feature_weight(A, B) 198 | np.testing.assert_equal(C, np.array([[ 1, 2, 2, 4], 199 | [ 6, 3, 8, 4], 200 | [ 5, 10, 6, 12]])) 201 | 202 | class TestJoblibedClassfier(unittest.TestCase): 203 | def setUp(self): 204 | iris = datasets.load_iris() 205 | rng = check_random_state(0) 206 | iris.data = iris.data 207 | iris.target = iris.target 208 | self.iris = iris 209 | for csv_file in glob.glob("*.csv"): 210 | os.remove(csv_file) 211 | 212 | def test_classifier(self): 213 | index = [i for i in range(len(self.iris.data))] 214 | 215 | rf = RandomForestClassifier() 216 | jrf = JoblibedClassifier(rf, "rf", cache_dir='') 217 | jrf.fit(self.iris.data, self.iris.target, index) 218 | prediction = jrf.predict(self.iris.data, index) 219 | score = accuracy_score(self.iris.target, prediction) 220 | self.assertGreater(score, 0.9, "Failed with score = {0}".format(score)) 221 | 222 | rf = RandomForestClassifier(n_estimators=20) 223 | jrf = JoblibedClassifier(rf, "rf", cache_dir='') 224 | jrf.fit(self.iris.data, self.iris.target) 225 | index = [i for i in range(len(self.iris.data))] 226 | prediction2 = jrf.predict(self.iris.data, index) 227 | self.assertTrue((prediction == prediction2).all()) 228 | 229 | def test_regressor(self): 230 | X, y = datasets.make_friedman1(n_samples=1200, 231 | random_state=1, 232 | noise=1.0) 233 | X_train, y_train = X[:200], y[:200] 234 | index = [i for i in range(200)] 235 | 236 | rf = RandomForestRegressor() 237 | jrf = JoblibedRegressor(rf, "rfr", cache_dir='') 238 | jrf.fit(X_train, y_train, index) 239 | prediction = 
jrf.predict(X_train, index) 240 | mse = mean_squared_error(y_train, prediction) 241 | assert_less(mse, 6.0) 242 | 243 | rf = RandomForestRegressor(n_estimators=20) 244 | jrf = JoblibedRegressor(rf, "rfr", cache_dir='') 245 | jrf.fit(X_train, y_train, index) 246 | prediction2 = jrf.predict(X_train, index) 247 | assert_allclose(prediction, prediction2) 248 | 249 | 250 | if __name__ == '__main__': 251 | unittest.main() 252 | -------------------------------------------------------------------------------- /stacked_generalization/lib/util.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import StratifiedKFold 2 | import numpy as np 3 | from sklearn.externals import joblib 4 | import pandas as pd 5 | 6 | def multiple_feature_weight(blend, X): 7 | result = None 8 | for a_vec in blend.T: 9 | for b_vec in X.T: 10 | if result is None: 11 | result = a_vec * b_vec 12 | else: 13 | result = np.c_[result, a_vec * b_vec] 14 | return result 15 | 16 | def numpy_c_concatenate(A, B): 17 | if A is None: 18 | return B 19 | else: 20 | return np.c_[A, B] 21 | 22 | def saving_predict_proba(model, X, index, cache_dir=''): 23 | try: 24 | csv_file = get_cache_file(model.id, index, cache_dir) 25 | df = pd.read_csv(csv_file) 26 | proba = df.values[:, 1:] 27 | print("**** prediction is loaded from {0} ****".format(csv_file)) 28 | except IOError: 29 | proba = model.predict_proba(X) 30 | df = pd.DataFrame({'index': index}) 31 | for i in range(proba.shape[1]): 32 | df["prediction" + str(i)] = proba[:, i] 33 | #print(df) 34 | df.to_csv(csv_file, index=False) 35 | return proba 36 | 37 | def saving_predict(model, X, index, cache_dir=''): 38 | csv_file = get_cache_file(model.id, index,cache_dir) 39 | try: 40 | df = pd.read_csv(csv_file) 41 | prediction = df.values[:, 1:] 42 | prediction = prediction.reshape([prediction.size,]) 43 | print("**** prediction is loaded from {0} ****".format(csv_file)) 44 | except IOError: 45 | prediction = 
model.predict(X) 46 | df = pd.DataFrame({'index': index}) 47 | prediction.reshape([prediction.shape[-1],]) 48 | df["prediction"] = prediction 49 | #print(df) 50 | df.to_csv(csv_file, index=False) 51 | return prediction 52 | 53 | def get_model_id(model): 54 | model_type = str(type(model)) 55 | model_type = model_type[model_type.rfind(".")+1: model_type.rfind("'")] 56 | param_dict = model.get_params() 57 | ignore_list = ('n_jobs', 'oob_score', 'verbose', 'warm_start') 58 | new_param_dict = {} 59 | for key, value in sorted(param_dict.items(), key=lambda x: x[0]): 60 | i = 0 61 | if key in ignore_list: 62 | continue 63 | while True: 64 | new_key = key[0] + str(i) 65 | if not new_key in new_param_dict: 66 | new_param_dict[new_key] = value 67 | break 68 | i += 1 69 | model_type += str(new_param_dict) 70 | replace_dict = {'{': '_', 71 | '}': '', 72 | "'": "", 73 | '.': 'p', 74 | ',': '__', 75 | ':': '_', 76 | ' ': '', 77 | 'True': '1', 78 | 'False': '0', 79 | 'None': 'N', 80 | '=': '_', 81 | '(': '_', 82 | ')': '_', 83 | '\n': '_'} 84 | for key, value in replace_dict.items(): 85 | model_type = model_type.replace(key, value) 86 | if len(model_type) > 150: 87 | model_type = model_type[:150] 88 | return model_type 89 | 90 | def get_cache_file(model_id, index, cache_dir='', suffix='csv'): 91 | # Identify index trick. 92 | # If sum of first 20 index, recognize as the same index. 
93 | if index is None: 94 | raise IOError 95 | if len(index) < 20: 96 | sum_index = sum(index) 97 | else: 98 | sum_index = sum(index[:20]) 99 | return "{0}{1}_{2}.{3}".format(cache_dir, 100 | model_id, 101 | sum_index, 102 | suffix) 103 | 104 | ##def saving_fit(learner, X, y, index): 105 | ## import os 106 | ## pkl_file = "{0}_{1}_{2}.pkl".format(learner.id, min(index), max(index)) 107 | ## try: 108 | ## learner = joblib.load(pkl_file) 109 | ## print("**** learner is loaded from {0} ****".format(pkl_file)) 110 | ## except IOError: 111 | ## learner.fit(X, y) 112 | ## joblib.dump(learner, pkl_file) 113 | ## return learner 114 | 115 | if __name__ == '__main__': 116 | temp = {'index': [0, 1], 'value': [2, 3]} 117 | df = pd.DataFrame(temp) 118 | print(df) 119 | df.to_csv('dum.csv', index=False) 120 | --------------------------------------------------------------------------------