├── .gitignore ├── LICENSE ├── README.md ├── build_package.sh ├── py_boost ├── __init__.py ├── callbacks │ ├── __init__.py │ └── callback.py ├── cv │ ├── __init__.py │ ├── adaptive_es.py │ ├── base.py │ └── cluster_tree.py ├── gpu │ ├── __init__.py │ ├── base.py │ ├── boosting.py │ ├── losses │ │ ├── __init__.py │ │ ├── losses.py │ │ ├── metrics.py │ │ └── multiclass_metrics.py │ ├── serialization.py │ ├── sketch_boost.py │ ├── tree.py │ └── utils.py ├── multioutput │ ├── __init__.py │ ├── sketching.py │ └── target_splitter.py ├── quantization │ ├── __init__.py │ ├── base.py │ └── utils.py ├── sampling │ ├── __init__.py │ └── bagging.py └── utils │ ├── __init__.py │ ├── logging.py │ ├── onnx_wrapper.py │ └── tl_wrapper.py ├── pyproject.toml └── tutorials ├── Tutorial_1_Basics.ipynb ├── Tutorial_2_Advanced_multioutput.ipynb ├── Tutorial_3_Custom_features.ipynb ├── Tutorial_4_Handle_null_targets.ipynb └── Tutorial_5_ONNX_inference.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | multioutpout_benchmark/ 4 | tutorials/Experiment.ipynb 5 | data/ 6 | catboost_info/ 7 | py_boost_venv 8 | publish_package.sh 9 | reinstall.sh 10 | poetry.lock 11 | .idea 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | share/python-wheels/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg 36 | MANIFEST 37 | 38 | # PyInstaller 39 | # Usually these files are written by a python script from a template 40 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 41 | *.manifest 42 | *.spec 43 | 44 | # Installer logs 45 | pip-log.txt 46 | pip-delete-this-directory.txt 47 | 48 | # Unit test / coverage reports 49 | htmlcov/ 50 | .tox/ 51 | .nox/ 52 | .coverage 53 | .coverage.* 54 | .cache 55 | nosetests.xml 56 | coverage.xml 57 | *.cover 58 | *.py,cover 59 | .hypothesis/ 60 | .pytest_cache/ 61 | cover/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | db.sqlite3 71 | db.sqlite3-journal 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | .pybuilder/ 85 | target/ 86 | 87 | # Jupyter Notebook 88 | .ipynb_checkpoints 89 | 90 | # IPython 91 | profile_default/ 92 | ipython_config.py 93 | 94 | # pyenv 95 | # For a library or package, you might want to ignore these files since the code is 96 | # intended to run in multiple environments; otherwise, check them in: 97 | # .python-version 98 | 99 | # pipenv 100 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 101 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 102 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 103 | # install all needed dependencies. 104 | #Pipfile.lock 105 | 106 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 107 | __pypackages__/ 108 | 109 | # Celery stuff 110 | celerybeat-schedule 111 | celerybeat.pid 112 | 113 | # SageMath parsed files 114 | *.sage.py 115 | 116 | # Environments 117 | .env 118 | .venv 119 | env/ 120 | venv/ 121 | ENV/ 122 | env.bak/ 123 | venv.bak/ 124 | 125 | # Spyder project settings 126 | .spyderproject 127 | .spyproject 128 | 129 | # Rope project settings 130 | .ropeproject 131 | 132 | # mkdocs documentation 133 | /site 134 | 135 | # mypy 136 | .mypy_cache/ 137 | .dmypy.json 138 | dmypy.json 139 | 140 | # Pyre type checker 141 | .pyre/ 142 | 143 | # pytype static type analyzer 144 | .pytype/ 145 | 146 | # Cython debug symbols 147 | cython_debug/ 148 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2021 Vakhrusev Anton, Iosipoi Leonid 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Py-boost: a research tool for exploring GBDTs 2 | 3 | Modern gradient boosting toolkits are very complex and are written in low-level programming languages. 
As a result, 4 | 5 | * It is hard to customize them to suit one’s needs 6 | * New ideas and methods are not easy to implement 7 | * It is difficult to understand how they work 8 | 9 | Py-boost is a Python-based gradient boosting library that aims to overcome the aforementioned problems. 10 | 11 | **Authors**: [Anton Vakhrushev](https://kaggle.com/btbpanda), [Leonid Iosipoi](http://iosipoi.com/) 12 | , [Sergey Kupriyanov](https://www.linkedin.com/in/sergeykupriyanov). 13 | 14 | ## Py-boost Key Features 15 | 16 | **Simple**. Py-boost is a simplified gradient boosting library, but it supports all main features and hyperparameters 17 | available in other implementations. 18 | 19 | **Fast with GPU**. Although Py-boost is written in Python, it works only on GPU and uses Python GPU 20 | libraries such as `CuPy` and `Numba`. 21 | 22 | **Efficient inference**. Since v0.4 Py-Boost can perform efficient inference of tree ensembles on GPU. 23 | Moreover, once your model is trained on GPU, it can be converted to run inference on a CPU-only machine by 24 | converting it to the [treelite](https://treelite.readthedocs.io/) format with the built-in wrapper (limitation: the model should 25 | be trained with `target_splitter='Single'`, which is the default). 26 | 27 | **ONNX compatible**. Since v0.5 Py-Boost is compatible with the ONNX format, which allows more options for CPU inference and 28 | model deployment. 29 | 30 | **Easy to customize**. Py-boost can be easily customized even if one is not familiar with GPU programming (just replace 31 | np with cp). What can be customized? Almost everything, via custom callbacks: row/column sampling strategies, 32 | training control, losses/metrics, and the multioutput handling strategy. 33 | 34 | ## SketchBoost [paper](https://openreview.net/forum?id=WSxarC8t-T) 35 | 36 | **Multioutput training**. Current state-of-the-art boosting toolkits provide very limited support for multioutput training. 37 | And even when this option is available, training on tasks such as multiclass/multilabel classification and multitask 38 | regression is quite slow because the training complexity scales linearly with the number of outputs. To overcome 39 | these limitations we created the **SketchBoost** algorithm, which uses an approximate tree structure search. As we show 40 | in the [paper](https://openreview.net/forum?id=WSxarC8t-T), this strategy at least does not degrade performance and 41 | often improves accuracy. 42 | 43 | **SketchBoost**. You can try our sketching strategies by using the `SketchBoost` class, or you can implement your 44 | own and pass it to the `GradientBoosting` constructor as the `multioutput_sketch` parameter. For the details please 45 | see [Tutorial_2_Advanced_multioutput](https://github.com/AILab-MLTools/Py-Boost/blob/master/tutorials/Tutorial_2_Advanced_multioutput.ipynb) 46 | 47 | ## Installation 48 | 49 | Before installing py-boost via pip you should have CuPy installed. You can use: 50 | 51 | `pip install -U cupy-cuda110 py-boost` 52 | 53 | **Note**: replace the suffix with your CUDA version! For the details see [this guide](https://docs.cupy.dev/en/stable/install.html) 54 | 55 | ## Quick tour 56 | 57 | Py-boost is easy to use since it has a scikit-learn-like interface.
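A minimal sketch of a typical workflow is shown below. It assumes a CUDA-capable environment with CuPy installed, that `X`, `y`, `X_test`, `y_test` are NumPy arrays you provide, and that the `'mse'` loss alias is used; the tutorials listed next contain complete, tested examples.

```python
from py_boost import GradientBoosting

# hypothetical data: X, y, X_test, y_test are numpy arrays
model = GradientBoosting('mse', ntrees=1000, lr=0.01, es=200, verbose=100)

# eval_sets is a list of dicts with 'X' and 'y' keys, used for early stopping
model.fit(X, y, eval_sets=[{'X': X_test, 'y': y_test}])

pred = model.predict(X_test)
```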
For usage example please see: 58 | 59 | * [Tutorial_1_Basics](https://github.com/sb-ai-lab/Py-Boost/blob/master/tutorials/Tutorial_1_Basics.ipynb) for simple 60 | usage examples 61 | * [Tutorial_2_Advanced_multioutput](https://github.com/sb-ai-lab/Py-Boost/blob/master/tutorials/Tutorial_2_Advanced_multioutput.ipynb) 62 | for advanced multioutput features 63 | * [Tutorial_3_Custom_features](https://github.com/sb-ai-lab/Py-Boost/blob/master/tutorials/Tutorial_3_Custom_features.ipynb) 64 | for examples of customization 65 | * [Tutorial_4_Handle_null_targets](https://github.com/sb-ai-lab/Py-Boost/blob/master/tutorials/Tutorial_4_Handle_null_targets.ipynb) 66 | for the case when multioutput target contains NaNs 67 | * [Tutorial_5_ONNX_inference](https://github.com/sb-ai-lab/Py-Boost/blob/master/tutorials/Tutorial_5_ONNX_inference.ipynb) 68 | examples of parsing and inference on CPU with ONNX 69 | 70 | More examples are coming soon 71 | -------------------------------------------------------------------------------- /build_package.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm -rf py_boost_venv 4 | python -m venv py_boost_venv 5 | source ./py_boost_venv/bin/activate 6 | 7 | pip install -U pip 8 | pip install -U poetry 9 | pip install -U cupy-cuda110 10 | 11 | poetry lock 12 | poetry install 13 | poetry build 14 | -------------------------------------------------------------------------------- /py_boost/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import subprocess 3 | import sys 4 | import warnings 5 | 6 | _root_logger = logging.getLogger() 7 | _logger = logging.getLogger(__name__) 8 | _logger.setLevel(logging.WARNING) 9 | 10 | # if root logger has handlers, propagate messages up and let root logger process them 11 | if not _root_logger.hasHandlers(): 12 | _logger.addHandler(logging.StreamHandler(sys.stdout)) 13 | _logger.propagate = False 14 | 15 | try: 16 | subprocess.check_output('nvidia-smi') 17 | CUDA_FOUND = True 18 | except Exception: 19 | CUDA_FOUND = False 20 | 21 | from .utils.tl_wrapper import TLPredictor, TLCompiledPredictor 22 | from .utils.onnx_wrapper import pb_to_onnx, ONNXPredictor 23 | 24 | if CUDA_FOUND: 25 | from .gpu.boosting import GradientBoosting 26 | from .gpu.sketch_boost import SketchBoost 27 | from .gpu.losses.losses import Loss 28 | from .gpu.losses.metrics import Metric 29 | from .callbacks.callback import Callback 30 | 31 | __all__ = [ 32 | 33 | 'GradientBoosting', 34 | 'SketchBoost', 35 | 'Callback', 36 | 'Loss', 37 | 'Metric', 38 | 'callbacks', 39 | 'gpu', 40 | 'multioutput', 41 | 'sampling', 42 | 'utils', 43 | 'pb_to_onnx', 44 | 45 | ] 46 | 47 | else: 48 | warnings.warn('No Nvidia GPU detected! 
Only treelite inference on CPU is available') 49 | __all__ = [] 50 | 51 | __all__.extend([ 52 | 53 | 'TLPredictor', 54 | 'TLCompiledPredictor', 55 | 'ONNXPredictor' 56 | 57 | ]) 58 | 59 | try: 60 | import importlib.metadata as importlib_metadata 61 | except ModuleNotFoundError: 62 | import importlib_metadata 63 | 64 | __version__ = importlib_metadata.version(__name__) 65 | -------------------------------------------------------------------------------- /py_boost/callbacks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/Py-Boost/1bb23905a90793dbf8bb6e50b9bc4a26b1f556c4/py_boost/callbacks/__init__.py -------------------------------------------------------------------------------- /py_boost/callbacks/callback.py: -------------------------------------------------------------------------------- 1 | """Default callbacks""" 2 | import logging 3 | from ..utils.logging import verbosity_to_loglevel, set_stdout_level 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class Callback: 9 | """Abstract class for callbacks. All Callback methods define the actions that should be performed between training stages. 10 | There are 4 methods that can be redefined: 11 | - before_train - outputs None 12 | - before_iteration - outputs None 13 | - after_train - outputs None 14 | - after_iteration - outputs bool - whether training should be stopped after the iteration 15 | 16 | Methods receive build_info - the state dict that can be accessed and modified 17 | 18 | Basic build info structure: 19 | 20 | build_info = { 21 | 'data': { 22 | 'train': { 23 | 'features_cpu': np.ndarray - raw feature matrix, 24 | 'features_gpu': cp.ndarray - uint8 quantized feature matrix on GPU, 25 | 'target': y - cp.ndarray - processed target variable on GPU, 26 | 'sample_weight': cp.ndarray - processed sample_weight on GPU or None, 27 | 'ensemble': cp.ndarray - current model prediction (with no postprocessing, ex.
before sigmoid for logloss) on GPU, 29 | 'grad': cp.ndarray of gradients on GPU, before first iteration - None, 30 | 'hess': cp.ndarray of hessians on GPU, before first iteration - None, 31 | 32 | 'last_tree': { 33 | 'nodes': cp.ndarray - nodes indices of the last trained tree, 34 | 'preds': cp.ndarray - predictions of the last trained tree, 35 | } 36 | 37 | }, 38 | 'valid': { 39 | 'features_cpu' the same as train, but list, each element corresponds each validation sample, 40 | 'features_gpu': ..., 41 | 'target': ..., 42 | 'sample_weight': ..., 43 | 'ensemble': ..., 44 | 45 | 'last_tree': { 46 | 'nodes': ..., 47 | 'preds': ..., 48 | } 49 | 50 | } 51 | }, 52 | 'borders': list of np.ndarray - list or quantization borders, 53 | 'model': GradientBoosting - model, that is trained, 54 | 'mempool': cp.cuda.MemoryPool - memory pool used for train, could be used to clean memory to prevent OOM, 55 | 'builder': DepthwiseTreeBuilder - the instance of tree builder, contains training params, 56 | 57 | 'num_iter': int, current number of iteration, 58 | 'iter_scores': list of float - list of metric values for all validation sets for the last iteration, 59 | } 60 | 61 | """ 62 | 63 | def before_train(self, build_info): 64 | """Actions to be made before train starts 65 | 66 | Args: 67 | build_info: dict 68 | 69 | Returns: 70 | 71 | """ 72 | return 73 | 74 | def before_iteration(self, build_info): 75 | """Actions to be made before each iteration starts 76 | 77 | Args: 78 | build_info: dict 79 | 80 | Returns: 81 | 82 | """ 83 | return 84 | 85 | def after_iteration(self, build_info): 86 | """Actions to be made after each iteration finishes 87 | 88 | Args: 89 | build_info: dict 90 | 91 | Returns: 92 | bool, if train process should be terminated 93 | """ 94 | return False 95 | 96 | def after_train(self, build_info): 97 | """Actions to be made before train finishes 98 | 99 | Args: 100 | build_info: 101 | 102 | Returns: 103 | 104 | """ 105 | return 106 | 107 | 108 | class CallbackPipeline: 109 | """Sequential pipeline of callbacks""" 110 | 111 | def __init__(self, *callbacks): 112 | self.callbacks = callbacks 113 | 114 | def after_iteration(self, build_info): 115 | stop = False 116 | 117 | for callback in self.callbacks: 118 | stop = stop or callback.after_iteration(build_info) 119 | 120 | return stop 121 | 122 | def after_train(self, build_info): 123 | 124 | for callback in self.callbacks: 125 | callback.after_train(build_info) 126 | 127 | def before_train(self, build_info): 128 | 129 | for callback in self.callbacks: 130 | callback.before_train(build_info) 131 | 132 | def before_iteration(self, build_info): 133 | 134 | for callback in self.callbacks: 135 | callback.before_iteration(build_info) 136 | 137 | 138 | class EvalHistory(Callback): 139 | """Callback for history evaluation""" 140 | 141 | def __init__(self, history, verbose=0): 142 | 143 | self.history = history 144 | self.verbose = verbose 145 | self.metric = None 146 | self.postprocess_fn = None 147 | self.ntrees = None 148 | 149 | def before_train(self, build_info): 150 | """Init params and logger 151 | 152 | Args: 153 | build_info: dict 154 | 155 | Returns: 156 | 157 | """ 158 | self.metric = build_info['model'].metric 159 | self.postprocess_fn = build_info['model'].loss.postprocess_output 160 | self.ntrees = build_info['model'].ntrees 161 | 162 | self.set_verbosity_level(int(self.verbose > 0) * 1) 163 | 164 | msg = 'GDBT train starts. 
Max iter {0}, early stopping rounds {1}'.format( 165 | build_info['model'].ntrees, build_info['model'].es) 166 | 167 | logger.info(msg) 168 | 169 | def after_iteration(self, build_info): 170 | """Save the iteration results and output log 171 | 172 | Args: 173 | build_info: dict 174 | 175 | Returns: 176 | 177 | """ 178 | valid = build_info['data']['valid'] 179 | y_val, val_ens, w_val = valid['target'], valid['ensemble'], valid['sample_weight'] 180 | 181 | num_iter = build_info['num_iter'] 182 | 183 | msg = 'Iter {0}; '.format(num_iter) 184 | 185 | if self.metric is None: 186 | return 187 | 188 | alias = self.metric.alias 189 | 190 | if len(y_val) > 0: 191 | val_metric = [float(self.metric(y, self.postprocess_fn(x), w)) for (y, x, w) in zip(y_val, val_ens, w_val)] 192 | self.history.append(val_metric) 193 | 194 | msg += ' '.join(['Sample {0}, {1} = {2}; '.format(n, alias, x) for (n, x) in enumerate(val_metric)]) 195 | 196 | build_info['iter_score'] = val_metric 197 | 198 | if ((num_iter % self.verbose) == 0) or (num_iter == (self.ntrees - 1)): 199 | logger.info(msg) 200 | 201 | @staticmethod 202 | def set_verbosity_level(verbose): 203 | """Verbosity level setter. 204 | 205 | Args: 206 | verbose: Controls the verbosity: the higher, the more messages. 207 | <1 : messages are not displayed; 208 | >=1 : the computation process for layers is displayed; 209 | >=2 : the information about folds processing is also displayed; 210 | >=3 : the hyperparameters optimization process is also displayed; 211 | >=4 : the training process for every algorithm is displayed; 212 | """ 213 | level = verbosity_to_loglevel(verbose) 214 | set_stdout_level(level) 215 | 216 | logger.info(f"Stdout logging level is {logging._levelToName[level]}.") 217 | 218 | 219 | class EarlyStopping(Callback): 220 | """Callback for early stopping""" 221 | 222 | def __init__(self, num_rounds=100): 223 | 224 | self.num_rounds = num_rounds 225 | self.best_round = 1 226 | self.no_upd_rounds = 0 227 | self.best_score = None 228 | self.metric = None 229 | 230 | def before_train(self, build_info): 231 | """Init params 232 | 233 | Args: 234 | build_info: dict 235 | 236 | Returns: 237 | 238 | """ 239 | self.metric = build_info['model'].metric 240 | 241 | def after_iteration(self, build_info): 242 | """Check early stopping condition and update the state 243 | 244 | Args: 245 | build_info: dict 246 | 247 | Returns: 248 | bool, if early stopping condition was met 249 | """ 250 | if ('iter_score' not in build_info) or (self.num_rounds == 0): 251 | return False 252 | 253 | num_iter = build_info['num_iter'] 254 | # if multiple valid sets passed - use the last one 255 | score = build_info['iter_score'][-1] 256 | 257 | if num_iter == 0: 258 | self.best_score = score 259 | return False 260 | 261 | if self.metric.compare(score, self.best_score): 262 | self.best_score = score 263 | self.best_round = num_iter + 1 264 | self.no_upd_rounds = 0 265 | return False 266 | 267 | self.no_upd_rounds += 1 268 | 269 | stop = self.no_upd_rounds >= self.num_rounds 270 | 271 | if stop: 272 | msg = 'Early stopping at iter {0}, best iter {1}, best_score {2}'.format( 273 | num_iter + 1, self.best_round, self.best_score) 274 | logger.info(msg) 275 | 276 | return stop 277 | 278 | def after_train(self, build_info): 279 | """Prune the model to the best iteration 280 | 281 | Args: 282 | build_info: dict 283 | 284 | Returns: 285 | 286 | """ 287 | if self.best_score is not None: 288 | model = build_info['model'] 289 | model.models = model.models[:self.best_round] 290 | 
model.best_round = self.best_round 291 | -------------------------------------------------------------------------------- /py_boost/cv/__init__.py: -------------------------------------------------------------------------------- 1 | """Tools for cross validation""" 2 | 3 | from .base import CrossValidation 4 | from .adaptive_es import AdaptiveESCV 5 | from .cluster_tree import ClusterCandidates 6 | 7 | __all__ = [ 8 | 9 | 'CrossValidation', 10 | 'AdaptiveESCV', 11 | 'ClusterCandidates' 12 | 13 | ] 14 | -------------------------------------------------------------------------------- /py_boost/cv/adaptive_es.py: -------------------------------------------------------------------------------- 1 | """Adaptive early stopping""" 2 | 3 | import numpy as np 4 | try: 5 | import cupy as cp 6 | except Exception: 7 | pass 8 | from copy import deepcopy 9 | from numba import njit 10 | 11 | from ..gpu.losses import MSELoss, CrossEntropyLoss, BCELoss, loss_alias 12 | from ..gpu.utils import validate_input 13 | 14 | from .base import CrossValidation 15 | 16 | 17 | def check_input(y_true, sample_weight): 18 | if len(y_true.shape) == 1: 19 | y_true = y_true[:, np.newaxis] 20 | 21 | y_true = y_true[np.newaxis, :, :] 22 | 23 | if sample_weight is not None and len(sample_weight.shape) == 1: 24 | sample_weight = sample_weight[:, np.newaxis] 25 | 26 | return y_true, sample_weight 27 | 28 | 29 | def bce_scorer(y_true, y_pred, sample_weight=None): 30 | """ 31 | 32 | Args: 33 | y_true: (nobj, nout) 34 | y_pred: (niter, nobj, nout) 35 | sample_weight: (nobj, 1) 36 | 37 | Returns: 38 | 39 | """ 40 | y_true, sample_weight = check_input(y_true, sample_weight) 41 | 42 | path = -np.log(y_true * y_pred + (1 - y_true) * (1 - y_pred)) 43 | path = path.sum(axis=-1).T 44 | 45 | if sample_weight is not None: 46 | path *= sample_weight 47 | 48 | return path 49 | 50 | 51 | def mse_scorer(y_true, y_pred, sample_weight=None): 52 | """ 53 | 54 | Args: 55 | y_true: (nobj, nout) 56 | y_pred: (niter, nobj, nout) 57 | sample_weight: (nobj, 1) 58 | 59 | Returns: 60 | 61 | """ 62 | y_true, sample_weight = check_input(y_true, sample_weight) 63 | 64 | path = (y_true - y_pred) ** 2 65 | path = path.sum(axis=-1).T 66 | 67 | if sample_weight is not None: 68 | path *= sample_weight 69 | 70 | return path 71 | 72 | 73 | def cent_scorer(y_true, y_pred, sample_weight=None): 74 | """ 75 | 76 | Args: 77 | y_true: (nobj, nout) 78 | y_pred: (niter, nobj, nout) 79 | sample_weight: (nobj, 1) 80 | 81 | Returns: 82 | 83 | """ 84 | y_true, sample_weight = check_input(y_true, sample_weight) 85 | 86 | path = -np.log(np.take_along_axis(y_pred, y_true, axis=2)[..., 0].T) 87 | 88 | if sample_weight is not None: 89 | path *= sample_weight 90 | 91 | return path 92 | 93 | 94 | @njit 95 | def select_preds(arr, leaves, order): 96 | """Select corresponding to cluster prediction 97 | 98 | Args: 99 | arr: np.ndarray, predictions 100 | leaves: np.ndarray, clusters 101 | order: np.ndarray, maps cluster label with position in prediction array 102 | 103 | Returns: 104 | np.ndarray, pruned prediction 105 | """ 106 | res = np.empty(arr.shape[1:], dtype=arr.dtype) 107 | 108 | for i in range(leaves.shape[0]): 109 | res[i] = arr[order[leaves[i]], i, :] 110 | 111 | return res 112 | 113 | 114 | class AdaptiveESCV(CrossValidation): 115 | """ 116 | Cross validation wrapper with built-in adaptive early stopping 117 | """ 118 | 119 | def __init__(self, base_learner, cluster, iters_to_fit, metric=None, random_state=42, batch_size=10000): 120 | super().__init__(deepcopy(base_learner), 
random_state) 121 | self._base_learner.params['es'] = 0 122 | self.cluster = cluster 123 | self.iters_to_fit = iters_to_fit 124 | self.metric = metric 125 | self.batch_size = batch_size 126 | 127 | self.best_split = None 128 | self.best_trees = None 129 | self.best_oof_trees = None 130 | 131 | def get_es_metric(self): 132 | 133 | if self.metric: 134 | return self.metric 135 | 136 | loss = self._base_learner.params['loss'] 137 | if type(loss) is str: 138 | loss = loss_alias[loss] 139 | 140 | if type(loss) is MSELoss: 141 | return mse_scorer 142 | 143 | if type(loss) is BCELoss: 144 | return bce_scorer 145 | 146 | if type(loss) is CrossEntropyLoss: 147 | return cent_scorer 148 | 149 | raise ValueError('Unknown loss func. Please specify metric manually') 150 | 151 | def fit_predict(self, X, y, sample_weight=None, cv=5, stratify=False, random_state=42): 152 | """ 153 | 154 | Args: 155 | X: 156 | y: 157 | sample_weight: 158 | cv: 159 | stratify: 160 | random_state: 161 | 162 | Returns: 163 | 164 | """ 165 | assert self.models is None, 'Is already trained' 166 | 167 | self.models = [] 168 | 169 | X, y, sample_weight, eval_sets = validate_input(X, y, sample_weight, {}) 170 | self._base_learner._infer_params() 171 | X_enc, max_bin, borders, eval_enc = self._base_learner.quantize(X, eval_sets) 172 | 173 | # create validation 174 | cv_iter = self.get_cv_iter(cv, stratify, random_state) 175 | 176 | # fit and free memory 177 | mempool = cp.cuda.MemoryPool() 178 | 179 | oof_pred, folds = self._fit_predict(mempool, X, X_enc, y, sample_weight, max_bin, borders, cv_iter) 180 | self.fit_cluster_tree(X, X_enc, y, sample_weight, max_bin, borders, folds) 181 | self.search_for_best_cluster(X, y, sample_weight, folds) 182 | 183 | # create out of fold pruned prediction 184 | 185 | for f in range(folds.max() + 1): 186 | idx = np.arange(X_enc.shape[0])[folds == f] 187 | X_test = X[idx] 188 | pred = self._get_stages([self.models[f]], self.best_oof_trees[f], X_test, batch_size=self.batch_size) 189 | oof_pred[idx] = self._prune_preds(self.best_oof_trees[f], X_test, pred, batch_size=self.batch_size) 190 | 191 | return oof_pred 192 | 193 | def fit_cluster_tree(self, X, X_enc, y, sample_weight, max_bin, borders, folds): 194 | """Fit cluster tree 195 | 196 | Args: 197 | X: 198 | X_enc: 199 | y: 200 | sample_weight: 201 | max_bin: 202 | borders: 203 | folds: 204 | 205 | Returns: 206 | 207 | """ 208 | paths = np.zeros((X_enc.shape[0], len(self.iters_to_fit)), dtype=np.float32) 209 | scorer = self.get_es_metric() 210 | 211 | for f in range(folds.max() + 1): 212 | idx = np.arange(X_enc.shape[0])[folds == f] 213 | val_pred = self.models[f].predict_staged(X[idx], iterations=self.iters_to_fit) 214 | paths[idx] = scorer(y[idx], val_pred, None if sample_weight is None else sample_weight[idx]) 215 | 216 | self.cluster.fit_quantized(X_enc, paths, max_bin, borders) 217 | self.cluster.to_cpu() 218 | 219 | def search_for_best_cluster(self, X, y, sample_weight, folds): 220 | """Search for the best cluster tree 221 | 222 | Args: 223 | X: 224 | y: 225 | sample_weight: 226 | folds: 227 | 228 | Returns: 229 | 230 | """ 231 | # predict cluster trees 232 | cl_ = self.cluster.predict(X) 233 | # zero clustering is a simple early stopping 234 | clusters = np.zeros((cl_.shape[0], cl_.shape[1] + 1), dtype=np.uint32) 235 | clusters[:, 1:] = cl_ 236 | 237 | scorer = self.get_es_metric() 238 | n_cand = clusters.shape[1] 239 | clust_per_split = clusters.max(axis=0) + 1 240 | nfolds = folds.max() + 1 241 | max_clust = clust_per_split.max() 242 | 
iter_num = self._base_learner.params['ntrees'] 243 | batch_size = 1000 244 | 245 | folds_stats = np.zeros((nfolds, n_cand, max_clust, iter_num), dtype=np.float32) 246 | 247 | # calculate oof errors 248 | for f in range(nfolds): 249 | idx = np.arange(X.shape[0])[folds == f] 250 | X_test, y_test, cl_test = X[idx], y[idx], clusters[idx] 251 | 252 | for i in range(0, X_test.shape[0], batch_size): 253 | 254 | val_pred = self.models[f].predict_staged(X_test[i:i + batch_size]) 255 | err = scorer(y_test[i:i + batch_size], val_pred, 256 | None if sample_weight is None else sample_weight[i:i + batch_size]) 257 | 258 | for j in range(n_cand): 259 | np.add.at(folds_stats[f, j], (cl_test[i:i + batch_size, j],), err) 260 | 261 | # select best by oof 262 | stats = folds_stats.sum(axis=0) # shape (nsplits, max_clust, niters) 263 | oof_stats = stats[np.newaxis, ...] - folds_stats # shape (nfolds, nsplits, max_clust, niters) 264 | 265 | best_iters = oof_stats.argmin(axis=-1) # shape (nfolds, nsplits, max_clust) 266 | best_errs = np.take_along_axis(folds_stats, best_iters[..., np.newaxis], axis=3)[..., 0].sum( 267 | axis=0) # shape (nsplits, max_clust) 268 | self.best_split = best_errs.sum(axis=1).argmin() # scalar 269 | best_oof_trees = best_iters[:, self.best_split] # shape (nfolds, max_clust) 270 | self.best_oof_trees = best_oof_trees[:, :clust_per_split[self.best_split]] 271 | 272 | # select best in total 273 | best_trees = stats[self.best_split].argmin(axis=-1) # shape (max_clust, ) 274 | self.best_trees = best_trees[:clust_per_split[self.best_split]] 275 | 276 | def _get_stages(self, models, iters, X, batch_size=100000): 277 | """ 278 | 279 | Args: 280 | models: 281 | iters: 282 | X: 283 | batch_size: 284 | 285 | Returns: 286 | 287 | """ 288 | sorted_iters = np.sort(np.unique(iters)) 289 | pred = models[0].predict_staged(X, iterations=sorted_iters, batch_size=batch_size) 290 | 291 | for i in range(1, len(models)): 292 | pred += models[i].predict_staged(X, iterations=sorted_iters, batch_size=batch_size) 293 | 294 | pred /= len(models) 295 | 296 | return pred 297 | 298 | def _prune_preds(self, iters, X, pred, batch_size=100000): 299 | """ 300 | 301 | Args: 302 | iters: 303 | X: 304 | pred: 305 | batch_size: 306 | 307 | Returns: 308 | 309 | """ 310 | if self.best_split == 0: 311 | cluster = np.zeros((X.shape[0],), dtype=np.uint32) 312 | else: 313 | cluster = self.cluster.predict(X, iterations=[self.best_split - 1], batch_size=batch_size)[:, 0] 314 | 315 | sorted_iters = np.sort(np.unique(iters)) 316 | order = np.searchsorted(sorted_iters, iters) 317 | 318 | return select_preds(pred, cluster, order) 319 | 320 | def predict(self, X, batch_size=100000): 321 | """ 322 | 323 | Args: 324 | X: 325 | batch_size: 326 | 327 | Returns: 328 | 329 | """ 330 | pred = self._get_stages(self.models, self.best_trees, X, batch_size=batch_size) 331 | return self._prune_preds(self.best_trees, X, pred, batch_size=batch_size) 332 | -------------------------------------------------------------------------------- /py_boost/cv/base.py: -------------------------------------------------------------------------------- 1 | """Gradient Boosting with built-in cross validation""" 2 | 3 | import numpy as np 4 | try: 5 | import cupy as cp 6 | except Exception: 7 | pass 8 | from copy import deepcopy 9 | 10 | from sklearn.model_selection import KFold, StratifiedKFold 11 | from ..gpu.utils import validate_input 12 | 13 | 14 | class CustomFolds: 15 | """ 16 | Class to imitate sklearn cv for custom folds 17 | """ 18 | 19 | def __init__(self, 
folds): 20 | self.folds = folds 21 | 22 | def split(self, *args, **kwargs): 23 | nfolds = int(self.folds.max() + 1) 24 | idx = np.arange(len(self.folds)) 25 | 26 | splits = [] 27 | 28 | for i in range(nfolds): 29 | splits.append((idx[self.folds != i], idx[self.folds == i])) 30 | 31 | return splits 32 | 33 | 34 | class CrossValidation: 35 | """ 36 | Cross validation wrapper for gradient boosting 37 | """ 38 | 39 | def __init__(self, base_learner, random_state=42): 40 | """ 41 | 42 | Args: 43 | base_learner: 44 | random_state: 45 | """ 46 | self._base_learner = base_learner 47 | self.random_state = random_state 48 | self.models = None 49 | 50 | def _fit_predict(self, mempool, X, X_enc, y, sample_weight, max_bin, borders, cv_iter): 51 | 52 | oof_pred = None 53 | folds = np.zeros(X.shape[0], dtype=np.int32) 54 | 55 | with cp.cuda.using_allocator(allocator=mempool.malloc): 56 | 57 | for n, (f0, f1) in enumerate(cv_iter.split(X, y)): 58 | 59 | # split data 60 | 61 | X_tr, X_enc_tr, y_tr, = X[f0], X_enc[f0], y[f0] 62 | 63 | sample_weight_tr = None 64 | if sample_weight is not None: 65 | sample_weight_tr = sample_weight[f0] 66 | 67 | eval_sets = [{ 68 | 69 | 'X': X[f1], 70 | 'y': y[f1], 71 | 'sample_weight': None if sample_weight is None else sample_weight[f1] 72 | 73 | }] 74 | 75 | eval_enc = [X_enc[f1]] 76 | 77 | # fit model 78 | model = deepcopy(self._base_learner) 79 | model._infer_params() 80 | builder, build_info = model._create_build_info(mempool, X_tr, X_enc_tr, y_tr, sample_weight_tr, 81 | max_bin, borders, eval_sets, eval_enc) 82 | model._fit(builder, build_info) 83 | 84 | # predict 85 | 86 | val_pred = model.predict(eval_sets[0]['X']) 87 | model.to_cpu() 88 | 89 | if oof_pred is None: 90 | oof_pred = np.zeros((X.shape[0], val_pred.shape[1]), dtype=np.float32) 91 | 92 | oof_pred[f1] = val_pred 93 | folds[f1] = n 94 | self.models.append(model) 95 | 96 | mempool.free_all_blocks() 97 | 98 | return oof_pred, folds 99 | 100 | def get_cv_iter(self, cv, stratify, random_state): 101 | 102 | if type(cv) in [int, float]: 103 | cv = int(cv) 104 | if stratify: 105 | folds = StratifiedKFold(cv, shuffle=True, random_state=random_state) 106 | else: 107 | folds = KFold(cv, shuffle=True) 108 | 109 | else: 110 | folds = CustomFolds(cv) 111 | 112 | return folds 113 | 114 | def fit_predict(self, X, y, sample_weight=None, cv=5, stratify=False, random_state=42): 115 | """ 116 | 117 | Args: 118 | X: 119 | y: 120 | sample_weight: 121 | cv: 122 | stratify: 123 | random_state: 124 | 125 | Returns: 126 | 127 | """ 128 | assert self.models is None, 'Is already trained' 129 | 130 | self.models = [] 131 | 132 | X, y, sample_weight, eval_sets = validate_input(X, y, sample_weight, {}) 133 | self._base_learner._infer_params() 134 | X_enc, max_bin, borders, eval_enc = self._base_learner.quantize(X, eval_sets) 135 | 136 | # create validation 137 | cv_iter = self.get_cv_iter(cv, stratify, random_state) 138 | 139 | # fit and free memory 140 | mempool = cp.cuda.MemoryPool() 141 | 142 | oof_pred, folds = self._fit_predict(mempool, X, X_enc, y, sample_weight, max_bin, borders, cv_iter) 143 | 144 | return oof_pred 145 | 146 | def predict(self, X): 147 | """ 148 | 149 | Args: 150 | X: 151 | 152 | Returns: 153 | 154 | """ 155 | res = None 156 | 157 | for model in self.models: 158 | 159 | pred = model.predict(X) 160 | if res is None: 161 | res = pred 162 | else: 163 | res += pred 164 | 165 | res /= len(self.models) 166 | 167 | return res 168 | -------------------------------------------------------------------------------- 
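A minimal usage sketch for the `CrossValidation` wrapper defined above (not part of the original sources). It assumes `X`, `y`, `X_test` are NumPy arrays you provide and that the `'mse'` loss alias is used:

```python
from py_boost import GradientBoosting
from py_boost.cv import CrossValidation

# wrap any configured booster; fit_predict returns out-of-fold predictions,
# predict averages the predictions of the per-fold models
cv = CrossValidation(GradientBoosting('mse', ntrees=200))
oof_pred = cv.fit_predict(X, y, cv=5)
test_pred = cv.predict(X_test)
```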
/py_boost/cv/cluster_tree.py: -------------------------------------------------------------------------------- 1 | from ..gpu.utils import * 2 | from ..gpu.tree import * 3 | from ..gpu.base import Ensemble 4 | 5 | from ..quantization.base import QuantileQuantizer 6 | 7 | 8 | def cluster_grow_tree(tree, group, arr, grad, hess, row_indexer, col_indexer, params): 9 | """Graw tree for advanced pruning 10 | 11 | Args: 12 | tree: 13 | group: 14 | arr: 15 | grad: 16 | hess: 17 | row_indexer: 18 | col_indexer: 19 | params: 20 | 21 | Returns: 22 | 23 | """ 24 | # create gh 25 | gh = cp.concatenate((grad, hess), axis=1) 26 | out_indexer = cp.arange(gh.shape[1], dtype=cp.uint64) 27 | 28 | # init nodes with single zero node 29 | unique_nodes = np.zeros(1, dtype=np.int32) 30 | # count unique nodes in active rows 31 | nodes_count = cp.ones(1, dtype=cp.uint64) * row_indexer.shape[0] 32 | # nodes for all rows 33 | nodes = cp.zeros(arr.shape[0], dtype=cp.int32) 34 | # index of node in unique array 35 | node_indexes = nodes 36 | prev_hist, small_index, big_index = [None] * 3 37 | 38 | for niter in range(params['max_depth']): 39 | 40 | nnodes = len(unique_nodes) 41 | gh_hist = histogram(arr, gh, node_indexes, 42 | col_indexer=col_indexer, 43 | row_indexer=row_indexer, 44 | out_indexer=out_indexer, 45 | nnodes=nnodes, 46 | max_bins=params['max_bin'], 47 | prev_hist=prev_hist, 48 | small_index=small_index, 49 | big_index=big_index) 50 | 51 | # assume hess is the last output 52 | 53 | hist, counts = gh_hist[:-1], gh_hist[-1] 54 | total = hist[..., :1, -1:] 55 | curr = total.min(axis=0) 56 | gain = cp.zeros(hist.shape[1:] + (2,), dtype=cp.float32) 57 | 58 | # NAN to left 59 | gain[..., 0] = curr - hist.min(axis=0) - (total - hist).min(axis=0) 60 | gain[..., 0] *= cp.minimum(counts, counts[..., -1:] - counts) >= params['min_data_in_leaf'] 61 | 62 | # NAN to right 63 | gain[..., 1] = curr - (hist - hist[..., :1]).min(axis=0) - (total - hist + hist[..., :1]).min(axis=0) 64 | gain[..., 1] *= cp.minimum(counts - counts[..., :1:], counts[..., -1:] - counts + counts[..., :1]) >= params[ 65 | 'min_data_in_leaf'] 66 | 67 | best_feat, best_gain, best_split, best_nan_left = get_best_split(gain, col_indexer) 68 | 69 | # move to CPU and apply min_gain_to_split condition 70 | unique_nodes, new_nodes_id, best_feat, best_gain, best_split, best_nan_left, is_valid_node = \ 71 | get_cpu_splitters(unique_nodes, best_feat, best_gain, best_split, best_nan_left, 72 | params['min_gain_to_split']) 73 | # if all nodes are not valid to split - exit 74 | if len(unique_nodes) == 0: 75 | break 76 | # write node info to the Tree 77 | tree.set_nodes(group, unique_nodes, new_nodes_id, best_feat, best_gain, best_split, best_nan_left) 78 | # get args back on gpu 79 | split_args, unique_nodes = get_gpu_splitters(unique_nodes, new_nodes_id, 80 | best_feat, best_split, best_nan_left) 81 | 82 | # perform split for train set 83 | nodes, node_indexes = make_split(nodes, arr, *split_args, return_pos=True) 84 | 85 | # update info for the next step 86 | if niter < (params['max_depth'] - 1): 87 | # update counts 88 | nodes_count = cp.zeros((unique_nodes.shape[0] + 1,), dtype=np.uint64) 89 | nodes_count.scatter_add(node_indexes[row_indexer], 1) 90 | nodes_count = nodes_count[:-1] 91 | 92 | cpu_counts = nodes_count.get() 93 | 94 | # remove unused rows from indexer 95 | if cpu_counts.sum() < row_indexer.shape[0]: 96 | row_indexer = row_indexer[isin(nodes, split_args[1].ravel(), index=row_indexer)] 97 | 98 | # save histogram for the subs trick 99 | prev_hist, 
small_index, big_index = get_prev_hist(cpu_counts, 100 | gh_hist, cp.asarray(is_valid_node)) 101 | 102 | return nodes 103 | 104 | 105 | class ClusterTreeBuilder: 106 | """Tree builder for early stopping clusters""" 107 | 108 | def __init__(self, borders, 109 | **tree_params 110 | ): 111 | """ 112 | 113 | Args: 114 | borders: list of np.ndarray, actual split borders for quantized features 115 | **tree_params: other tree building parameters 116 | """ 117 | self.borders = borders 118 | 119 | self.params = {**{ 120 | 121 | 'max_bin': 256, 122 | 'max_depth': 6, 123 | 'min_data_in_leaf': 10, 124 | 'min_gain_to_split': 0 125 | 126 | }, **tree_params} 127 | 128 | def build_tree(self, X, y): 129 | """Build tree 130 | 131 | Args: 132 | X: cp.ndarray, quantized feature matrix 133 | y: cp.ndarray, loss path matrix 134 | 135 | 136 | Returns: 137 | tree, Tree, constructed tree 138 | """ 139 | 140 | col_indexer = cp.arange(X.shape[1], dtype=cp.uint64) 141 | row_indexer = cp.arange(X.shape[0], dtype=cp.uint64) 142 | max_nodes = int((2 ** np.arange(self.params['max_depth'] + 1)).sum()) 143 | tree = Tree(max_nodes, y.shape[1], 1) 144 | # grow single group of the tree and get nodes index 145 | cluster_grow_tree(tree, 0, X, y, cp.ones((y.shape[0], 1), dtype=cp.float32), 146 | row_indexer, col_indexer, self.params) 147 | 148 | tree.set_borders(self.borders) 149 | tree.set_leaves() 150 | tree.set_node_values(np.zeros((max_nodes, 1), dtype=np.float32), np.zeros((1,), dtype=np.uint64)) 151 | 152 | return tree 153 | 154 | 155 | class ClusterCandidates(Ensemble): 156 | """ 157 | Ensemble of cluster candidates 158 | """ 159 | 160 | def __init__(self, depth_range=range(1, 7), min_data_in_leaf=100, debug=False): 161 | super().__init__() 162 | 163 | self._debug = debug 164 | self.depth_range = depth_range 165 | self.min_data_in_leaf = min_data_in_leaf 166 | self.max_clust = 2 ** max(depth_range) 167 | 168 | def fit(self, X, y): 169 | X, y, sample_weight, eval_sets = validate_input(X, y, None, []) 170 | mempool = cp.cuda.MemoryPool() 171 | with cp.cuda.using_allocator(allocator=mempool.malloc): 172 | # TODO: move quantizer to the Ensemble 173 | quantizer = QuantileQuantizer(sample=self.quant_sample, max_bin=self.max_bin) 174 | X_enc, max_bin, borders, eval_enc = self.quantize(X, eval_sets) 175 | 176 | self.fit_quantized(X_enc, y, max_bin, borders) 177 | mempool.free_all_blocks() 178 | 179 | return self 180 | 181 | def fit_quantized(self, X_enc, y, max_bin, borders): 182 | y = cp.array(y, order='C', dtype=cp.float32) 183 | X_cp = pad_and_move(X_enc) 184 | self.models = [] 185 | 186 | for d in self.depth_range: 187 | builder = ClusterTreeBuilder(borders, max_depth=d, min_data_in_leaf=self.min_data_in_leaf, max_bin=max_bin) 188 | 189 | tree = builder.build_tree(X_cp, y) 190 | tree.reformat(nfeats=X_cp.shape[1], debug=self._debug) 191 | self.models.append(tree) 192 | 193 | self.base_score = np.zeros((1,), dtype=np.float32) 194 | 195 | return self 196 | 197 | def predict(self, X, iterations=None, batch_size=100000): 198 | return self.predict_leaves(X, iterations=iterations, batch_size=batch_size)[..., 0].T 199 | -------------------------------------------------------------------------------- /py_boost/gpu/__init__.py: -------------------------------------------------------------------------------- 1 | """Contains the core functions and classes""" 2 | 3 | from .boosting import GradientBoosting 4 | 5 | # __all__ = ['GradientBoosting'] 6 | -------------------------------------------------------------------------------- 
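The `GradientBoosting` class defined next accepts user-defined callbacks through its `callbacks` argument. Below is a minimal illustrative sketch of such a callback; it relies only on the documented `build_info['num_iter']` field and the `Callback.after_iteration` contract, and the printed message and class name are hypothetical:

```python
from py_boost import GradientBoosting, Callback


class IterationPrinter(Callback):
    """Illustrative callback: print a message every `period` iterations."""

    def __init__(self, period=25):
        self.period = period

    def after_iteration(self, build_info):
        # build_info['num_iter'] holds the current iteration index
        num_iter = build_info['num_iter']
        if num_iter % self.period == 0:
            print(f'iteration {num_iter} finished')
        return False  # never request early termination


model = GradientBoosting('mse', ntrees=100, callbacks=[IterationPrinter(10)])
```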
/py_boost/gpu/boosting.py: -------------------------------------------------------------------------------- 1 | """Gradient boosting builder""" 2 | 3 | try: 4 | import cupy as cp 5 | except Exception: 6 | pass 7 | from .base import Ensemble 8 | from .losses import loss_alias 9 | from .tree import DepthwiseTreeBuilder 10 | from .utils import pad_and_move, validate_input 11 | from ..callbacks.callback import EarlyStopping, EvalHistory, CallbackPipeline 12 | from ..multioutput.sketching import GradSketch 13 | from ..multioutput.target_splitter import SingleSplitter, OneVsAllSplitter 14 | from ..sampling.bagging import BaseSampler 15 | 16 | 17 | class GradientBoosting(Ensemble): 18 | """Basic Gradient Boosting on depthwise trees""" 19 | 20 | def __init__(self, loss, 21 | metric=None, 22 | ntrees=100, 23 | lr=0.05, 24 | min_gain_to_split=0, 25 | lambda_l2=1, 26 | gd_steps=1, 27 | 28 | max_depth=6, 29 | min_data_in_leaf=10, 30 | colsample=1., 31 | subsample=1., 32 | target_splitter='Single', 33 | multioutput_sketch=None, 34 | use_hess=True, 35 | 36 | quantization='Quantile', 37 | quant_sample=2000000, 38 | max_bin=256, 39 | min_data_in_bin=3, 40 | 41 | es=100, 42 | seed=42, 43 | verbose=10, 44 | callbacks=None, 45 | 46 | debug=False 47 | ): 48 | """ 49 | 50 | Args: 51 | loss: str or Loss, loss function 52 | metric: None or str or Metric, metric 53 | ntrees: int, maximum number of trees 54 | lr: float, learning rate 55 | min_gain_to_split: float >=0, minimal gain to split 56 | lambda_l2: float > 0, l2 leaf regularization 57 | gd_steps: int > 0, number of gradient steps while computing leaf values 58 | 59 | max_depth: int > 0, maximum tree depth. Setting it to large values (>12) may cause OOM for wide datasets 60 | min_data_in_leaf: int, minimal leaf size. Note - for some loss fn leaf size is approximated 61 | with hessian values to speed up training 62 | colsample: float or Callable, subsample of columns used to construct trees, or a callable for custom sampling 63 | subsample: float or Callable, subsample of rows used to construct trees, or a callable for custom sampling 64 | 65 | target_splitter: str or Callable, target splitter, defines the multioutput strategy: 66 | 'Single', 'OneVsAll' or custom 67 | multioutput_sketch: None or Callable. Defines the sketching strategy to simplify the scoring function 68 | in the multioutput case. If None, the full scoring function is used 69 | use_hess: If True, hessians will be used in the tree structure search 70 | 71 | quantization: str or Quantizer, method for quantization. One of 'Quantile', 'Uniform', 72 | 'Uniquant' or custom implementation 73 | quant_sample: int, subsample to quantize features 74 | max_bin: int in [2, 256] maximum number of bins to quantize features 75 | min_data_in_bin: int in [2, 256] minimal bin size. NOTE: currently ignored 76 | 77 | es: int, early stopping rounds.
If 0, no early stopping 78 | seed: int, random state 79 | verbose: int, verbosity freq 80 | callbacks: list of Callback, callbacks to customize training are passed here 81 | 82 | debug: bool, if debug mode is enabled (it removes ability to use deprecated functions, 83 | thus optimizing memory usage) 84 | """ 85 | 86 | super().__init__() 87 | 88 | self.params = { 89 | 90 | 'loss': loss, 91 | 'metric': metric, 92 | 'ntrees': ntrees, 93 | 'lr': lr, 94 | 'min_gain_to_split': min_gain_to_split, 95 | 'lambda_l2': lambda_l2, 96 | 'gd_steps': gd_steps, 97 | 98 | 'max_depth': max_depth, 99 | 'min_data_in_leaf': min_data_in_leaf, 100 | 'colsample': colsample, 101 | 'subsample': subsample, 102 | 103 | 'target_splitter': target_splitter, 104 | 'multioutput_sketch': multioutput_sketch, 105 | 'use_hess': use_hess, 106 | 107 | 'quantization': quantization, 108 | 'quant_sample': quant_sample, 109 | 'max_bin': max_bin, 110 | 'min_data_in_bin': min_data_in_bin, 111 | 112 | 'es': es, 113 | 'seed': seed, 114 | 'verbose': verbose, 115 | 'callbacks': callbacks, 116 | 117 | 'debug': debug 118 | } 119 | 120 | def _infer_params(self): 121 | 122 | self.ntrees = self.params['ntrees'] 123 | self.lr = self.params['lr'] 124 | 125 | assert self.params['min_gain_to_split'] >= 0, 'Param min_gain_to_split should be >= 0' 126 | 127 | self.min_gain_to_split = self.params['min_gain_to_split'] 128 | self.lambda_l2 = self.params['lambda_l2'] 129 | self.gd_steps = self.params['gd_steps'] 130 | 131 | self.max_depth = self.params['max_depth'] 132 | self.min_data_in_leaf = self.params['min_data_in_leaf'] 133 | self.use_hess = self.params['use_hess'] 134 | 135 | self.colsample = self.params['colsample'] 136 | if type(self.params['colsample']) in [float, int]: 137 | self.colsample = BaseSampler(self.params['colsample'], axis=1) 138 | 139 | self.subsample = self.params['subsample'] 140 | if type(self.params['subsample']) in [float, int]: 141 | self.subsample = BaseSampler(self.params['subsample'], axis=0) 142 | 143 | if self.params['target_splitter'] == 'Single': 144 | splitter = SingleSplitter() 145 | elif self.params['target_splitter'] == 'OneVsAll': 146 | splitter = OneVsAllSplitter() 147 | else: 148 | splitter = self.params['target_splitter'] 149 | 150 | self.target_splitter = splitter 151 | 152 | self.multioutput_sketch = self.params['multioutput_sketch'] 153 | if self.params['multioutput_sketch'] is None: 154 | self.multioutput_sketch = GradSketch() 155 | 156 | self.quantization = self.params['quantization'] 157 | self.quant_sample = self.params['quant_sample'] 158 | self.max_bin = self.params['max_bin'] 159 | self.min_data_in_bin = self.params['min_data_in_bin'] 160 | 161 | self.es = self.params['es'] 162 | self.verbose = self.params['verbose'] 163 | 164 | self.loss = self.params['loss'] 165 | if type(self.params['loss']) is str: 166 | self.loss = loss_alias[self.params['loss']] 167 | 168 | self.postprocess_fn = self.loss.postprocess_output 169 | 170 | self.metric = self.params['metric'] 171 | if self.params['metric'] is None or type(self.params['metric']) is str: 172 | self.metric = self.loss.get_metric_from_string(self.params['metric']) 173 | self.seed = self.params['seed'] 174 | 175 | self.history = [] 176 | 177 | self.callbacks = CallbackPipeline( 178 | 179 | self.subsample, 180 | self.colsample, 181 | self.target_splitter, 182 | self.multioutput_sketch, 183 | *([] if self.params['callbacks'] is None else self.params['callbacks']), 184 | EvalHistory(self.history, verbose=self.params['verbose']), 185 | 
EarlyStopping(self.params['es']), 186 | ) 187 | 188 | def _fit(self, builder, build_info): 189 | """Fit with tree builder and build info 190 | 191 | Args: 192 | builder: DepthwiseTreeBuilder 193 | build_info: build info state dict 194 | 195 | Returns: 196 | 197 | """ 198 | train, valid = build_info['data']['train'], build_info['data']['valid'] 199 | self.callbacks.before_train(build_info) 200 | 201 | for i in range(self.ntrees): 202 | 203 | build_info['num_iter'] = i 204 | train['grad'], train['hess'] = self.loss(train['target'], train['ensemble']) 205 | 206 | self.callbacks.before_iteration(build_info) 207 | 208 | tree, leaves, preds, val_leaves, val_preds = \ 209 | builder.build_tree(train['features_gpu'], 210 | train['grad'], 211 | train['hess'], 212 | train['sample_weight'], 213 | lambda x: self.loss(train['target'], train['ensemble'] + x), 214 | *valid['features_gpu']) 215 | 216 | # update ensemble 217 | train['ensemble'] += preds 218 | for vp, tp in zip(valid['ensemble'], val_preds): 219 | vp += tp 220 | 221 | train['last_tree'] = { 222 | 223 | 'leaves': leaves, 224 | 'preds': preds 225 | 226 | } 227 | valid['last_tree'] = { 228 | 229 | 'leaves': val_leaves, 230 | 'preds': val_preds 231 | 232 | } 233 | 234 | self.models.append(tree) 235 | # check exit info 236 | if self.callbacks.after_iteration(build_info): 237 | tree.reformat(nfeats=self.nfeats, debug=self.params['debug']) 238 | break 239 | tree.reformat(nfeats=self.nfeats, debug=self.params['debug']) 240 | 241 | self.callbacks.after_train(build_info) 242 | self.base_score = self.base_score.get() 243 | 244 | def fit(self, X, y, sample_weight=None, eval_sets=None): 245 | """Fit model 246 | 247 | Args: 248 | X: np.ndarray feature matrix 249 | y: np.ndarray of target 250 | sample_weight: np.ndarray of sample weights 251 | eval_sets: list of dict of eval sets. Ex [{'X':X0, 'y': y0, 'sample_weight': w0}, ...}] 252 | 253 | Returns: 254 | trained instance 255 | """ 256 | self._infer_params() 257 | 258 | X, y, sample_weight, eval_sets = validate_input(X, y, sample_weight, eval_sets) 259 | # fit and free memory 260 | mempool = cp.cuda.MemoryPool() 261 | with cp.cuda.using_allocator(allocator=mempool.malloc): 262 | # quantize 263 | X_enc, max_bin, borders, eval_enc = self.quantize(X, eval_sets) 264 | # create build info 265 | builder, build_info = self._create_build_info(mempool, X, X_enc, y, sample_weight, 266 | max_bin, borders, eval_sets, eval_enc) 267 | self._fit(builder, build_info) 268 | mempool.free_all_blocks() 269 | 270 | return self 271 | 272 | def _create_build_info(self, mempool, X, X_enc, y, sample_weight, max_bin, borders, eval_sets, eval_enc): 273 | """Quantize data, create tree builder and build_info 274 | 275 | Args: 276 | mempool: cp.cuda.MemoryPool, memory pool to use 277 | X: np.ndarray feature matrix 278 | y: np.ndarray of target 279 | sample_weight: np.ndarray of sample weights 280 | eval_sets: list of dict of eval sets. 
Ex [{'X':X0, 'y': y0, 'sample_weight': w0}, ...}] 281 | 282 | Returns: 283 | DepthwiseTreeBuilder, build_info 284 | """ 285 | # quantization 286 | 287 | y = cp.array(y, order='C', dtype=cp.float32) 288 | 289 | if sample_weight is not None: 290 | sample_weight = cp.array(sample_weight, order='C', dtype=cp.float32) 291 | 292 | X_cp = pad_and_move(X_enc) 293 | 294 | X_val = [cp.array(x, order='C') for x in eval_enc] 295 | y_val = [cp.array(x['y'], order='C', dtype=cp.float32) for x in eval_sets] 296 | w_val = [None if x['sample_weight'] is None else cp.array(x['sample_weight'], order='C', dtype=cp.float32) 297 | for x in eval_sets] 298 | 299 | # save nfeatures for the feature importances 300 | self.nfeats = X.shape[1] 301 | 302 | builder = DepthwiseTreeBuilder(borders, 303 | use_hess=self.use_hess, 304 | colsampler=self.colsample, 305 | subsampler=self.subsample, 306 | target_splitter=self.target_splitter, 307 | multioutput_sketch=self.multioutput_sketch, 308 | gd_steps=self.gd_steps, 309 | lr=self.lr, 310 | min_gain_to_split=self.min_gain_to_split, 311 | min_data_in_leaf=self.min_data_in_leaf, 312 | lambda_l2=self.lambda_l2, 313 | max_depth=self.max_depth, 314 | max_bin=max_bin, 315 | ) 316 | cp.random.seed(self.seed) 317 | 318 | y = self.loss.preprocess_input(y) 319 | y_val = [self.loss.preprocess_input(x) for x in y_val] 320 | self.base_score = self.loss.base_score(y) 321 | 322 | # init ensembles 323 | ens = cp.empty((y.shape[0], self.base_score.shape[0]), order='C', dtype=cp.float32) 324 | ens[:] = self.base_score 325 | # init val ens 326 | val_ens = [cp.empty((x.shape[0], self.base_score.shape[0]), order='C') for x in y_val] 327 | for ve in val_ens: 328 | ve[:] = self.base_score 329 | 330 | self.models = [] 331 | 332 | build_info = { 333 | 'data': { 334 | 'train': { 335 | 'features_cpu': X, 336 | 'features_gpu': X_cp, 337 | 'target': y, 338 | 'sample_weight': sample_weight, 339 | 'ensemble': ens, 340 | 'grad': None, 341 | 'hess': None 342 | }, 343 | 'valid': { 344 | 'features_cpu': [dat['X'] for dat in eval_sets], 345 | 'features_gpu': X_val, 346 | 'target': y_val, 347 | 'sample_weight': w_val, 348 | 'ensemble': val_ens, 349 | } 350 | }, 351 | 'borders': borders, 352 | 'model': self, 353 | 'mempool': mempool, 354 | 'builder': builder 355 | } 356 | 357 | return builder, build_info 358 | 359 | def load(self, file): 360 | """Load weights fromm file 361 | 362 | Args: 363 | file: str, file path 364 | 365 | Returns: 366 | Py-Boost GradientBoosting 367 | """ 368 | self._infer_params() 369 | 370 | return super().load(file) 371 | -------------------------------------------------------------------------------- /py_boost/gpu/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .losses import * 2 | from .metrics import * 3 | from .multiclass_metrics import * 4 | 5 | __all__ = [ 6 | 7 | 'loss_alias', 8 | 'Loss', 9 | 'MSELoss', 10 | 'MSLELoss', 11 | 'BCELoss', 12 | 'CrossEntropyLoss', 13 | 14 | 'Metric', 15 | 'RMSEMetric', 16 | 'RMSLEMetric', 17 | 'R2Score', 18 | 'BCEMetric', 19 | 'AccuracyMetric', 20 | 'RocAucMetric', 21 | 22 | 'Precision', 23 | 'Recall', 24 | 'F1Score', 25 | 26 | 'MultiAccuracyMetric', 27 | 'MultiPrecision', 28 | 'MultiRecall', 29 | 'MultiF1Score' 30 | 31 | ] 32 | -------------------------------------------------------------------------------- /py_boost/gpu/losses/losses.py: -------------------------------------------------------------------------------- 1 | """Common losses""" 2 | 3 | import numpy as np 4 | try: 5 | import 
cupy as cp 6 | CUDA_FOUND = True 7 | except Exception: 8 | CUDA_FOUND = False 9 | 10 | from .metrics import metric_alias, RMSEMetric, RMSLEMetric, BCEMetric 11 | from .multiclass_metrics import multiclass_metric_alias, CrossEntropyMetric 12 | 13 | 14 | class Loss: 15 | """Base class to define loss function""" 16 | 17 | def get_grad_hess(self, y_true, y_pred): 18 | """Method implements how to calculate gradients and hessians. 19 | Output gradient should have the shape (n_samples, n_outputs) 20 | Output hessian should have the shape (n_samples, n_outputs) or (n_samples, 1) 21 | if the same hess is used for all outputs (for ex. MSELoss) 22 | 23 | The definition doesn't use sample_weight, because it is applied later, at the tree building stage 24 | 25 | Args: 26 | y_true: cp.ndarray, target 27 | y_pred: cp.ndarray, current prediction 28 | 29 | Returns: 30 | 31 | """ 32 | raise NotImplementedError 33 | 34 | def __call__(self, y_true, y_pred): 35 | grad, hess = self.get_grad_hess(y_true, y_pred) 36 | return grad, hess 37 | 38 | def preprocess_input(self, y_true): 39 | """Method defines how the raw input target variable should be processed before training starts 40 | (ex. applying log1p for MSLELoss) 41 | 42 | Args: 43 | y_true: cp.ndarray, raw target 44 | 45 | Returns: 46 | 47 | """ 48 | return y_true 49 | 50 | def postprocess_output(self, y_pred): 51 | """Method defines how to postprocess the sum of trees to the output prediction (ex. apply sigmoid for BCELoss) 52 | 53 | Args: 54 | y_pred: cp.ndarray, predictions 55 | 56 | Returns: 57 | 58 | """ 59 | return y_pred 60 | 61 | def base_score(self, y_true): 62 | """Method defines how to initialize an empty ensemble (ex. initialize with mean values for MSELoss) 63 | 64 | Args: 65 | y_true: cp.ndarray, processed target (after applying preprocess_input) 66 | 67 | Returns: 68 | 69 | """ 70 | raise NotImplementedError 71 | 72 | def get_metric_from_string(self, name=None): 73 | """Method defines how to interpret an eval metric given as a string or None. 74 | For example, you can define the default metric to use here if name is None 75 | 76 | Args: 77 | name: None or str, metric name alias 78 | 79 | Returns: 80 | 81 | """ 82 | return metric_alias[name] 83 | 
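# --- Illustrative example (documentation sketch, not part of the original losses.py) ---
# A minimal custom loss that follows the Loss interface above: Poisson regression with a log link.
# With mu = exp(pred) the gradient is mu - y and the hessian is mu; base_score starts the ensemble
# at log(mean(y)). Falling back to RMSEMetric as the default metric is an assumption of this sketch.

class PoissonLossExample(Loss):
    """Sketch of a user-defined loss: Poisson deviance with a log link"""

    def get_grad_hess(self, y_true, y_pred):
        mu = cp.exp(y_pred)
        return mu - y_true, mu

    def base_score(self, y_true):
        # start from the log of the per-output target mean, clipped away from zero
        return cp.log(cp.clip(y_true.mean(axis=0), 1e-10, None))

    def postprocess_output(self, y_pred):
        xp = np if type(y_pred) is np.ndarray else cp
        return xp.exp(y_pred)

    def get_metric_from_string(self, name=None):
        if name is None:
            return RMSEMetric()
        return metric_alias[name]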
84 | 85 | class MSELoss(Loss): 86 | """MSE Loss function for regression/multiregression""" 87 | 88 | def get_grad_hess(self, y_true, y_pred): 89 | return (y_pred - y_true), cp.ones((y_true.shape[0], 1), dtype=cp.float32) 90 | 91 | def base_score(self, y_true): 92 | return y_true.mean(axis=0) 93 | 94 | def get_metric_from_string(self, name=None): 95 | if name is None: 96 | return RMSEMetric() 97 | return metric_alias[name] 98 | 99 | 100 | class MSLELoss(Loss): 101 | """MSLE Loss function for regression/multiregression""" 102 | 103 | def preprocess_input(self, y_true): 104 | assert (y_true >= 0).all(), 'Inputs for msle should be non-negative' 105 | 106 | return y_true 107 | 108 | def get_grad_hess(self, y_true, y_pred): 109 | y_true = cp.log1p(y_true) 110 | 111 | return (y_pred - y_true), cp.ones((y_true.shape[0], 1), dtype=cp.float32) 112 | 113 | def postprocess_output(self, y_pred): 114 | return cp.expm1(y_pred) 115 | 116 | def base_score(self, y_true): 117 | y_true = cp.log1p(y_true) 118 | return y_true.mean(axis=0) 119 | 120 | def get_metric_from_string(self, name=None): 121 | if name is None: 122 | return RMSLEMetric() 123 | return metric_alias[name] 124 | 125 | 126 | class BCELoss(Loss): 127 | """LogLoss for binary/multilabel classification""" 128 | 129 | def __init__(self, clip_value=1e-7): 130 | self.clip_value = clip_value 131 | 132 | def base_score(self, y_true): 133 | means = cp.clip(y_true.mean(axis=0), self.clip_value, 1 - self.clip_value) 134 | return cp.log(means / (1 - means)) 135 | 136 | def get_grad_hess(self, y_true, y_pred): 137 | pred = 1 / (1 + cp.exp(-y_pred)) 138 | pred = cp.clip(pred, self.clip_value, 1 - self.clip_value) 139 | grad = pred - y_true 140 | hess = pred * (1 - pred) 141 | 142 | return grad, hess 143 | 144 | def postprocess_output(self, y_pred): 145 | xp = np if type(y_pred) is np.ndarray else cp 146 | pred = 1 / (1 + xp.exp(-y_pred)) 147 | pred = xp.clip(pred, self.clip_value, 1 - self.clip_value) 148 | 149 | return pred 150 | 151 | def get_metric_from_string(self, name=None): 152 | if name is None: 153 | return BCEMetric() 154 | return metric_alias[name] 155 | 156 | 157 | def softmax(x, clip_val=1e-5): 158 | 159 | xp = np if type(x) is np.ndarray else cp 160 | exp_p = xp.exp(x - x.max(axis=1, keepdims=True)) 161 | 162 | return xp.clip(exp_p / exp_p.sum(axis=1, keepdims=True), clip_val, 1 - clip_val) 163 | 164 | 165 | # multiclass losses 166 | 167 | ce_grad_kernel = cp.ElementwiseKernel( 168 | 'T pred, raw S label, raw S nlabels, T factor', 169 | 'T grad, T hess', 170 | 171 | """ 172 | int y_pr = i % nlabels; 173 | int y_tr = label[i / nlabels]; 174 | 175 | grad = pred - (float) (y_pr == y_tr); 176 | hess = pred * (1. 
- pred) * factor; 177 | 178 | """, 179 | "ce_grad_kernel" 180 | ) if CUDA_FOUND else None 181 | 182 | 183 | def ce_grad(y_true, y_pred): 184 | factor = y_pred.shape[1] / (y_pred.shape[1] - 1) 185 | grad, hess = ce_grad_kernel(y_pred, y_true, y_pred.shape[1], factor) 186 | 187 | return grad, hess 188 | 189 | 190 | class CrossEntropyLoss(Loss): 191 | """CrossEntropy for multiclass classification""" 192 | 193 | def __init__(self, clip_value=1e-6): 194 | self.clip_value = clip_value 195 | 196 | def base_score(self, y_true): 197 | num_classes = int(y_true.max() + 1) 198 | hist = cp.zeros((num_classes,), dtype=cp.float32) 199 | 200 | return hist 201 | 202 | def get_grad_hess(self, y_true, y_pred): 203 | pred = softmax(y_pred, self.clip_value) 204 | grad, hess = ce_grad(y_true, pred) 205 | return grad, hess 206 | 207 | def postprocess_output(self, y_pred): 208 | 209 | return softmax(y_pred, self.clip_value) 210 | 211 | def preprocess_input(self, y_true): 212 | return y_true[:, 0].astype(cp.int32) 213 | 214 | def get_metric_from_string(self, name=None): 215 | if name is None: 216 | return CrossEntropyMetric() 217 | return multiclass_metric_alias[name] 218 | 219 | 220 | loss_alias = { 221 | 222 | # for bce 223 | 'binary': BCELoss(), 224 | 'bce': BCELoss(), 225 | 'multilabel': BCELoss(), 226 | 'logloss': BCELoss(), 227 | 228 | # for multiclass 229 | 'multiclass': CrossEntropyLoss(), 230 | 'crossentropy': CrossEntropyLoss(), 231 | 232 | # for regression 233 | 'mse': MSELoss(), 234 | 'regression': MSELoss(), 235 | 'l2': MSELoss(), 236 | 'multitask': MSELoss(), 237 | 'msle': MSLELoss() 238 | 239 | } 240 | -------------------------------------------------------------------------------- /py_boost/gpu/losses/metrics.py: -------------------------------------------------------------------------------- 1 | """Common metrics""" 2 | 3 | try: 4 | import cupy as cp 5 | except Exception: 6 | pass 7 | 8 | 9 | class Metric: 10 | """Base class to define eval metric function. 11 | Metric could be defined in 2 ways: 12 | 13 | - redefine .error method. Preferred if possible. Simplified metric definition by calculating error function 14 | for each point (ex. see RMSEMetric). If metric is defined via .error it also could be used with AdvancedES 15 | 16 | - redefine __call__ method. Used for more complex functions, like ROC-AUC. Handling sample_weight 17 | should be done manually here if needed 18 | 19 | 20 | """ 21 | alias = 'score' # defines how metric will be named in the output log 22 | 23 | def error(self, y_true, y_pred): 24 | """Simplified metric definition via individual objects error 25 | 26 | Args: 27 | y_true: cp.array, targets 28 | y_pred: cp.array, predictions 29 | 30 | Returns: 31 | float, metric value 32 | """ 33 | raise ValueError('Pointwise error is not implemented for this metric') 34 | 35 | def __call__(self, y_true, y_pred, sample_weight=None): 36 | """Full metric definition. 
Default is just weighted aggregation of pointwise errors 37 | 38 | Args: 39 | y_true: cp.array, targets 40 | y_pred: cp.array, predictions 41 | sample_weight: None or cp.ndarray, weights 42 | 43 | Returns: 44 | float, metric value 45 | """ 46 | err = self.error(y_true, y_pred) 47 | shape = err.shape 48 | assert shape[0] == y_true.shape[0], 'Error shape should match target shape at first dim' 49 | 50 | if len(shape) == 1: 51 | err = err[:, cp.newaxis] 52 | 53 | if sample_weight is None: 54 | return err.mean() 55 | 56 | err = (err.mean(axis=1, keepdims=True) * sample_weight).sum() / sample_weight.sum() 57 | return err 58 | 59 | def compare(self, v0, v1): 60 | """Method defines how to decide if metric was improved. Commonly it should be one of 'v0 > v1' or 'v0 < v1 ' 61 | 62 | Args: 63 | v0: float, metric value 64 | v1: float, metric value 65 | 66 | Returns: 67 | bool, if v0 improves score against v1 68 | """ 69 | raise NotImplementedError 70 | 71 | def argmax(self, arr): 72 | """Select best metric from list of metrics based on .compare method 73 | 74 | Args: 75 | arr: list of float, metric values 76 | 77 | Returns: 78 | int, position of the best metric value 79 | """ 80 | best = arr[0] 81 | best_n = 0 82 | 83 | for n, val in enumerate(arr[1:], 1): 84 | if self.compare(val, best): 85 | best = val 86 | best_n = n 87 | 88 | return best_n 89 | 90 | 91 | class RMSEMetric(Metric): 92 | """RMSE Metric for the regression/multiregression task""" 93 | alias = 'rmse' 94 | 95 | def error(self, y_true, y_pred): 96 | return (y_true - y_pred) ** 2 97 | 98 | def __call__(self, y_true, y_pred, sample_weight=None): 99 | return super().__call__(y_true, y_pred, sample_weight) ** .5 100 | 101 | def compare(self, v0, v1): 102 | return v0 < v1 103 | 104 | 105 | class R2Score(RMSEMetric): 106 | """R2 Score Metric for the regression/multiregression task""" 107 | alias = 'R2_score' 108 | 109 | def __call__(self, y_true, y_pred, sample_weight=None): 110 | 111 | if sample_weight is not None: 112 | err = ((y_true - y_pred) ** 2 * sample_weight).sum(axis=0) / sample_weight.sum() 113 | std = ((y_true - y_true.mean(axis=0)) ** 2 * sample_weight).sum(axis=0) / sample_weight.sum() 114 | else: 115 | err = ((y_true - y_pred) ** 2).mean(axis=0) 116 | std = y_true.var(axis=0) 117 | 118 | return (1 - err / std).mean() 119 | 120 | def compare(self, v0, v1): 121 | return v0 > v1 122 | 123 | 124 | class RMSLEMetric(RMSEMetric): 125 | """RMSLE Metric for the regression/multiregression classification task""" 126 | alias = 'rmsle' 127 | 128 | def error(self, y_true, y_pred): 129 | return super().error(cp.log1p(y_true), cp.log1p(y_pred)) 130 | 131 | 132 | class BCEMetric(Metric): 133 | """LogLoss for the binary/multilabel classification task""" 134 | alias = 'BCE' 135 | 136 | def error(self, y_true, y_pred): 137 | return -cp.log(y_true * y_pred + (1 - y_pred) * (1 - y_true)) 138 | 139 | def compare(self, v0, v1): 140 | return v0 < v1 141 | 142 | 143 | def auc(y, x, sample_weight=None): 144 | """Roc-auc score via cupy 145 | 146 | Args: 147 | y: cp.ndarray, 1d prediction 148 | x: cp.ndarray, 1d binary target 149 | sample_weight: optional 1d array of sample weights 150 | 151 | Returns: 152 | float, roc-auc metric value 153 | """ 154 | unique_x = cp.unique(x) 155 | 156 | if unique_x.shape[0] <= 1: 157 | return 0.5 158 | 159 | if sample_weight is None: 160 | sample_weight = cp.ones_like(y) 161 | 162 | rank_x = cp.searchsorted(unique_x, x) 163 | 164 | sum_1 = cp.zeros_like(unique_x, dtype=cp.float64) 165 | sum_1.scatter_add(rank_x, 
sample_weight * y) 166 | 167 | sum_0 = cp.zeros_like(unique_x, dtype=cp.float64) 168 | sum_0.scatter_add(rank_x, sample_weight * (1 - y)) 169 | 170 | cs_0 = sum_0.cumsum() 171 | auc_ = (cs_0 - sum_0 / 2) * sum_1 172 | 173 | tot = cs_0[-1] * sum_1.sum() 174 | 175 | return float(auc_.sum() / tot) 176 | 177 | 178 | class RocAucMetric(Metric): 179 | """Roc-auc metric for the binary classification task""" 180 | alias = 'AUC' 181 | 182 | def __call__(self, y_true, y_pred, sample_weight=None): 183 | """ 184 | 185 | Args: 186 | y_true: cp.ndarray of targets 187 | y_pred: cp.ndarray of predictions 188 | sample_weight: None or cp.ndarray of sample_weights 189 | 190 | Returns: 191 | 192 | """ 193 | assert y_pred.shape[1] == 1, 'Multioutput is not supported' 194 | 195 | if sample_weight is not None: 196 | sample_weight = sample_weight[:, 0] 197 | 198 | return auc(y_true[:, 0], y_pred[:, 0], sample_weight) 199 | 200 | def compare(self, v0, v1): 201 | return v0 > v1 202 | 203 | 204 | class ThresholdMetrics(Metric): 205 | """Basic class to handle metrics, that accept class label prediction as input""" 206 | 207 | def __init__(self, threshold=0.5, q=None): 208 | """Define binarization rule. If quantile is given, threshold defined with quantile 209 | 210 | Args: 211 | threshold: float, threshold value 212 | q: None or float, quantile threshold 213 | """ 214 | self.threshold = threshold 215 | self.q = q 216 | 217 | def get_label(self, y_pred): 218 | """Get labels from probabilities 219 | 220 | Args: 221 | y_pred: cp.ndarray, predictions 222 | 223 | Returns: 224 | cp.ndarray, predicted class labels 225 | """ 226 | threshold = self.threshold 227 | if self.q is not None: 228 | threshold = cp.quantile(y_pred, self.q, axis=0, interpolation='higher') 229 | 230 | return y_pred >= threshold 231 | 232 | def get_stats(self, y_true, y_pred, sample_weight=None, mode='f1'): 233 | """Helpful utils to calc Precision/Recall/F1 234 | 235 | Args: 236 | y_true: cp.ndarray, target 237 | y_pred: cp.ndarray, predicted class label 238 | sample_weight: None or cp.ndarray, weights 239 | mode: 240 | 241 | Returns: 242 | 243 | """ 244 | 245 | y_pred = self.get_label(y_pred) 246 | true = y_pred == y_true 247 | 248 | tp = true * y_pred 249 | if sample_weight is not None: 250 | tp = tp * sample_weight 251 | tp = tp.sum(axis=0) 252 | 253 | if mode == 'p': 254 | if sample_weight is not None: 255 | return tp, (y_pred * sample_weight).sum(axis=0) 256 | return tp, y_pred.sum(axis=0) 257 | 258 | if sample_weight is not None: 259 | tot = (y_true * sample_weight).sum(axis=0) 260 | else: 261 | tot = y_true.sum(axis=0) 262 | if mode == 'r': 263 | return tp, tot 264 | 265 | if sample_weight is not None: 266 | tot_p = (y_pred * sample_weight).sum(axis=0) 267 | else: 268 | tot_p = y_pred.sum(axis=0) 269 | 270 | return tp, tot, tot_p 271 | 272 | def compare(self, v0, v1): 273 | return v0 > v1 274 | 275 | 276 | class AccuracyMetric(ThresholdMetrics): 277 | """Accuracy Metric for the binary/multilabel classification task""" 278 | alias = 'Accuracy' 279 | 280 | def error(self, y_true, y_pred): 281 | y_pred = self.get_label(y_pred) 282 | return (y_true == y_pred).mean(axis=1) 283 | 284 | 285 | class Precision(ThresholdMetrics): 286 | """Precision Metric for the binary/multilabel classification task""" 287 | alias = 'Precision' 288 | 289 | def __call__(self, y_true, y_pred, sample_weight=None): 290 | tp, tot = self.get_stats(y_true, y_pred, sample_weight, mode='p') 291 | tot = cp.clip(tot, 1e-5, None) 292 | return (tp / tot).mean() 293 | 294 | 295 | class 
Recall(ThresholdMetrics): 296 | """Recall Metric for the binary/multilabel classification task""" 297 | alias = 'Recall' 298 | 299 | def __call__(self, y_true, y_pred, sample_weight=None): 300 | tp, tot = self.get_stats(y_true, y_pred, sample_weight, mode='r') 301 | tot = cp.clip(tot, 1e-5, None) 302 | return (tp / tot).mean() 303 | 304 | 305 | class F1Score(ThresholdMetrics): 306 | """F1 Score Metric for the binary/multilabel classification task""" 307 | alias = 'F1_score' 308 | 309 | def __call__(self, y_true, y_pred, sample_weight=None): 310 | tp, tot, tot_p = self.get_stats(y_true, y_pred, sample_weight, mode='f1') 311 | precision = tp / cp.clip(tot_p, 1e-5, None) 312 | recall = tp / cp.clip(tot, 1e-5, None) 313 | 314 | return (2 * (precision * recall) / cp.clip(precision + recall, 1e-5, None)).mean() 315 | 316 | 317 | metric_alias = { 318 | 319 | # for bce 320 | 'bce': BCEMetric(), 321 | 'logloss': BCEMetric(), 322 | 323 | 'precision': Precision(), 324 | 'recall': Recall(), 325 | 'f1_score': F1Score(), 326 | 'f1': F1Score(), 327 | 328 | 'accuracy': AccuracyMetric(), 329 | 'acc': AccuracyMetric(), 330 | 331 | 'auc': RocAucMetric(), 332 | 'roc': RocAucMetric(), 333 | 334 | # for regression 335 | 'rmse': RMSEMetric(), 336 | 'l2': RMSEMetric(), 337 | 'rmsle': RMSLEMetric(), 338 | 'r2': R2Score(), 339 | 'r2_score': R2Score(), 340 | 341 | } 342 | -------------------------------------------------------------------------------- /py_boost/gpu/losses/multiclass_metrics.py: -------------------------------------------------------------------------------- 1 | """Common multiclass metrics""" 2 | 3 | try: 4 | import cupy as cp 5 | except Exception: 6 | pass 7 | 8 | from .metrics import Metric, metric_alias 9 | 10 | 11 | class CrossEntropyMetric(Metric): 12 | """CrossEntropy Metric for the multiclassification task""" 13 | alias = 'Crossentropy' 14 | 15 | def error(self, y_true, y_pred): 16 | return -cp.log(cp.take_along_axis(y_pred, y_true[:, cp.newaxis], axis=1)) 17 | 18 | def compare(self, v0, v1): 19 | return v0 < v1 20 | 21 | 22 | class MultiAccuracyMetric(Metric): 23 | """Accuracy Metric for the multiclassification task""" 24 | alias = 'Accuracy' 25 | 26 | def error(self, y_true, y_pred): 27 | cl_pred = y_pred.argmax(axis=1) 28 | return (cl_pred == y_true).astype(cp.float32) 29 | 30 | def compare(self, v0, v1): 31 | return v0 > v1 32 | 33 | 34 | class MultiMetric(Metric): 35 | """Basic class to handle metrics, that accept class label prediction as input for the multiclassificationn task""" 36 | 37 | def __init__(self, average='macro'): 38 | """ 39 | 40 | Args: 41 | average: str, one of 'micro' / 'macro' 42 | """ 43 | self.average = average 44 | 45 | @staticmethod 46 | def get_stats(y_true, y_pred, sample_weight=None, mode='f1'): 47 | """Helpful utils to calc Precision/Recall/F1 48 | 49 | Args: 50 | y_true: cp.ndarray, target 51 | y_pred: cp.ndarray, predicted class label 52 | sample_weight: None or cp.ndarray, weights 53 | mode: 54 | 55 | Returns: 56 | 57 | """ 58 | 59 | if sample_weight is None: 60 | sample_weight = cp.ones(y_true.shape, dtype=cp.float32) 61 | else: 62 | sample_weight = sample_weight[:, 0] 63 | 64 | cl_pred = y_pred.argmax(axis=1) 65 | true = y_true == cl_pred 66 | 67 | tp = cp.zeros(y_pred.shape[1], dtype=cp.float64) 68 | tp.scatter_add(cl_pred, true * sample_weight) 69 | 70 | tot = cp.zeros(y_pred.shape[1], dtype=cp.float64) 71 | if mode == 'p': 72 | tot.scatter_add(cl_pred, sample_weight) 73 | return tp, tot 74 | 75 | tot.scatter_add(y_true, sample_weight) 76 | if mode == 
'r': 77 | return tp, tot 78 | 79 | tot_p = cp.zeros(y_pred.shape[1], dtype=cp.float64) 80 | tot_p.scatter_add(cl_pred, sample_weight) 81 | 82 | return tp, tot, tot_p 83 | 84 | def get_metric(self, tp, tot): 85 | 86 | tot = cp.clip(tot, 1e-5, None) 87 | 88 | if self.average == 'micro': 89 | return float(tp.sum() / tot.sum()) 90 | 91 | return float((tp / tot).mean()) 92 | 93 | def compare(self, v0, v1): 94 | return v0 > v1 95 | 96 | 97 | class MultiPrecision(MultiMetric): 98 | """Precision Metric for the multiclassification classification task""" 99 | alias = 'Precision' 100 | 101 | def __call__(self, y_true, y_pred, sample_weight=None): 102 | tp, tot = self.get_stats(y_true, y_pred, sample_weight=sample_weight, mode='p') 103 | return self.get_metric(tp, tot) 104 | 105 | 106 | class MultiRecall(MultiMetric): 107 | """Recall Metric for the multiclassification classification task""" 108 | alias = 'Recall' 109 | 110 | def __call__(self, y_true, y_pred, sample_weight=None): 111 | tp, tot = self.get_stats(y_true, y_pred, sample_weight=sample_weight, mode='r') 112 | return self.get_metric(tp, tot) 113 | 114 | 115 | class MultiF1Score(MultiMetric): 116 | """F1 Score Metric for the multiclassification classification task""" 117 | alias = 'F1_score' 118 | 119 | def __call__(self, y_true, y_pred, sample_weight=None): 120 | tp, tot, tot_p = self.get_stats(y_true, y_pred, sample_weight=sample_weight, mode='f1') 121 | precision = self.get_metric(tp, tot_p) 122 | recall = self.get_metric(tp, tot) 123 | return 2 * (precision * recall) / (precision + recall) 124 | 125 | 126 | multiclass_metric_alias = {**metric_alias, **{ 127 | 128 | 'crossentropy': CrossEntropyMetric(), 129 | 130 | 'precision': MultiPrecision(), 131 | 'recall': MultiRecall(), 132 | 'f1_score': MultiF1Score(), 133 | 'f1': MultiF1Score(), 134 | 135 | 'accuracy': MultiAccuracyMetric(), 136 | 'acc': MultiAccuracyMetric(), 137 | 138 | }} 139 | -------------------------------------------------------------------------------- /py_boost/gpu/serialization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tqdm 3 | import ujson as json 4 | 5 | from .tree import Tree 6 | 7 | 8 | def nested_float_cast(arr): 9 | if type(arr[0]) is list: 10 | for i in range(len(arr)): 11 | arr[i] = nested_float_cast(arr[i]) 12 | 13 | else: 14 | for i in range(len(arr)): 15 | arr[i] = float(arr[i]) 16 | 17 | return arr 18 | 19 | 20 | def handle_float(arr): 21 | arr = arr.astype(str) 22 | arr = arr.tolist() 23 | arr = nested_float_cast(arr) 24 | 25 | return arr 26 | 27 | 28 | def parse_tree(tree): 29 | """Parse single tree 30 | 31 | Args: 32 | tree: Py-Boost Tree 33 | 34 | Returns: 35 | dict 36 | """ 37 | D = {} 38 | 39 | for key in tree.__dict__: 40 | 41 | value = tree.__dict__[key] 42 | if value is None: 43 | continue 44 | 45 | if type(value) is np.ndarray: 46 | 47 | if np.issubdtype(value.dtype, np.floating): 48 | value = handle_float(value) 49 | else: 50 | value = value.tolist() 51 | 52 | D[key] = value 53 | 54 | return D 55 | 56 | 57 | def parse_model(model): 58 | """Parse model 59 | 60 | Args: 61 | model: Py-Boost Ensemble 62 | 63 | Returns: 64 | dict 65 | """ 66 | model.to_cpu() 67 | 68 | D = {'base_score': handle_float(model.base_score)} 69 | 70 | for n, tree in enumerate(tqdm.tqdm(model.models)): 71 | D[n] = parse_tree(tree) 72 | 73 | return D 74 | 75 | 76 | def dump(model, file): 77 | """Parse model and save the results 78 | 79 | Args: 80 | model: Py-Boost Ensemble 81 | file: str, path to file 82 
| 83 | Returns: 84 | 85 | """ 86 | with open(file, 'w') as f: 87 | json.dump(parse_model(model), f) 88 | 89 | return 90 | 91 | 92 | attr_types = { 93 | 94 | 'values': np.float32, 95 | 'group_index': np.uint64, 96 | 'feature_importance_gain': np.float32, 97 | 'feature_importance_split': np.float32, 98 | 'test_format': np.float32, 99 | 'test_format_offsets': np.int32 100 | 101 | } 102 | 103 | 104 | def load_tree(D): 105 | """Create single tree from dict 106 | 107 | Args: 108 | D: dict 109 | 110 | Returns: 111 | Py-Boost Tree 112 | """ 113 | tree = Tree(1, 1, 1) 114 | # delete unused attrs 115 | for key in ['gains', 'feats', 'bin_splits', 'nans', 'split', 'val_splits', 'leaves']: 116 | setattr(tree, key, None) 117 | 118 | # set new attrs 119 | for key in D: 120 | value = D[key] 121 | 122 | if type(value) is list: 123 | value = np.asarray(value) 124 | 125 | if key in attr_types: 126 | value = value.astype(attr_types[key]) 127 | 128 | setattr(tree, key, value) 129 | 130 | return tree 131 | 132 | 133 | def load_model(D, model): 134 | """Update model data with dict values 135 | 136 | Args: 137 | D: dict 138 | model: Py-Boost Ensemble 139 | 140 | Returns: 141 | Py-Boost Ensemble 142 | """ 143 | model.base_score = np.asarray(D.pop('base_score')).astype(np.float32) 144 | 145 | trees = [None] * len(D) 146 | 147 | for key in D: 148 | trees[int(key)] = load_tree(D[key]) 149 | 150 | model.models = trees 151 | 152 | return model 153 | 154 | 155 | def load(model, file): 156 | """Read data from json and update Py-Boost model data 157 | 158 | Args: 159 | model: Py-Boost Ensemble 160 | file: str, file path 161 | 162 | Returns: 163 | Py-Boost Ensemble 164 | """ 165 | with open(file, 'r') as f: 166 | load_model(json.load(f), model) 167 | 168 | return model 169 | -------------------------------------------------------------------------------- /py_boost/gpu/sketch_boost.py: -------------------------------------------------------------------------------- 1 | """Implements SketchBoost for multioutput class""" 2 | 3 | from .boosting import GradientBoosting 4 | from ..multioutput.sketching import FilterSketch, TopOutputsSketch, SVDSketch, RandomSamplingSketch, \ 5 | RandomProjectionSketch 6 | 7 | 8 | class SketchBoost(GradientBoosting): 9 | """ 10 | Gradient Boosting with built-in FilterSketch to handle multioutput tasks. If single output is passed, 11 | it is handled as usual 12 | """ 13 | 14 | def __init__(self, loss, 15 | metric=None, 16 | ntrees=100, 17 | lr=0.05, 18 | min_gain_to_split=0, 19 | lambda_l2=1, 20 | gd_steps=1, 21 | max_depth=6, 22 | min_data_in_leaf=10, 23 | colsample=1., 24 | subsample=1., 25 | 26 | quantization='Quantile', 27 | quant_sample=2000000, 28 | max_bin=256, 29 | min_data_in_bin=3, 30 | 31 | es=100, 32 | seed=42, 33 | verbose=10, 34 | 35 | sketch_outputs=1, 36 | sketch_method='proj', 37 | use_hess=False, 38 | 39 | callbacks=None, 40 | sketch_params=None 41 | ): 42 | """ 43 | 44 | Args: 45 | loss: str or Loss, loss function 46 | metric: None or str or Metric, metric 47 | ntrees: int, maximum number of trees 48 | lr: float, learning rate 49 | min_gain_to_split: float >=0, minimal gain to split 50 | lambda_l2: float > 0, l2 leaf regularization 51 | gd_steps: int > 0, number of gradient steps 52 | max_depth: int > 0, maximum tree depth. Setting it to large values (>12) may cause OOM for wide datasets 53 | min_data_in_leaf: int, minimal leaf size. 
Note - for some loss fn leaf size is approximated 54 | with hessian values to speed up training 55 | colsample: float or Callable, sumsample of columns to construct trees or callable - custom sampling 56 | subsample: float or Callable, sumsample of rows to construct trees or callable - custom sampling 57 | 58 | quantization: str or Quantizer, method for quantizatrion. One of 'Quantile', 'Uniform', 59 | 'Uniquant' or custom implementation 60 | quant_sample: int, subsample to quantize features 61 | max_bin: int in [2, 256] maximum number of bins to quantize features 62 | min_data_in_bin: int in [2, 256] minimal bin size. NOTE: currently ignored 63 | 64 | es: int, early stopping rounds. If 0, no early stopping 65 | seed: int, random state 66 | verbose: int, verbosity freq 67 | sketch_outputs: int, number of outputs to keep 68 | sketch_method: str, name of the sketching strategy 69 | use_hess: bool, use hessians in multioutput training 70 | callbacks: list of Callback, callbacks to customize training are passed here 71 | sketch_params: dict, optional kwargs for sketching strategy 72 | """ 73 | if sketch_params is None: 74 | sketch_params = {} 75 | 76 | if sketch_method == 'filter': 77 | sketch = FilterSketch(sketch_outputs, **sketch_params) 78 | 79 | elif sketch_method == 'svd': 80 | sketch = SVDSketch(sketch_outputs, **sketch_params) 81 | 82 | elif sketch_method == 'topk': 83 | sketch = TopOutputsSketch(sketch_outputs) 84 | 85 | elif sketch_method == 'rand': 86 | sketch = RandomSamplingSketch(sketch_outputs, **sketch_params) 87 | 88 | elif sketch_method == 'proj': 89 | sketch = RandomProjectionSketch(sketch_outputs, **sketch_params) 90 | 91 | elif sketch_method is None: 92 | sketch = None 93 | 94 | else: 95 | raise ValueError('Unknown sketching strategy') 96 | 97 | super().__init__(loss=loss, 98 | metric=metric, 99 | ntrees=ntrees, 100 | lr=lr, 101 | min_gain_to_split=min_gain_to_split, 102 | lambda_l2=lambda_l2, 103 | gd_steps=gd_steps, 104 | max_depth=max_depth, 105 | min_data_in_leaf=min_data_in_leaf, 106 | colsample=colsample, 107 | subsample=subsample, 108 | 109 | quantization=quantization, 110 | quant_sample=quant_sample, 111 | max_bin=max_bin, 112 | min_data_in_bin=min_data_in_bin, 113 | 114 | target_splitter='Single', 115 | multioutput_sketch=sketch, 116 | use_hess=use_hess, 117 | es=es, 118 | seed=seed, 119 | verbose=verbose, 120 | callbacks=callbacks) 121 | -------------------------------------------------------------------------------- /py_boost/gpu/tree.py: -------------------------------------------------------------------------------- 1 | """Decision trees building and inference""" 2 | 3 | try: 4 | import cupy as cp 5 | except Exception: 6 | pass 7 | import numpy as np 8 | 9 | from .utils import apply_values, depthwise_grow_tree, get_tree_node, set_leaf_values, calc_node_values 10 | from .utils import tree_prediction_leaves_typed_kernels, tree_prediction_leaves_typed_kernels_f 11 | from .utils import tree_prediction_values_kernel 12 | 13 | 14 | class Tree: 15 | """This class initializes an empty tree structure, implements methods to set tree values and single tree inference. 16 | The instance of this object represents the actual boosting step, but not the single tree! 17 | Actual amount of trees in the instance (at each boosting step) is defined by ngroups argument. What it means: 18 | Assume you have 5 class classification task, so you model output size equals 5. Possible cases here: 19 | - Build single decision tree that outputs a vector of 5 values. In this case ngroups eq. 
1 20 | - Build 5 decision trees, each tree predict a value for its own class (one-vs-all). 21 | In this case ngroups eq. 5 22 | - Create custom target split strategy. For ex. you can build 2 trees, first will predict [0, 1, 2] classes, 23 | second - [3, 4]. In this case ngroups eq. 2 24 | 25 | Grouped trees structure is defined by arrays: 26 | feats, shape (ngroups, max_nodes) - feature index to use for the split in each group/node. 27 | If -1, the node is terminal (leaf) 28 | val_splits, shape (ngroups, max_nodes) - threshold to compare when choosing the next node 29 | if feature value is not NaN 30 | nans, shape (ngroups, max_nodes) - bool values, if True, NaN feature values objects moves left, else - right 31 | split, shape (ngroups, max_nodes, 2) - node indices corresponding left/right split for the current node 32 | 33 | Trees structure defines single node id value for each object 34 | Values assigned to the outputs are defined by arrays: 35 | group_index, shape (nout, ). Defines the group id for predicting each output 36 | values, shape (max_nodes, nout). Define output value for each node/output 37 | leaves, shape (max_leaves, ngroups). Assigns the leaf index to the terminal nodes 38 | 39 | During the fit stage, the format described above is used. 40 | After fitting, additional reformatting occurs that converts the tree to another format to achieve faster inference: 41 | - Sub-trees for each group are stored in one array named "test_format": 42 | [gr0_node0, ..., gr0_nodeN, gr1_node0, ..., gr1_nodeM, gr2_node0, ..., gr2_nodeK, gr3_node0, ...] 43 | - Each node in new formatted tree consists of 4 fields: 44 | [feature_index, split_value, left_node_index, right_node_index], 45 | feature_index - feature index to use for the split in each node. 46 | split_value - threshold to compare when choosing the next node if feature value is not NaN 47 | left_node_index - index of the left child in "test_format" array 48 | right_node_index - index of the right child in "test_format" array 49 | - The size of "test_format" array equals to the sum of all nodes in all subtrees except leaves multiplied by 4. 50 | Multiplication by 4 occurs because each node consists of the 4 fields described above. 51 | Examples: 52 | test_format[0 * 4] == test_format[0] - yields feature_index for node with index 0. 53 | test_format[0 * 4 + 1] == test_format[1] - yields split_value for node with index 0. 54 | test_format[0 * 4 + 2] == test_format[2] - yields left_node_index for node with index 0. 55 | test_format[0 * 4 + 3] == test_format[3] - yields right_node_index for node with index 0. 56 | test_format[1 * 4] == test_format[4] - yields feature_index for node with index 1. 57 | test_format[1 * 4 + 1] == test_format[5] - yields split_value for node with index 1. 58 | test_format[1 * 4 + 2] == test_format[6] - yields left_node_index for node with index 1. 59 | test_format[1 * 4 + 3] == test_format[7] - yields right_node_index for node with index 1. 60 | test_format[2 * 4] == test_format[8] - yields feature_index for node with index 2. 61 | ... 62 | test_format[79 * 4] == test_format[316] - yields feature_index for node with index 79. 63 | test_format[79 * 4 + 1] == test_format[317] - yields split_value for node with index 79. 64 | test_format[79 * 4 + 2] == test_format[318] - yields left_node_index for node with index 79. 65 | test_format[79 * 4 + 3] == test_format[319] - yields right_node_index for node with index 79. 66 | ... 
67 | - The sign of the feature_index value shows the behavior in case of feature == NaN (split to the left/right), 68 | to the value written in feature_index an extra "1" is added to deal with zero. 69 | Examples: 70 | feature_index == 8, positive value means that tree follows to the left in case of NaN in feature, 71 | the real feature index is calculated as follows: abs(8) - 1 = 7. 72 | feature_index == -19, negative value means that tree follows to the right in case of NaN in feature, 73 | the real feature index is calculated as follows: abs(-19) - 1 = 18. 74 | feature_index == 0, impossible due to construction algorithm. 75 | - If left_node_index/right_node_index is negative, it means that it shows index in the values array; 76 | In case of a negative value, an extra "1" is added to deal with zero. 77 | Examples: 78 | left_node_index == 8, non-negative value means that left child node is stored in "test_format" with index 8; 79 | left_node_index == -13, means that left child is a leaf, the index in "values" array for that leaf can 80 | be calculated as follows: abs(-13) - 1 = 12. Thus, index in "values" array is 12. 81 | - All subtrees are stored in one array, so an additional array of indexes where each subtree is starting 82 | is required (index of the subtree roots), array "gr_subtree_offsets" stores these indexes, 83 | size of "gr_subtree_offsets" equals to number of groups in the tree (number of subtrees). 84 | Example: 85 | gr_subtree_offsets == [0, 56, 183], means that tree has 3 subtrees (3 groups). 86 | The first subtree has its root as node with index 0; 87 | The second subtree has its root as node with index 56; 88 | The third subtree has its root as node with index 183. 89 | Example how to access the values of the root node in the second subtree: 90 | test_format[56 * 4] == test_format[224] - yields feature_index for the root of the second subtree; 91 | test_format[56 * 4 + 1] == test_format[225] - yields split_value for the root of the second subtree; 92 | test_format[56 * 4 + 2] == test_format[226] - yields left_node_index for the root of the second subtree; 93 | test_format[56 * 4 + 3] == test_format[227] - yields right_node_index for the root of the second subtree 94 | - Two fields, 'feature_importance_gain' and 'feature_importance_split', store feature importance arrays 95 | and describe the fitted tree accordingly. 
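Illustrative traversal sketch (added for documentation; not a method of this class). A hypothetical
pure-numpy helper that decodes one row for one group, assuming .to_cpu() has been called; the '<='
comparison is an assumption made for illustration - the authoritative split/NaN convention is the one
implemented by the inference kernels in utils.py:

    def trace_leaf(tree, x, g):
        node = int(tree.test_format_offsets[g])           # root of the g-th subtree
        while True:
            f = tree.test_format[4 * node]                # signed feature index (+1 offset)
            feat = int(abs(f)) - 1                        # real feature index
            if np.isnan(x[feat]):
                go_left = f > 0                           # positive sign: NaN goes left
            else:
                go_left = x[feat] <= tree.test_format[4 * node + 1]
            child = tree.test_format[4 * node + 2] if go_left else tree.test_format[4 * node + 3]
            if child < 0:                                 # negative child encodes a leaf
                return int(abs(child)) - 1                # index into the "values" array
            node = int(child)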
96 | """ 97 | 98 | def __init__(self, max_nodes, nout, ngroups): 99 | """Initialize empty tree 100 | 101 | Args: 102 | max_nodes: int, maximum number of nodes in tree 103 | nout: int, number of outputs in tree 104 | ngroups: int, number of groups 105 | """ 106 | self.nout = nout 107 | self.ngroups = ngroups 108 | self.max_nodes = max_nodes 109 | 110 | self.gains = np.zeros((ngroups, max_nodes,), dtype=np.float32) 111 | self.feats = np.zeros((ngroups, max_nodes,), dtype=np.int64) - 1 112 | self.bin_splits = np.zeros((ngroups, max_nodes,), dtype=np.int32) 113 | self.nans = np.zeros((ngroups, max_nodes,), dtype=np.bool_) 114 | 115 | self.split = np.zeros((ngroups, max_nodes, 2), dtype=np.int32) 116 | 117 | self.val_splits = None 118 | self.values = None 119 | self.group_index = None 120 | self.leaves = None 121 | self.max_leaves = None 122 | 123 | self.feature_importance_gain = None 124 | self.feature_importance_split = None 125 | 126 | self._debug = None 127 | self.test_format = None 128 | self.test_format_offsets = None 129 | 130 | def set_nodes(self, group, unique_nodes, new_nodes_id, best_feat, best_gain, best_split, best_nan_left): 131 | """Write info about new nodes 132 | 133 | Args: 134 | group: int, group id to write 135 | unique_nodes: np.ndarray, nodes id to set info 136 | new_nodes_id: np.ndarray, nodes id to left/right split current node 137 | best_feat: np.ndarray, feature value to perform a split 138 | best_gain: np.ndarray, gain from the split 139 | best_split: np.ndarray, quantized threshold to compare when split 140 | best_nan_left: np.ndarray, bool if True, nans moved in the left node, else right 141 | 142 | Returns: 143 | 144 | """ 145 | 146 | self.gains[group, unique_nodes] = best_gain 147 | self.feats[group, unique_nodes] = best_feat 148 | self.bin_splits[group, unique_nodes] = best_split 149 | self.nans[group, unique_nodes] = best_nan_left 150 | self.split[group, unique_nodes] = new_nodes_id 151 | 152 | def set_node_values(self, values, group_index): 153 | """Assign output values for each nodes 154 | 155 | Args: 156 | values: np.ndarray, node values 157 | group_index: np.ndarray, group id of each output 158 | 159 | Returns: 160 | 161 | """ 162 | self.values = values 163 | self.group_index = group_index 164 | 165 | def set_borders(self, borders): 166 | """Assign actual feature values based on quantized 167 | 168 | Args: 169 | borders: list of np.ndarray, actual node values 170 | 171 | Returns: 172 | 173 | """ 174 | # borders - list of arrays. 
Array is borderlines 175 | val_splits = [0 if x == -1 else borders[x][min(y, len(borders[x]) - 1)] 176 | for (x, y) in zip(self.feats.ravel(), self.bin_splits.ravel())] 177 | self.val_splits = np.array(val_splits, dtype=np.float32).reshape(self.feats.shape) 178 | 179 | def set_leaves(self): 180 | """Assign leaf id to the terminal nodes 181 | 182 | Returns: 183 | 184 | """ 185 | self.leaves, self.max_leaves = set_leaf_values(self.feats, self.split) 186 | 187 | def to_device(self): 188 | """Move tree data to the current GPU memory 189 | 190 | Returns: 191 | 192 | """ 193 | for attr in ['gains', 'feats', 'bin_splits', 'nans', 'split', 'val_splits', 'values', 'group_index', 'leaves', 194 | 'test_format', 'test_format_offsets', 'feature_importance_gain', 'feature_importance_split']: 195 | arr = getattr(self, attr) 196 | 197 | if type(arr) is np.ndarray: 198 | setattr(self, attr, cp.asarray(arr)) 199 | 200 | def to_cpu(self): 201 | """Move tree data to the CPU memory 202 | 203 | Returns: 204 | 205 | """ 206 | for attr in ['gains', 'feats', 'bin_splits', 'nans', 'split', 'val_splits', 'values', 'group_index', 'leaves', 207 | 'test_format', 'test_format_offsets', 'feature_importance_gain', 'feature_importance_split']: 208 | arr = getattr(self, attr) 209 | 210 | if type(arr) is cp.ndarray: 211 | setattr(self, attr, arr.get()) 212 | 213 | def _predict_node_deprecated(self, X): 214 | """(DEPRECATED) Predict node id from the feature matrix X 215 | 216 | Args: 217 | X: cp.ndarray of features 218 | 219 | Returns: 220 | 221 | """ 222 | if self.feats is None: 223 | raise Exception('To use _deprecated funcs pass debug=True to .reformat') 224 | 225 | assert type(self.feats) is cp.ndarray, 'Should be moved to GPU first. Call .to_device()' 226 | nodes = get_tree_node(X, self.feats, self.val_splits, self.split, self.nans) 227 | return nodes 228 | 229 | def _predict_from_nodes_deprecated(self, nodes): 230 | """(DEPRECATED) Predict outputs from the nodes indices 231 | 232 | Args: 233 | nodes: cp.ndarray of predicted nodes 234 | 235 | Returns: 236 | cp.ndarray of nodes 237 | """ 238 | return apply_values(nodes, self.group_index, self.values) 239 | 240 | def _predict_leaf_from_nodes_deprecated(self, nodes): 241 | """Predict leaf indices from the nodes indices (Use predict_leaf() method if you need to predict leaves) 242 | 243 | Args: 244 | nodes: cp.ndarray of predicted nodes 245 | 246 | Returns: 247 | cp.ndarray of leaves 248 | """ 249 | return apply_values(nodes, cp.arange(self.ngroups, dtype=cp.uint64), self.leaves) 250 | 251 | def _predict_deprecated(self, X): 252 | """(DEPRECATED) Predict from the feature matrix X 253 | 254 | Args: 255 | X: cp.ndarray of features 256 | 257 | Returns: 258 | cp.ndarray of predictions 259 | """ 260 | return self._predict_from_nodes_deprecated( 261 | self._predict_leaf_from_nodes_deprecated(self._predict_node_deprecated(X))) 262 | 263 | def _predict_leaf_deprecated(self, X): 264 | """(DEPRECATED) Predict leaf indices from the feature matrix X 265 | 266 | Args: 267 | X: cp.ndarray of features 268 | 269 | Returns: 270 | cp.ndarray of leaves 271 | """ 272 | return self._predict_leaf_from_nodes_deprecated(self._predict_node_deprecated(X)) 273 | 274 | def predict_leaf(self, X, pred_leaves=None): 275 | """Predict leaf indexes from the feature matrix X 276 | 277 | Args: 278 | X: cp.ndarray, array of features 279 | pred_leaves: cp.ndarray, buffer for predictions 280 | 281 | Returns: 282 | pred_leaves: leaf predictions 283 | 284 | """ 285 | # check if buffer is None and X on GPU 286 | 
assert type(X) is cp.ndarray, "X must be type of cp.ndarray (located on gpu)" 287 | 288 | dt = str(X.dtype) 289 | 290 | assert dt in tree_prediction_leaves_typed_kernels, \ 291 | f"X array must be of type: {list(tree_prediction_leaves_typed_kernels.keys())}" 292 | 293 | if pred_leaves is None: 294 | pred_leaves = cp.empty((X.shape[0], self.ngroups), dtype=cp.int32) 295 | 296 | # CUDA parameters initialization 297 | threads = 128 # threads in one CUDA block 298 | sz = X.shape[0] * self.ngroups 299 | blocks = sz // threads 300 | if sz % threads != 0: 301 | blocks += 1 302 | 303 | if X.flags["C_CONTIGUOUS"]: 304 | tree_prediction_leaves_typed_kernels[dt]((blocks,), (threads,), ((X, 305 | self.test_format, 306 | self.test_format_offsets, 307 | X.shape[1], 308 | X.shape[0], 309 | self.ngroups, 310 | pred_leaves.shape[1], 311 | pred_leaves))) 312 | elif X.flags["F_CONTIGUOUS"]: 313 | tree_prediction_leaves_typed_kernels_f[dt]((blocks,), (threads,), ((X, 314 | self.test_format, 315 | self.test_format_offsets, 316 | X.shape[1], 317 | X.shape[0], 318 | self.ngroups, 319 | pred_leaves.shape[1], 320 | pred_leaves))) 321 | else: 322 | raise Exception("X must be 'C_CONTIGUOUS' or 'F_CONTIGUOUS'") 323 | return pred_leaves 324 | 325 | def predict(self, X, pred=None, pred_leaves=None): 326 | """Predict from the feature matrix X 327 | 328 | Args: 329 | X: cp.ndarray, array of features 330 | pred: cp.ndarray, buffer for predictions on GPU, if None - created automatically 331 | pred_leaves: cp.ndarray, buffer for internal leaf predictions on GPU, if None - created automatically 332 | 333 | Returns: 334 | pred: cp.ndarray, prediction array 335 | 336 | """ 337 | # check if buffers are None 338 | if pred is None: 339 | pred = cp.zeros((X.shape[0], self.nout), dtype=cp.float32) 340 | if pred_leaves is None: 341 | pred_leaves = cp.empty((X.shape[0], self.ngroups), dtype=cp.int32) 342 | 343 | # first step - leaves predictions, actually prediction of indexes in values 344 | self.predict_leaf(X, pred_leaves) 345 | 346 | # CUDA parameters initialization 347 | threads = 128 # threads in one CUDA block 348 | sz = X.shape[0] * self.nout 349 | blocks = sz // threads 350 | if sz % threads != 0: 351 | blocks += 1 352 | 353 | # second step, prediction of actual values 354 | tree_prediction_values_kernel((blocks,), (threads,), ((pred_leaves, 355 | self.group_index, 356 | self.values, 357 | self.nout, 358 | X.shape[0], 359 | pred_leaves.shape[1], 360 | pred))) 361 | return pred 362 | 363 | def reformat(self, nfeats, debug): 364 | """Creates new internal format of the tree for faster inference 365 | 366 | Args: 367 | nfeats: int, number of features in X (train set) 368 | debug: bool, if in debug mode 369 | 370 | Returns: 371 | 372 | """ 373 | n_gr = self.ngroups 374 | 375 | # memory allocation for new tree array 376 | gr_subtree_offsets = np.zeros(n_gr, dtype=np.int32) 377 | check_empty = [] 378 | total_size = 0 379 | for i in range(n_gr): 380 | curr_size = int((self.feats[i] >= 0).sum()) 381 | # add special case handling - single leaf, no splits 382 | check_empty.append(curr_size == 0) 383 | curr_size = max(1, curr_size) 384 | total_size += curr_size 385 | 386 | if i < n_gr - 1: 387 | gr_subtree_offsets[i + 1] = total_size 388 | nf = np.zeros(total_size * 4, dtype=np.float32) 389 | 390 | # reformatting the tree 391 | for i in range(n_gr): 392 | # handle special case - single leaf, no splits - make a pseudo split node 393 | if check_empty[i]: 394 | nf[4 * gr_subtree_offsets[i]] = 1. 395 | nf[4 * gr_subtree_offsets[i] + 1] = 0. 
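# (descriptive note) this pseudo split uses feature 0 (stored as +1, so NaN goes left) with a 0.
# threshold; the two assignments below point both children to leaf 0 (stored as -1), so every row
# of this group ends up in the same single leaf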
396 | nf[4 * gr_subtree_offsets[i] + 2] = -1. 397 | nf[4 * gr_subtree_offsets[i] + 3] = -1. 398 | 399 | continue 400 | 401 | q = [(0, 0)] 402 | 403 | while len(q) != 0: # BFS in tree 404 | n_old, n_new = q[0] 405 | if not self.nans[i][n_old]: 406 | nf[4 * (gr_subtree_offsets[i] + n_new)] = float(self.feats[i][n_old] + 1) 407 | else: 408 | nf[4 * (gr_subtree_offsets[i] + n_new)] = float(-(self.feats[i][n_old] + 1)) 409 | nf[4 * (gr_subtree_offsets[i] + n_new) + 1] = float(self.val_splits[i][n_old]) 410 | ln = self.split[i][n_old][0] 411 | rn = self.split[i][n_old][1] 412 | 413 | if self.feats[i][ln] < 0: 414 | nf[4 * (gr_subtree_offsets[i] + n_new) + 2] = float(-(self.leaves[ln][i] + 1)) 415 | else: 416 | new_node_number = q[-1][1] + 1 417 | nf[4 * (gr_subtree_offsets[i] + n_new) + 2] = float(new_node_number) 418 | q.append((ln, new_node_number)) 419 | 420 | if self.feats[i][rn] < 0: 421 | nf[4 * (gr_subtree_offsets[i] + n_new) + 3] = float(-(self.leaves[rn][i] + 1)) 422 | else: 423 | new_node_number = q[-1][1] + 1 424 | nf[4 * (gr_subtree_offsets[i] + n_new) + 3] = float(new_node_number) 425 | q.append((rn, new_node_number)) 426 | q.pop(0) 427 | 428 | self.test_format = nf 429 | self.test_format_offsets = gr_subtree_offsets 430 | 431 | # feature_ importance with gain 432 | self.feature_importance_gain = np.zeros(nfeats, dtype=np.float32) 433 | sl = self.feats >= 0 434 | np.add.at(self.feature_importance_gain, self.feats[sl], self.gains[sl]) 435 | 436 | # feature_ importance with split 437 | self.feature_importance_split = np.zeros(nfeats, dtype=np.float32) 438 | sl = self.feats >= 0 439 | np.add.at(self.feature_importance_split, self.feats[sl], 1) 440 | 441 | if not debug: 442 | for attr in ['gains', 'feats', 'bin_splits', 'nans', 'split', 'val_splits', 'leaves']: 443 | setattr(self, attr, None) 444 | 445 | 446 | class DepthwiseTreeBuilder: 447 | """This class builds decision tree with given parameters""" 448 | 449 | def __init__(self, borders, 450 | use_hess=True, 451 | colsampler=None, 452 | subsampler=None, 453 | target_splitter=None, 454 | multioutput_sketch=None, 455 | gd_steps=1, 456 | **tree_params 457 | ): 458 | """ 459 | 460 | Args: 461 | borders: list of np.ndarray, actual split borders for quantized features 462 | colsampler: Callable or None, column sampling strategy 463 | subsampler: Callable or None, rows sampling strategy 464 | target_splitter: Callable or None, target grouping strategy 465 | multioutput_sketch: Callable or None, multioutput sketching strategy 466 | **tree_params: other tree building parameters 467 | """ 468 | self.borders = borders 469 | self.use_hess = use_hess 470 | self.params = {**{ 471 | 472 | 'lr': 1., 473 | 'lambda_l2': .01, 474 | 'max_bin': 256, 475 | 'max_depth': 6, 476 | 'min_data_in_leaf': 10, 477 | 'min_gain_to_split': 0 478 | }, **tree_params} 479 | 480 | self.colsampler = colsampler 481 | self.subsampler = subsampler 482 | self.target_grouper = target_splitter 483 | self.multioutput_sketch = multioutput_sketch 484 | self.gd_steps = gd_steps 485 | 486 | def build_tree(self, X, grad, hess, sample_weight=None, grad_fn=None, *val_arrays): 487 | """Build tree and return nodes/values predictions for train and validation sets 488 | 489 | Args: 490 | X: cp.ndarray, quantized feature matrix 491 | grad: cp.ndarray, gradient matrix 492 | hess: cp.ndarray, hessian matrix 493 | sample_weight: cp.ndarray or None, sample's weights 494 | grad_fn: gradient fn 495 | *val_arrays: list of cp.ndarray, list of quantized features for validation sets 496 | 497 | 
Returns: 498 | tree, Tree, constructed tree 499 | nodes_group, cp.ndarray, nodes id for the train set 500 | pred, cp.ndarray, prediction for the train set 501 | valid_nodes_group, list of cp.ndarray, list of predicted nodes for valid sets 502 | val_preds, list of cp.ndarray, list of predictions for valid sets 503 | """ 504 | if self.colsampler is None: 505 | col_indexer = cp.arange(X.shape[1], dtype=cp.uint64) 506 | else: 507 | col_indexer = self.colsampler() 508 | 509 | if self.subsampler is None: 510 | row_indexer = cp.arange(X.shape[0], dtype=cp.uint64) 511 | else: 512 | row_indexer = self.subsampler() 513 | 514 | if self.target_grouper is None: 515 | output_groups = [cp.arange(grad.shape[1], dtype=cp.uint64)] 516 | else: 517 | output_groups = self.target_grouper() 518 | 519 | if sample_weight is not None: 520 | grad = grad * sample_weight 521 | hess = hess * sample_weight 522 | 523 | max_nodes = int((2 ** np.arange(self.params['max_depth'] + 1)).sum()) 524 | tree = Tree(max_nodes, grad.shape[1], len(output_groups)) 525 | 526 | nodes_group = cp.empty((grad.shape[0], len(output_groups)), dtype=cp.int32) 527 | valid_nodes_group = [cp.empty((x.shape[0], len(output_groups)), dtype=cp.int32) for x in val_arrays] 528 | 529 | group_index = cp.zeros(grad.shape[1], dtype=cp.uint64) 530 | 531 | for n_grp, grp_indexer in enumerate(output_groups): 532 | G = grad[:, grp_indexer] 533 | # if output group len eq. 1, we have single output tree, use hess for structure search 534 | if G.shape[1] == 1: 535 | H = hess if hess.shape[1] == 1 else hess[:, grp_indexer] 536 | # else we can decide: should we use hess for tree structure search or 537 | # assume hess eq. sample weight for all outputs, and then we can use proxy for tree structure search 538 | else: 539 | if self.use_hess: 540 | H = hess[:, grp_indexer] 541 | else: 542 | H = sample_weight if sample_weight is not None else cp.ones((G.shape[0], 1), dtype=cp.float32) 543 | if self.multioutput_sketch is not None: 544 | G, H = self.multioutput_sketch(G, H) 545 | 546 | group_index[grp_indexer] = n_grp 547 | # grow single group of the tree and get nodes index 548 | train_nodes, valid_nodes = depthwise_grow_tree(tree, n_grp, X, G, H, 549 | row_indexer, col_indexer, self.params, 550 | valid_arrs=val_arrays) 551 | # update nodes group 552 | nodes_group[:, n_grp] = train_nodes 553 | for vn, vp in zip(valid_nodes_group, valid_nodes): 554 | vn[:, n_grp] = vp 555 | 556 | # transform nodes to leaves 557 | tree.set_leaves() 558 | leaves_idx, max_leaves, leaves_grp = cp.asarray(tree.leaves, dtype=cp.int32), tree.max_leaves, \ 559 | cp.arange(len(output_groups), dtype=cp.uint64) 560 | 561 | leaves = apply_values(nodes_group, leaves_grp, leaves_idx) 562 | val_leaves = [apply_values(x, leaves_grp, leaves_idx) for x in valid_nodes_group] 563 | 564 | # perform multiple grad steps 565 | values = calc_node_values(grad, hess, leaves, row_indexer, group_index, max_leaves, self.params['lr'], 566 | lambda_l2=self.params['lambda_l2']) 567 | pred = apply_values(leaves, group_index, values) 568 | 569 | tree.set_borders(self.borders) 570 | 571 | for i in range(1, self.gd_steps): 572 | grad, hess = grad_fn(pred) 573 | values += calc_node_values(grad, hess, leaves, row_indexer, group_index, max_leaves, self.params['lr'], 574 | lambda_l2=self.params['lambda_l2']) 575 | pred = apply_values(leaves, group_index, values) 576 | 577 | # transform leaves to values 578 | val_preds = [apply_values(x, group_index, values) for x in val_leaves] 579 | tree.set_node_values(values.get(), 
group_index.get()) 580 | 581 | return tree, leaves, pred, val_leaves, val_preds 582 | -------------------------------------------------------------------------------- /py_boost/multioutput/__init__.py: -------------------------------------------------------------------------------- 1 | """Provides tools to multioutput models handling""" 2 | -------------------------------------------------------------------------------- /py_boost/multioutput/sketching.py: -------------------------------------------------------------------------------- 1 | """Defines sketching strategies to simplify multioutput scoring function calculation""" 2 | 3 | try: 4 | import cupy as cp 5 | except Exception: 6 | pass 7 | 8 | try: 9 | from cuml import TruncatedSVD 10 | except ImportError: 11 | pass 12 | 13 | from ..callbacks.callback import Callback 14 | 15 | 16 | class GradSketch(Callback): 17 | """Basic class for sketching strategy. 18 | It should implement __call__ method. 19 | """ 20 | 21 | def __call__(self, grad, hess): 22 | """Method receive raw grad and hess matrices and output new ones that will be used in the tree structure search 23 | 24 | Args: 25 | grad: cp.ndarray, gradients 26 | hess: cp.ndarray, hessians 27 | 28 | Returns: 29 | cp.ndarray, sketched grad 30 | cp.ndarray, sketched hess 31 | """ 32 | return grad, hess 33 | 34 | 35 | class TopOutputsSketch(GradSketch): 36 | """TopOutputs sketching. Use only gradient columns with the highest L2 norm""" 37 | 38 | def __init__(self, topk=1): 39 | """ 40 | 41 | Args: 42 | topk: int, top outputs to use 43 | """ 44 | self.topk = topk 45 | 46 | def __call__(self, grad, hess): 47 | best_idx = (grad ** 2).mean(axis=0).argsort()[-self.topk:] 48 | grad = grad[:, best_idx] 49 | 50 | if hess.shape[1] > 1: 51 | hess = hess[:, best_idx] 52 | 53 | return grad, hess 54 | 55 | 56 | class SVDSketch(GradSketch): 57 | """SVD Sketching. Truncated SVD is used to reduce grad dimensions.""" 58 | 59 | def __init__(self, sample=None, **svd_params): 60 | """ 61 | 62 | Args: 63 | sample: int, subsample to speed up SVD fitting 64 | **svd_params: dict, SVD params, see cuml.TruncatedSVD docs 65 | """ 66 | self.svd_params = {**{'algorithm': 'jacobi', 'n_components': 5, 'n_iter': 5}, **svd_params} 67 | self.sample = sample 68 | self.svd = None 69 | 70 | def before_train(self, build_info): 71 | self.svd = TruncatedSVD(output_type='cupy', **self.svd_params) 72 | 73 | def __call__(self, grad, hess): 74 | 75 | sub_grad = grad 76 | if (self.sample is not None) and (grad.shape[0] > self.sample): 77 | idx = cp.arange(grad.shape[0], dtype=cp.int32) 78 | cp.random.shuffle(idx) 79 | sub_grad = grad[idx[:self.sample]] 80 | 81 | self.svd.fit(sub_grad) 82 | grad = self.svd.transform(grad) 83 | 84 | if hess.shape[1] > 1: 85 | hess = self.svd.transform(hess) 86 | hess = cp.clip(hess, 0.01, None) 87 | 88 | return grad, hess 89 | 90 | def after_iteration(self, build_info): 91 | """Free memory to avoid OOM. 92 | 93 | Args: 94 | build_info: dict 95 | 96 | Returns: 97 | 98 | """ 99 | build_info['mempool'].free_all_blocks() 100 | 101 | def after_train(self, build_info): 102 | self.svd = None 103 | 104 | 105 | class RandomSamplingSketch(GradSketch): 106 | """RandomSampling Sketching. Gradient columns are randomly sampled with probabilities.""" 107 | 108 | def __init__(self, n=1, smooth=0.1, replace=True): 109 | """ 110 | 111 | Args: 112 | n: int, n outputs to select 113 | smooth: float, 0 stands for probabilities proportionally to the sum of squares, 1 stands for uniform. 
114 | (0, 1) stands for tradeoff 115 | """ 116 | self.n = n 117 | self.smooth = smooth 118 | self.replace = replace 119 | 120 | def __call__(self, grad, hess): 121 | best_idx = (grad ** 2).mean(axis=0) + 1e-3 122 | pi = best_idx / best_idx.sum() 123 | pi = self.smooth * cp.ones_like(pi) / grad.shape[1] + (1 - self.smooth) * pi 124 | 125 | gg = grad / cp.sqrt(self.n * pi) 126 | rand_idx = cp.random.choice(cp.arange(grad.shape[1]), size=self.n, replace=self.replace, p=pi) 127 | grad = gg[:, rand_idx] 128 | 129 | if hess.shape[1] > 1: 130 | hess = hess[:, rand_idx] 131 | 132 | return grad, hess 133 | 134 | 135 | class RandomProjectionSketch(GradSketch): 136 | """Random projection sketch""" 137 | 138 | def __init__(self, n=1, norm=True): 139 | """ 140 | 141 | Args: 142 | n: int, number of output dimensions 143 | norm: if True use normal distribution, otherwise +1/-1 144 | """ 145 | self.k = n 146 | self.norm = norm 147 | 148 | def __call__(self, grad, hess): 149 | 150 | if self.norm: 151 | P = cp.random.randn(grad.shape[1], self.k, dtype=cp.float32) 152 | else: 153 | P = (cp.random.rand(grad.shape[1], self.k, dtype=cp.float32) > .5).astype(cp.float32) * 2 - 1 154 | 155 | P /= cp.sqrt(1 / self.k) 156 | 157 | grad = cp.dot(grad, P) 158 | 159 | if hess.shape[1] > 1: 160 | hess = cp.dot(hess, P) 161 | hess = cp.clip(hess, 0.01, None) 162 | 163 | return grad, hess 164 | 165 | 166 | class FilterSketch(GradSketch): 167 | """Filter Gradient and Hessian outputs for the tree structure search using previously built trees""" 168 | 169 | def __init__(self, k=1, sample=True, smooth=0.1, ntrees=1): 170 | """ 171 | 172 | Args: 173 | k: int, number of outputs to keep 174 | sample: bool, if True random sampling is used, else keep top K 175 | smooth: float, smoothing parameter for sampling 176 | ntrees: int, number of previously built trees to evaluate weights 177 | """ 178 | self.k = k 179 | self.sample = sample 180 | self.smooth = smooth 181 | self.ntrees = ntrees 182 | 183 | self.queue = None 184 | self.max_trees = 0 185 | self.max_nodes = 0 186 | self.nrows = None 187 | self.lambda_l2 = None 188 | 189 | def before_train(self, build_info): 190 | """Extract metadata before train 191 | 192 | Args: 193 | build_info: dict 194 | 195 | Returns: 196 | 197 | """ 198 | # length of train 199 | self.nrows = build_info['data']['train']['features_gpu'].shape[0] 200 | # lambda_l2 of tree builder 201 | self.lambda_l2 = build_info['builder'].params['lambda_l2'] 202 | self.queue = [] 203 | self.max_nodes = 0 204 | self.max_trees = 0 205 | 206 | def before_iteration(self, build_info): 207 | """Method to extract leaf indices from the last tree 208 | 209 | Args: 210 | build_info: dict 211 | 212 | Returns: 213 | 214 | """ 215 | # num iter 216 | num_iter = build_info['num_iter'] 217 | 218 | # if first iter - assume single node 219 | # if smooth eq 1 - assume totally random choice 220 | if num_iter == 0 or self.smooth >= 1 or self.ntrees == 0: 221 | return 222 | 223 | if len(self.queue) >= self.ntrees: 224 | self.queue.pop(0) 225 | 226 | # leaf values 227 | last_leaves = build_info['data']['train']['last_tree']['leaves'] 228 | self.queue.append(last_leaves) 229 | self.max_nodes = max(int(last_leaves.max()), self.max_nodes) 230 | self.max_trees = max(last_leaves.shape[1], self.max_trees) 231 | 232 | def after_train(self, build_info): 233 | """Clean trees 234 | 235 | Args: 236 | build_info: dict 237 | 238 | Returns: 239 | 240 | """ 241 | self.queue = None 242 | 243 | def _calc_weights(self, grad, hess): 244 | 245 | if self.smooth >= 
1: 246 | return 0 247 | 248 | if len(self.queue) == 0: 249 | loss = (grad ** 2).sum(axis=0) 250 | loss /= hess.sum(axis=0) 251 | 252 | return loss 253 | 254 | grad_sum = cp.zeros((len(self.queue), self.max_trees, self.max_nodes, 255 | grad.shape[1],), dtype=cp.float32) 256 | hess_sum = cp.zeros((len(self.queue), self.max_trees, self.max_nodes, 257 | hess.shape[1],), dtype=cp.float32) 258 | 259 | for n, prev_iter in enumerate(self.queue): 260 | for i in range(prev_iter.shape[1]): 261 | grad_sum[n, i].scatter_add(prev_iter[:, i], grad) 262 | hess_sum[n, i].scatter_add(prev_iter[:, i], hess) 263 | 264 | loss = (grad_sum ** 2 / (hess_sum + self.lambda_l2)).reshape((-1, grad.shape[1])) 265 | 266 | return loss.sum(axis=0) 267 | 268 | def _select(self, pi): 269 | 270 | if self.sample: 271 | idx = cp.random.choice(cp.arange(pi.shape[0]), size=self.k, replace=True, p=pi) 272 | else: 273 | idx = pi.argsort()[-self.k:] 274 | 275 | return idx 276 | 277 | def __call__(self, grad, hess): 278 | 279 | pi = self._calc_weights(grad, hess) 280 | pi = pi / pi.sum() 281 | 282 | if self.smooth > 0: 283 | pi = self.smooth * cp.ones_like(pi) / grad.shape[1] + (1 - self.smooth) * pi 284 | 285 | idx = self._select(pi) 286 | 287 | grad = grad[:, idx] 288 | 289 | if hess.shape[1] > 1: 290 | hess = hess[:, idx] 291 | 292 | return grad, hess 293 | -------------------------------------------------------------------------------- /py_boost/multioutput/target_splitter.py: -------------------------------------------------------------------------------- 1 | """Strategies to splitting multiple outputs by different trees""" 2 | 3 | try: 4 | import cupy as cp 5 | except Exception: 6 | pass 7 | 8 | from ..callbacks.callback import Callback 9 | 10 | 11 | class SingleSplitter(Callback): 12 | """Basic splitter, means no split. Single tree will be built at each boosting step""" 13 | 14 | def __init__(self): 15 | self.ensemble_indexer = None 16 | self.indexer = None 17 | 18 | def before_iteration(self, build_info): 19 | """Initialize indexers 20 | 21 | Args: 22 | build_info: dict 23 | 24 | Returns: 25 | 26 | """ 27 | if build_info['num_iter'] == 0: 28 | nout = build_info['data']['train']['grad'].shape[1] 29 | self.indexer = cp.arange(nout, dtype=cp.uint64) 30 | 31 | def __call__(self): 32 | """Get list of indexers for each group 33 | 34 | Returns: 35 | list of cp.ndarrays of indexers 36 | """ 37 | return [self.indexer] 38 | 39 | def after_train(self, build_info): 40 | """Clean state not to keep the indexer in trained model 41 | 42 | Args: 43 | build_info: 44 | 45 | Returns: 46 | 47 | """ 48 | self.__init__() 49 | 50 | 51 | class RandomGroupsSplitter(SingleSplitter): 52 | """Random Groups Splitter, means all outputs will be randomly grouped at each iteration. 53 | Single tree will be created for each group. 
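    For example, with 10 outputs and ngroups=2, each boosting iteration shuffles the output
    indexer and builds two trees, each fitted on a random group of roughly five outputs
    (see __call__ below, which simply applies cp.array_split to the shuffled indexer).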
54 | """ 55 | 56 | def __init__(self, ngroups=2): 57 | """ 58 | 59 | Args: 60 | ngroups: int, maximum number of groups to split outputs 61 | """ 62 | super().__init__() 63 | self.ngroups = ngroups 64 | self._ngroups = None 65 | 66 | def before_iteration(self, build_info): 67 | """Update groups count with the actual target shape if needed 68 | 69 | Args: 70 | build_info: dict 71 | 72 | Returns: 73 | 74 | """ 75 | super().before_iteration(build_info) 76 | if build_info['num_iter'] == 0: 77 | self._ngroups = min(self.ngroups, build_info['data']['train']['grad'].shape[1]) 78 | 79 | def __call__(self): 80 | """ 81 | 82 | Returns: 83 | list of cp.ndarrays of indexers 84 | """ 85 | cp.random.shuffle(self.indexer) 86 | return cp.array_split(self.indexer, self._ngroups) 87 | 88 | 89 | class OneVsAllSplitter(SingleSplitter): 90 | """One-Vs-All splitter, means build separate tree for each output""" 91 | 92 | def __call__(self): 93 | """ 94 | 95 | Returns: 96 | list of cp.ndarrays of indexers 97 | """ 98 | return cp.array_split(self.indexer, self.indexer.shape[0]) 99 | -------------------------------------------------------------------------------- /py_boost/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | """Tools for quantization""" 2 | -------------------------------------------------------------------------------- /py_boost/quantization/base.py: -------------------------------------------------------------------------------- 1 | """Basic quantizer implementations""" 2 | 3 | import numpy as np 4 | from .utils import apply_borders, quantize_features, numba_quantile_1d, numba_uniform_1d, numba_uniquant_1d 5 | 6 | 7 | class Quantizer: 8 | """ 9 | General class for all quantizers 10 | """ 11 | 12 | def __init__(self, sample=None, max_bin=256, min_data_in_bin=3, random_state=42): 13 | """ 14 | 15 | Args: 16 | sample: None or int, subsample size for quantizers 17 | max_bin: int, max bins 18 | min_data_in_bin: int, min bin size 19 | random_state: int 20 | """ 21 | self.sample = sample 22 | # actual nbins eq max_bin - 1, zero bin is always reserved for NaNs 23 | self.max_bin = max_bin 24 | self.min_data_in_bin = min_data_in_bin 25 | self.random_state = random_state 26 | 27 | self.borders = None 28 | 29 | def _sample(self, X): 30 | """Sample train set 31 | 32 | Args: 33 | X: np.ndarray 34 | 35 | Returns: 36 | 37 | """ 38 | if self.sample is not None and self.sample < X.shape[0]: 39 | np.random.seed(self.random_state) 40 | 41 | idx = np.arange(X.shape[0]) 42 | np.random.shuffle(idx) 43 | idx = idx[:self.sample] 44 | return X[idx] 45 | 46 | return X 47 | 48 | def transform(self, X): 49 | """Apply borders is similar for all quantizers 50 | 51 | Args: 52 | X: np.ndarray 53 | 54 | Returns: 55 | 56 | """ 57 | return apply_borders(X, self.borders) 58 | 59 | def fit(self, X): 60 | """Fit quantizer 61 | 62 | Args: 63 | X: np.ndarray 64 | 65 | Returns: 66 | 67 | """ 68 | return self 69 | 70 | def fit_transform(self, X): 71 | """Fit quantizer and transform 72 | 73 | Args: 74 | X: 75 | 76 | Returns: 77 | 78 | """ 79 | self.fit(X) 80 | 81 | return self.transform(X) 82 | 83 | def get_borders(self): 84 | """Get fitted borders 85 | 86 | Returns: 87 | 88 | """ 89 | assert self.borders is not None, 'Should be fitted first' 90 | 91 | return self.borders 92 | 93 | def get_max_bin(self): 94 | """Get actual max bins 95 | 96 | Returns: 97 | 98 | """ 99 | return max(map(len, self.get_borders())) 100 | 101 | 102 | class QuantileQuantizer(Quantizer): 103 | """ 104 | 
Quantization by quantiles 105 | """ 106 | 107 | def fit(self, X): 108 | self.borders = quantize_features( 109 | 110 | numba_quantile_1d, 111 | self._sample(X), 112 | max_bins=self.max_bin - 1, 113 | min_data_in_bin=self.min_data_in_bin 114 | 115 | ) 116 | 117 | return self 118 | 119 | 120 | class UniformQuantizer(Quantizer): 121 | """ 122 | Uniform quantization 123 | """ 124 | 125 | def fit(self, X): 126 | self.borders = quantize_features( 127 | 128 | numba_uniform_1d, 129 | self._sample(X), 130 | max_bins=self.max_bin - 1, 131 | min_data_in_bin=self.min_data_in_bin 132 | 133 | ) 134 | 135 | return self 136 | 137 | 138 | class UniquantQuantizer(Quantizer): 139 | """ 140 | Mix of uniform and quantile bins 141 | """ 142 | 143 | def fit(self, X): 144 | self.borders = quantize_features( 145 | 146 | numba_uniquant_1d, 147 | self._sample(X), 148 | max_bins=self.max_bin - 1, 149 | min_data_in_bin=self.min_data_in_bin 150 | 151 | ) 152 | 153 | return self 154 | -------------------------------------------------------------------------------- /py_boost/quantization/utils.py: -------------------------------------------------------------------------------- 1 | """Quantization utilities""" 2 | 3 | import numba 4 | import numpy as np 5 | from numba import float32, float64, uint8, prange, njit, int64 6 | 7 | numba.config.THREADING_LAYER = 'threadsafe' 8 | 9 | 10 | def _apply_borders_1d(x_raw, x_enc, borders): 11 | # encode raw values 12 | sl = np.nonzero(~np.isnan(x_raw))[0] 13 | x_enc[sl] = np.searchsorted(borders, x_raw[sl]) 14 | 15 | return 16 | 17 | 18 | sign = [(float64[:], uint8[:], float64[:]), 19 | (float32[:], uint8[:], float32[:]), 20 | ] 21 | 22 | numba_apply_borders_1d = njit(sign, parallel=False)(_apply_borders_1d) 23 | 24 | 25 | def _apply_borders(X, X_enc, borders): 26 | for i in prange(X.shape[1]): 27 | i = int64(i) # to prevent unsafe cast numba warning 28 | numba_apply_borders_1d(X[:, i], X_enc[:, i], borders[i]) 29 | 30 | return 31 | 32 | 33 | numba_apply_borders = njit(parallel=True)(_apply_borders) 34 | 35 | 36 | def apply_borders(X, borders): 37 | X_enc = np.zeros_like(X, dtype=np.uint8, order='C') 38 | numba_apply_borders(X, X_enc, numba.typed.List(borders)) 39 | 40 | return X_enc 41 | 42 | 43 | def _preprocess_1d(x_sample): 44 | x_sample = x_sample[~np.isnan(x_sample)].copy() 45 | neg_inf_clip_value = np.finfo(x_sample).min 46 | x_sample[x_sample < neg_inf_clip_value] = neg_inf_clip_value 47 | x_sample = np.sort(x_sample) 48 | 49 | return x_sample 50 | 51 | 52 | sign = [(float64[:],), (float32[:],), ] 53 | numba_preprocess_1d = njit(sign, parallel=False)(_preprocess_1d) 54 | 55 | 56 | def _quantile_1d(x_sample, max_bins, min_data_in_bin): 57 | x_sample = numba_preprocess_1d(x_sample) 58 | bins = np.unique(x_sample)[:-1] 59 | 60 | if len(bins) > (max_bins - 1): 61 | # get quantiles 62 | grid = (np.linspace(0, 1, max_bins + 1) * x_sample.shape[0])[1:-1].astype(np.int64) 63 | bins = x_sample[grid] 64 | bins = np.unique(bins) 65 | 66 | return bins 67 | 68 | 69 | q1d_sign = [(float64[:], int64, int64), 70 | (float32[:], int64, int64), 71 | ] 72 | 73 | numba_quantile_1d = njit(q1d_sign, parallel=False)(_quantile_1d) 74 | 75 | 76 | def _quantize_features(fn, X, max_bins, min_data_in_bin, borders): 77 | """ 78 | Args: 79 | X: 80 | 81 | Returns: 82 | """ 83 | for i in prange(X.shape[1]): 84 | bins = fn(X[:, i], max_bins, min_data_in_bin) 85 | borders[i, 1: len(bins) + 1] = bins 86 | 87 | return borders 88 | 89 | 90 | numba_quantize_features = njit(parallel=True)(_quantize_features) 91 | 92 
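# Illustrative example (not executed here): fit quantile borders on a small float32
# matrix and encode it. Bin 0 is reserved for NaNs, so finite values are encoded
# starting from 1. Only names defined in this module plus NumPy are assumed.
#
#     import numpy as np
#     X = np.random.rand(1000, 3).astype(np.float32)
#     borders = quantize_features(numba_quantile_1d, X, max_bins=255, min_data_in_bin=3)
#     X_enc = apply_borders(X, borders)  # uint8 matrix of bin indices, same shape as X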
| 93 | def quantize_features(fn, X, max_bins=255, min_data_in_bin=3): 94 | """ 95 | Perform feature quantization 96 | Args: 97 | fn: JIT compiled function for 1d quantization 98 | X: np.ndarray, raw features 99 | max_bins: int, maximum number of bins, <= 255 100 | min_data_in_bin: int, sample size for bins construction 101 | Returns: 102 | """ 103 | assert 0 < max_bins <= 255, 'Max bins should be between 0 and 255' 104 | assert min_data_in_bin > 0, 'Min data in bin should be > 0' 105 | 106 | borders_ = np.empty((X.shape[1], max_bins + 1), dtype=X.dtype) 107 | borders_[:] = np.nan 108 | borders_[:, 0] = -np.inf 109 | 110 | numba_quantize_features(fn, X, max_bins, min_data_in_bin, borders_) 111 | borders = [] 112 | 113 | for i in range(X.shape[1]): 114 | j = 0 115 | for j in range(max_bins + 1): 116 | val = borders_[i, j] 117 | if np.isnan(val): 118 | break 119 | borders_[i, j] = np.inf 120 | borders.append(borders_[i, :j + 1]) 121 | 122 | return borders 123 | 124 | 125 | def _uniform_1d(x_sample, max_bins, min_data_in_bin): 126 | x_sample = numba_preprocess_1d(x_sample) 127 | bins = np.unique(x_sample)[:-1] 128 | 129 | if len(bins) > (max_bins - 1): 130 | # get uniform 131 | bins = np.linspace(x_sample[0], x_sample[-1], max_bins + 1)[1:-1].astype(x_sample.dtype) 132 | 133 | return bins 134 | 135 | 136 | numba_uniform_1d = njit(q1d_sign, parallel=False)(_uniform_1d) 137 | 138 | 139 | def _uniquant_1d(x_sample, max_bins, min_data_in_bin): 140 | x_sample = numba_preprocess_1d(x_sample) 141 | bins = np.unique(x_sample)[:-1] 142 | 143 | if len(bins) > (max_bins - 1): 144 | # get uniform 145 | max_bins_u = max_bins // 2 146 | bins_u = np.linspace(x_sample[0], x_sample[-1], max_bins_u + 1)[1:-1].astype(x_sample.dtype) 147 | # get quantile 148 | max_bins_q = max_bins - max_bins_u 149 | grid = (np.linspace(0, 1, max_bins_q + 1) * x_sample.shape[0])[1:-1].astype(np.int64) 150 | bins_q = x_sample[grid] 151 | # merge 152 | bins = np.unique(np.concatenate((bins_u, bins_q))) 153 | 154 | return bins 155 | 156 | 157 | numba_uniquant_1d = njit(q1d_sign, parallel=False)(_uniquant_1d) 158 | -------------------------------------------------------------------------------- /py_boost/sampling/__init__.py: -------------------------------------------------------------------------------- 1 | """Provides strategies to row/columns sampling""" 2 | -------------------------------------------------------------------------------- /py_boost/sampling/bagging.py: -------------------------------------------------------------------------------- 1 | """Basic sampling strategy""" 2 | 3 | import numpy as np 4 | try: 5 | import cupy as cp 6 | except Exception: 7 | pass 8 | from ..callbacks.callback import Callback 9 | 10 | 11 | class BaseSampler(Callback): 12 | """Random uniform rows/columns sampler""" 13 | 14 | def __init__(self, sample=1, axis=0): 15 | """ 16 | 17 | Args: 18 | sample: subsample to select at each iteration 19 | axis: int, 0 for rows, 1 for columns 20 | """ 21 | self.sample = sample 22 | self.axis = axis 23 | self.length = None 24 | self.valid_sl = None 25 | self.indexer = None 26 | 27 | def before_train(self, build_info): 28 | """Create indexers 29 | 30 | Args: 31 | build_info: dict 32 | 33 | Returns: 34 | 35 | """ 36 | self.length = build_info['data']['train']['features_gpu'].shape[self.axis] 37 | self.indexer = cp.arange(self.length, dtype=cp.uint64) 38 | if self.sample < 1: 39 | self.valid_sl = cp.zeros(self.length, dtype=cp.bool_) 40 | self.valid_sl[:max(1, int(self.length * self.sample))] = True 41 | 42 | 
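    # Illustrative usage note: samplers are not called directly by the user. Assuming
    # the GradientBoosting API shown in the tutorials, a column sampler instance (or a
    # plain float rate) can be passed via the `colsample` argument, e.g.:
    #
    #     from py_boost import GradientBoosting
    #     model = GradientBoosting('mse', colsample=BaseSampler(sample=0.8, axis=1))
    #
    # The tree builder then calls the sampler (see __call__ below) at each iteration
    # to obtain the column indexer for the new tree.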
def before_iteration(self, build_info): 43 | """Shuffle indexers 44 | 45 | Args: 46 | build_info: dict 47 | 48 | Returns: 49 | 50 | """ 51 | if self.sample < 1: 52 | cp.random.shuffle(self.valid_sl) 53 | 54 | def __call__(self): 55 | """Get the last actual indexer 56 | 57 | Returns: 58 | 59 | """ 60 | if self.sample == 1: 61 | return self.indexer 62 | 63 | return self.indexer[self.valid_sl] 64 | 65 | def after_train(self, build_info): 66 | """Clean the state 67 | 68 | Args: 69 | build_info: 70 | 71 | Returns: 72 | 73 | """ 74 | self.__init__(sample=self.sample, axis=self.axis) 75 | 76 | 77 | class MVSSampler(Callback): 78 | """ 79 | MVS rows sampler proposed in 80 | https://proceedings.neurips.cc/paper/2019/file/5c8cb735a1ce65dac514233cbd5576d6-Paper.pdf 81 | """ 82 | 83 | def __init__(self, sample=0.1, lmbd='auto', grid_search_steps=100, grid_multiplier=100): 84 | """ 85 | 86 | Args: 87 | sample: float, subsample 88 | lmbd: float or 'auto', lambda hyperparameter 89 | grid_search_steps: float, cut off search steps 90 | grid_multiplier: float, cut off search multiplier 91 | """ 92 | self.sample = sample 93 | self.lmbd = lmbd 94 | self.grid_search_steps = grid_search_steps 95 | self.grid_multiplier = grid_multiplier 96 | self.indexer = None 97 | 98 | def get_probs(self, reg_grad): 99 | 100 | min_ = reg_grad.min() 101 | 102 | grid = cp.linspace(min_, min_ * self.grid_multiplier, self.grid_search_steps, dtype=cp.float32)[cp.newaxis, :] 103 | 104 | probs = cp.clip(reg_grad[:, cp.newaxis] / grid, 0, 1) 105 | sample_rates = probs.mean(axis=0) 106 | best_idx = cp.abs(sample_rates - self.sample).argmin() 107 | 108 | return probs[:, best_idx] 109 | 110 | def before_train(self, build_info): 111 | 112 | return 113 | 114 | def before_iteration(self, build_info): 115 | 116 | train = build_info['data']['train'] 117 | grad, hess = train['grad'], train['hess'] 118 | 119 | if self.lmbd == 'auto': 120 | lmbd = ((grad.sum() / hess.sum()) ** 2).sum() 121 | else: 122 | lmbd = self.lmbd 123 | 124 | mult = grad.shape[1] / hess.shape[1] 125 | 126 | reg_grad = cp.sqrt((grad ** 2).sum(axis=1) + lmbd * (hess ** 2).sum(axis=1) * mult) 127 | 128 | probs = self.get_probs(reg_grad) 129 | 130 | build_info['data']['train']['grad'] = grad / probs[:, cp.newaxis] 131 | sl = probs >= cp.random.rand(grad.shape[0], dtype=cp.float32) 132 | self.indexer = cp.arange(grad.shape[0], dtype=cp.uint64)[sl] 133 | 134 | def __call__(self, *args, **kwargs): 135 | 136 | return self.indexer 137 | 138 | def after_train(self, build_info): 139 | 140 | self.indexer = None 141 | 142 | 143 | class ColumnImportanceSampler(Callback): 144 | """ 145 | This class implements a sampling strategy, 146 | that sample columns in proportion to thier importance at each step 147 | """ 148 | 149 | def __init__(self, rate=0.5, smooth=0.1, 150 | update_freq=10, inverse=False, n_force=None, imp_type='split'): 151 | """ 152 | 153 | Args: 154 | rate: float, sampling rate 155 | smooth: float, smoothing parameter 156 | update_freq: int importance update frequency 157 | inverse: inverse the probability of sampling 158 | n_force: int or None, number of feats to ignore by sample (always select), counts from the end of data 159 | imp_type: str, importance type 160 | 161 | Returns: 162 | 163 | """ 164 | self.rate = rate 165 | self.smooth = smooth 166 | self.update_freq = update_freq 167 | self.inverse = inverse 168 | self.n_force = n_force 169 | self.imp_type = imp_type 170 | self.p = None 171 | self.imp = None 172 | 173 | def update_importance(self, model): 174 | 175 | if 
self.imp is None: 176 | self.imp = model.get_feature_importance(self.imp_type) 177 | return self.imp 178 | 179 | for tree in model.models[-self.update_freq:]: 180 | if self.imp_type == 'split': 181 | self.imp += tree.feature_importance_split 182 | else: 183 | self.imp += tree.feature_importance_gain 184 | 185 | return self.imp 186 | 187 | def before_iteration(self, build_info): 188 | """ 189 | Define what should be doe before each iteration 190 | """ 191 | # Update feature importance 192 | num_iter = build_info['num_iter'] 193 | 194 | if (num_iter % self.update_freq) == 0: 195 | # update probabilities with actual importance 196 | p = self.update_importance(build_info['model']) + 1e-3 197 | 198 | if self.n_force is not None: 199 | p = p[:-self.n_force] 200 | 201 | p = cp.asarray(p) / (p.sum()) 202 | # inverse if needed 203 | if self.inverse: 204 | p = 1 - p 205 | p = p / p.sum() 206 | # apply smoothing 207 | self.p = p * (1 - self.smooth) + cp.ones_like(p) * self.smooth / p.shape[0] 208 | 209 | def __call__(self): 210 | """ 211 | Method should return the array of indices, that will be used 212 | to grow the tree at the current step 213 | """ 214 | # Sample rows 215 | n = self.p.shape[0] 216 | index = cp.random.choice(cp.arange(n, dtype=cp.uint64), 217 | size=int(self.rate * n), p=self.p) 218 | 219 | if self.n_force is not None: 220 | index = cp.concatenate([index, cp.arange(n, n + self.n_force, dtype=cp.uint64)]) 221 | 222 | return index 223 | 224 | def after_train(self, build_info): 225 | 226 | self.p = None 227 | self.imp = None 228 | -------------------------------------------------------------------------------- /py_boost/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """Provides auxiliary utilities""" 2 | -------------------------------------------------------------------------------- /py_boost/utils/logging.py: -------------------------------------------------------------------------------- 1 | """Utils for logging.""" 2 | 3 | import io 4 | import os 5 | import sys 6 | 7 | import logging 8 | from .. import _logger 9 | 10 | formatter_debug = logging.Formatter("%(asctime)s\t[%(levelname)s]\t%(pathname)s.%(funcName)s:%(lineno)d\t%(message)s") 11 | formatter_default = logging.Formatter("[%(asctime)s] %(message)s", "%H:%M:%S") 12 | 13 | INFO2 = 17 14 | INFO3 = 13 15 | 16 | 17 | def add_logging_level(levelName, levelNum, methodName=None): 18 | """ 19 | Comprehensively adds a new logging level to the `logging` module and the 20 | currently configured logging class. 21 | `levelName` becomes an attribute of the `logging` module with the value 22 | `levelNum`. `methodName` becomes a convenience method for both `logging` 23 | itself and the class returned by `logging.getLoggerClass()` (usually just 24 | `logging.Logger`). If `methodName` is not specified, `levelName.lower()` is 25 | used. 
26 | To avoid accidental clobberings of existing attributes, this method will 27 | raise an `AttributeError` if the level name is already an attribute of the 28 | `logging` module or if the method name is already present 29 | Example 30 | ------- 31 | >>> addLoggingLevel('TRACE', logging.DEBUG - 5) 32 | >>> logging.getLogger(__name__).setLevel("TRACE") 33 | >>> logging.getLogger(__name__).trace('that worked') 34 | >>> logging.trace('so did this') 35 | >>> logging.TRACE 36 | 5 37 | """ 38 | assert (levelNum > 0) and (levelNum < 50) 39 | if not methodName: 40 | methodName = levelName.lower() 41 | 42 | if hasattr(logging, levelName): 43 | if levelNum == logging.__dict__[levelName]: 44 | print("Level \"{}: {}\" already defined, skipping...".format(levelName, levelNum)) 45 | return 46 | else: 47 | raise AttributeError("{} already defined in logging module".format(levelName)) 48 | if hasattr(logging, methodName): 49 | raise AttributeError("{} already defined in logging module".format(methodName)) 50 | if hasattr(logging.getLoggerClass(), methodName): 51 | raise AttributeError("{} already defined in logger class".format(methodName)) 52 | 53 | def logForLevel(self, message, *args, **kwargs): 54 | if self.isEnabledFor(levelNum): 55 | self._log(levelNum, message, args, **kwargs) 56 | 57 | def logToRoot(message, *args, **kwargs): 58 | logging.log(levelNum, message, *args, **kwargs) 59 | 60 | logging.addLevelName(levelNum, levelName) 61 | setattr(logging, levelName, levelNum) 62 | setattr(logging.getLoggerClass(), methodName, logForLevel) 63 | setattr(logging, methodName, logToRoot) 64 | 65 | 66 | add_logging_level("INFO2", INFO2) 67 | add_logging_level("INFO3", INFO3) 68 | 69 | 70 | class LoggerStream(io.IOBase): 71 | def __init__(self, logger, verbose_eval=100) -> None: 72 | super().__init__() 73 | self.logger = logger 74 | self.verbose_eval = verbose_eval 75 | self.counter = 1 76 | 77 | def write(self, message): 78 | if message == "\n": 79 | return 80 | iter_num = message.split("\t")[0] 81 | if (iter_num == "[1]") or (iter_num == "0:") or ((iter_num[-1] != "]") and (iter_num[-1] != ":")): 82 | self.logger.info3(message.rstrip()) 83 | return 84 | 85 | if self.counter < self.verbose_eval - 1: 86 | self.logger.debug(message.rstrip()) 87 | self.counter += 1 88 | else: 89 | self.logger.info3(message.rstrip()) 90 | self.counter = 0 91 | 92 | 93 | def verbosity_to_loglevel(verbosity: int, extended=True): 94 | if extended: 95 | if verbosity <= 0: 96 | log_level = logging.ERROR 97 | elif verbosity == 1: 98 | log_level = logging.INFO 99 | elif verbosity == 2: 100 | log_level = logging.INFO2 101 | elif verbosity == 3: 102 | log_level = logging.INFO3 103 | else: 104 | log_level = logging.DEBUG 105 | else: 106 | if verbosity <= 0: 107 | log_level = logging.ERROR 108 | elif verbosity == 1: 109 | log_level = logging.INFO 110 | else: 111 | log_level = logging.DEBUG 112 | 113 | return log_level 114 | 115 | 116 | def get_stdout_level(): 117 | for handler in _logger.handlers: 118 | if type(handler) == logging.StreamHandler: 119 | return handler.level 120 | return _logger.getEffectiveLevel() 121 | 122 | 123 | def set_stdout_level(level): 124 | _logger.setLevel(logging.DEBUG) 125 | 126 | has_console_handler = False 127 | 128 | for handler in _logger.handlers: 129 | if type(handler) == logging.StreamHandler: 130 | if handler.level == level: 131 | has_console_handler = True 132 | else: 133 | _logger.handlers.remove(handler) 134 | 135 | if not has_console_handler: 136 | handler = logging.StreamHandler(sys.stdout) 137 | 
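        # no console handler at the requested level exists yet: create one writing to
        # stdout, give it the default "[HH:MM:SS] message" formatter and register it
        # on the package logger below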
handler.setFormatter(formatter_default) 138 | handler.setLevel(level) 139 | 140 | _logger.addHandler(handler) 141 | 142 | 143 | def add_filehandler(filename: str, level=logging.DEBUG): 144 | if filename: 145 | has_file_handler = False 146 | 147 | for handler in _logger.handlers: 148 | if type(handler) == logging.FileHandler: 149 | if handler.baseFilename == filename or handler.baseFilename == os.path.join(os.getcwd(), filename): 150 | has_file_handler = True 151 | else: 152 | _logger.handlers.remove(handler) 153 | 154 | if not has_file_handler: 155 | file_handler = logging.FileHandler(filename, mode="w") 156 | 157 | if level == logging.DEBUG: 158 | file_handler.setFormatter(formatter_debug) 159 | else: 160 | file_handler.setFormatter(formatter_default) 161 | 162 | file_handler.setLevel(level) 163 | 164 | # if handler_filter: 165 | # file_handler.addFilter(handler_filter) 166 | 167 | _logger.addHandler(file_handler) 168 | else: 169 | for handler in _logger.handlers: 170 | if type(handler) == logging.FileHandler: 171 | _logger.handlers.remove(handler) 172 | 173 | 174 | class DuplicateFilter(object): 175 | def __init__(self): 176 | self.msgs = set() 177 | 178 | def filter(self, record): 179 | rv = record.msg not in self.msgs 180 | self.msgs.add(record.msg) 181 | return rv 182 | -------------------------------------------------------------------------------- /py_boost/utils/onnx_wrapper.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import numpy as np 4 | import onnx 5 | import onnxruntime 6 | import tqdm 7 | from onnx import helper, TensorProto 8 | from onnx.checker import check_model 9 | 10 | 11 | def pb_to_onnx(model, output, fltr=None, post_transform='NONE'): 12 | """Parse the model to ONNX format 13 | 14 | Args: 15 | model: Py-Boost Ensemble 16 | output: str, file path 17 | fltr: str or None, if subset of outputs need to be stored 18 | post_transform: str, one of 'NONE', 'SOFTMAX', 'LOGISTIC', 'SOFTMAX_ZERO', or 'PROBIT' 19 | 20 | Returns: 21 | 22 | """ 23 | model.to_cpu() 24 | 25 | nout = model.base_score.shape[0] 26 | if fltr is None: 27 | fltr = np.arange(nout) 28 | else: 29 | fltr_ = np.asarray(fltr) 30 | fltr = np.sort(fltr_) 31 | if (fltr != fltr_).any(): 32 | warnings.warn( 33 | 'Selected outputs order changed. 
Predictions will keep the original model order (fltr array is sorted)' 34 | ) 35 | 36 | nout = len(fltr) 37 | 38 | parsed_ensemble = { 39 | 40 | # const for ensemble 41 | "base_values": model.base_score[fltr].tolist(), 42 | "n_targets": nout, 43 | "aggregate_function": "SUM", 44 | "post_transform": post_transform 45 | 46 | } 47 | 48 | nodes_attr = [ 49 | "nodes_treeids", "nodes_nodeids", "nodes_modes", "nodes_falsenodeids", 50 | "nodes_truenodeids", "nodes_featureids", "nodes_values", "nodes_missing_value_tracks_true" 51 | ] 52 | 53 | leaves_attr = [ 54 | "target_ids", "target_nodeids", "target_treeids", "target_weights" 55 | ] 56 | 57 | for key in nodes_attr + leaves_attr: 58 | parsed_ensemble[key] = [] 59 | 60 | k = 0 61 | for tree in tqdm.tqdm(model.models): 62 | 63 | g = 0 64 | for offset in tree.test_format_offsets: 65 | 66 | offset = offset * 4 67 | outputs = np.setdiff1d(fltr, np.nonzero(tree.group_index != g)[0]) 68 | # old id and new id 69 | nodes, n = [(0, 0)], 0 70 | 71 | while len(nodes) > 0: 72 | 73 | # placeholder for new nodes 74 | new_nodes = [] 75 | 76 | # first - adding the node 77 | for old, new in nodes: 78 | 79 | parsed_ensemble["nodes_treeids"].append(k) 80 | parsed_ensemble["nodes_nodeids"].append(new) 81 | 82 | if old >= 0: 83 | # case - split node 84 | i = old * 4 85 | f, s, l, r = tree.test_format[offset + i: offset + i + 4] 86 | f, l, r = int(f), int(l), int(r) 87 | 88 | parsed_ensemble["nodes_modes"].append("BRANCH_LEQ") 89 | # check NaN condition 90 | nan_left = f < 0 91 | f = abs(f) - 1 92 | 93 | parsed_ensemble["nodes_truenodeids"].append(n + 1) 94 | parsed_ensemble["nodes_falsenodeids"].append(n + 2) 95 | parsed_ensemble["nodes_missing_value_tracks_true"].append(nan_left) 96 | parsed_ensemble["nodes_featureids"].append(f) 97 | parsed_ensemble["nodes_values"].append(float(s)) 98 | new_nodes.extend([(l, n + 1), (r, n + 2)]) 99 | n = n + 2 100 | 101 | else: 102 | # case leaf node 103 | leaf = abs(old) - 1 104 | parsed_ensemble["nodes_modes"].append("LEAF") 105 | # add dummy children info 106 | parsed_ensemble["nodes_truenodeids"].append(-1) 107 | parsed_ensemble["nodes_falsenodeids"].append(-1) 108 | parsed_ensemble["nodes_missing_value_tracks_true"].append(False) 109 | parsed_ensemble["nodes_featureids"].append(-1) 110 | parsed_ensemble["nodes_values"].append(0.0) 111 | # add leaf info 112 | for j, o in zip(outputs, np.searchsorted(fltr, outputs)): 113 | parsed_ensemble["target_ids"].append(o) 114 | parsed_ensemble["target_nodeids"].append(new) 115 | parsed_ensemble["target_treeids"].append(k) 116 | parsed_ensemble["target_weights"].append(float(tree.values[leaf, j])) 117 | 118 | nodes = new_nodes 119 | 120 | k += 1 121 | g += 1 122 | 123 | # create a model 124 | node_proto = helper.make_node( 125 | op_type="TreeEnsembleRegressor", 126 | inputs=["X"], outputs=["Y"], 127 | domain='ai.onnx.ml', 128 | ) 129 | node_proto.attribute.extend([helper.make_attribute(x, parsed_ensemble[x]) for x in parsed_ensemble]) 130 | 131 | X_ft = helper.make_tensor_value_info('X', TensorProto.FLOAT, [None, None]) 132 | Y_out = helper.make_tensor_value_info('Y', TensorProto.FLOAT, [None, nout]) 133 | 134 | graph_def = helper.make_graph( 135 | [node_proto], # nodes 136 | 'py-boost-ensemble', # name 137 | [X_ft], # inputs 138 | [Y_out] # outputs 139 | ) 140 | 141 | model_def = helper.make_model( 142 | graph_def, producer_name="Py-Boost", 143 | opset_imports=[ 144 | onnx.helper.make_opsetid('ai.onnx.ml', 3), 145 | onnx.helper.make_opsetid('', 16), 146 | ] 147 | ) 148 | 149 | 
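    # validate the assembled graph before writing it to disk; check_model raises if the
    # TreeEnsembleRegressor attributes are inconsistent. The saved file is a regular ONNX
    # model, so it could later be loaded without py_boost, e.g. (illustrative):
    #
    #     sess = onnxruntime.InferenceSession(output, providers=["CPUExecutionProvider"])
    #     preds = sess.run(['Y'], {'X': X.astype(np.float32)})[0]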
check_model(model_def) 150 | 151 | with open(output, "wb") as f: 152 | f.write(model_def.SerializeToString()) 153 | 154 | return 155 | 156 | 157 | class ONNXPredictor: 158 | """ 159 | ONNX parser and CPU predictor. Could be used for inference of Py-Boost model on CPU 160 | """ 161 | 162 | def __init__(self, model, filepath, postprocess_fn=None, fltr=None, post_transform='NONE'): 163 | """ 164 | 165 | Args: 166 | model: Py-Boost model 167 | filepath: str, filepath to save 168 | postprocess_fn: Callable or None, python postprocess_fn. If passed, model postprocessing will be ignored 169 | and replaced 170 | fltr: Sequence, indices to use for inference if needed to filter 171 | post_transform: str, one of 'NONE', ‘SOFTMAX,’ ‘LOGISTIC,’ ‘SOFTMAX_ZERO,’ or ‘PROBIT’. 172 | Built-in ONNX post_transform function. If passed, both model postprocessing and python postprocess_fn 173 | will be ignored 174 | """ 175 | if model is not None: 176 | pb_to_onnx(model, output=filepath, fltr=fltr, post_transform=post_transform) 177 | 178 | self.filepath = filepath 179 | 180 | # store post transform fn 181 | if post_transform != 'NONE': 182 | self.postprocess_fn = None 183 | else: 184 | self.postprocess_fn = postprocess_fn 185 | if postprocess_fn is None and model is not None: 186 | self.postprocess_fn = model.postprocess_fn 187 | 188 | self.sess = None 189 | self._start_session() 190 | 191 | @classmethod 192 | def from_onnx(cls, filepath, postprocess_fn=None): 193 | """Create ONNX predictor from parsed model 194 | 195 | Args: 196 | filepath: str, file path 197 | postprocess_fn: Callable or None 198 | 199 | Returns: 200 | 201 | """ 202 | return cls(None, filepath, postprocess_fn=postprocess_fn, fltr=None, post_transform='NONE') 203 | 204 | def _start_session(self): 205 | """Start inference session 206 | 207 | Returns: 208 | 209 | """ 210 | self.sess = onnxruntime.InferenceSession( 211 | self.filepath, 212 | providers=["CPUExecutionProvider"] 213 | ) 214 | 215 | return 216 | 217 | def predict(self, X): 218 | """Predict with ONNX runtime 219 | 220 | Args: 221 | X: np.ndarray, feature matrix 222 | 223 | Returns: 224 | np.ndarray 225 | """ 226 | 227 | X = X.astype(np.float32, copy=False) 228 | preds = self.sess.run(['Y'], {'X': X})[0] 229 | 230 | if self.postprocess_fn is not None: 231 | preds = self.postprocess_fn(preds) 232 | 233 | return preds 234 | -------------------------------------------------------------------------------- /py_boost/utils/tl_wrapper.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import joblib 4 | from tqdm import tqdm 5 | import treelite 6 | import treelite_runtime as tl_run 7 | 8 | 9 | def _create_node_deprecated(tree, node_id): 10 | """(DEPRECATED) Create a node of treelite tree 11 | Args: 12 | tree: Py-Boost Tree, tree to parse 13 | node_id: int, node index 14 | Returns: 15 | dict, args of treelite.ModelBuilder.Tree .set_numerical_test_node or .set_leaf_node 16 | """ 17 | feature_id = tree.feats[0][node_id] 18 | 19 | if feature_id >= 0: 20 | left, right = tuple(tree.split[0][node_id]) 21 | node = { 22 | 23 | 'feature_id': feature_id, 24 | 'opname': '<=', 25 | 'threshold': tree.val_splits[0][node_id], 26 | 'default_left': tree.nans[0][node_id], 27 | 'left_child_key': left, 28 | 'right_child_key': right, 29 | } 30 | 31 | return node, left, right 32 | 33 | return {'value': tree.values[tree.leaves[node_id][0]]}, None, None 34 | 35 | 36 | def create_node(tree, node_id, id_gen): 37 | """Create a node of treelite tree 38 | 39 | Args: 
40 | tree: Py-Boost Tree, tree to parse 41 | node_id: int, node index in original format tree 42 | id_gen: generator, new id generator 43 | 44 | Returns: 45 | dict, args of treelite.ModelBuilder.Tree .set_numerical_test_node 46 | """ 47 | 48 | assert node_id >= 0 49 | 50 | feature_id = int(tree.test_format[node_id * 4]) 51 | nan_left = feature_id < 0 52 | feature_id = abs(feature_id) - 1 53 | 54 | left = int(tree.test_format[node_id * 4 + 2]) 55 | right = int(tree.test_format[node_id * 4 + 3]) 56 | new_id_left = next(id_gen) 57 | new_id_right = next(id_gen) 58 | node = { 59 | 'feature_id': feature_id, 60 | 'opname': '<=', 61 | 'threshold': tree.test_format[node_id * 4 + 1], 62 | 'default_left': nan_left, 63 | 'left_child_key': new_id_left, 64 | 'right_child_key': new_id_right, 65 | } 66 | 67 | return node, (left, new_id_left), (right, new_id_right) 68 | 69 | 70 | def parse_pb_tree(tree): 71 | """Parse s single Py-Boost Tree to treelite.ModelBuilder.Tree format 72 | 73 | Args: 74 | tree: Py-Boost tree 75 | 76 | Returns: 77 | treelite.ModelBuilder.Tree 78 | """ 79 | assert tree.ngroups == 1, 'Models with more than 1 group are not currently supported' 80 | 81 | def id_generator(): 82 | id_num = 1 83 | while True: 84 | yield id_num 85 | id_num += 1 86 | id_gen = id_generator() 87 | 88 | tl_tree = treelite.ModelBuilder.Tree() 89 | curr_nodes = [(0, 0)] # (old_id, new_id) 90 | 91 | while len(curr_nodes) > 0: 92 | old_id, new_id = curr_nodes.pop(0) 93 | curr_node, left, right = create_node(tree, old_id, id_gen) 94 | tl_tree[new_id].set_numerical_test_node(**curr_node) 95 | 96 | if left[0] >= 0: 97 | curr_nodes.append(left) 98 | else: 99 | tl_tree[left[1]].set_leaf_node(tree.values[abs(left[0]) - 1]) 100 | if right[0] >= 0: 101 | curr_nodes.append(right) 102 | else: 103 | tl_tree[right[1]].set_leaf_node(tree.values[abs(right[0]) - 1]) 104 | 105 | tl_tree[0].set_root() 106 | return tl_tree 107 | 108 | 109 | def _parse_pb_tree_deprecated(tree): 110 | """Parse s single Py-Boost Tree to treelite.ModelBuilder.Tree format 111 | 112 | Args: 113 | tree: Py-Boost tree 114 | 115 | Returns: 116 | treelite.ModelBuilder.Tree 117 | """ 118 | assert tree.ngroups == 1, 'Models with more than 1 group are not currently supported' 119 | 120 | tl_tree = treelite.ModelBuilder.Tree() 121 | curr_nodes = [0] 122 | 123 | while len(curr_nodes) > 0: 124 | 125 | next_nodes = [] 126 | 127 | for node_id in curr_nodes: 128 | 129 | curr_node, left, right = _create_node_deprecated(tree, node_id) 130 | # add node 131 | tl_tree[node_id] 132 | if left is not None: 133 | tl_tree[node_id].set_numerical_test_node(**curr_node) 134 | next_nodes.extend([left, right]) 135 | else: 136 | tl_tree[node_id].set_leaf_node(curr_node['value']) 137 | 138 | curr_nodes = next_nodes 139 | 140 | tl_tree[0].set_root() 141 | 142 | return tl_tree 143 | 144 | 145 | def convert_pb_to_treelite(model): 146 | """Convert Py-Boost Ensemble instance to the treelite.ModelBuilder.Tree 147 | 148 | Args: 149 | model: Py-Boost Tree 150 | 151 | Returns: 152 | treelite.ModelBuilder.Tree 153 | """ 154 | nfeats = model.nfeats 155 | ngroups = model.models[0].values.shape[1] 156 | 157 | builder = treelite.ModelBuilder( 158 | num_feature=nfeats, 159 | num_class=ngroups, 160 | pred_transform='identity_multiclass' if ngroups > 1 else 'identity' 161 | ) 162 | 163 | for tree in tqdm(model.models): 164 | builder.append(parse_pb_tree(tree)) 165 | 166 | # add bias tree 167 | bias_tree = treelite.ModelBuilder.Tree() 168 | bias_tree[0].set_numerical_test_node(**{ 169 | 170 | 
'feature_id': 0, 171 | 'opname': '<', 172 | 'threshold': 0, 173 | 'default_left': True, 174 | 'left_child_key': 1, 175 | 'right_child_key': 2 176 | }) 177 | 178 | for i in range(1, 3): 179 | bias_tree[i].set_leaf_node(model.base_score) 180 | 181 | bias_tree[0].set_root() 182 | builder.append(bias_tree) 183 | 184 | return builder 185 | 186 | 187 | class TLCompiledPredictor: 188 | """ 189 | Compiled treelite model saved to predict 190 | """ 191 | 192 | @staticmethod 193 | def _default_postprocess_fn(x): 194 | return x 195 | 196 | def __init__(self, libpath, nthread=None, verbose=False, postprocess_fn=None): 197 | """ 198 | 199 | Args: 200 | libpath: str, path to compiled model 201 | nthread: int or None, number of threads to use 202 | verbose: bool, verbosity mode 203 | postprocess_fn: Callable or None, prediction postprocessing function 204 | """ 205 | self.verbose = verbose 206 | self.nthread = nthread 207 | self.libpath = None 208 | self.set_libpath(libpath) 209 | 210 | self.postprocess_fn = self._default_postprocess_fn 211 | if postprocess_fn is not None: 212 | self.postprocess_fn = postprocess_fn 213 | 214 | def predict(self, X): 215 | """Make prediction 216 | 217 | Args: 218 | X: np.ndarray 219 | 220 | Returns: 221 | np.ndarray 222 | """ 223 | pred = self.predictor.predict(tl_run.DMatrix(X)) 224 | return self.postprocess_fn(pred) 225 | 226 | def set_libpath(self, libpath=None, nthread=None): 227 | """Update library path 228 | 229 | Args: 230 | libpath: 231 | nthread: int, num threads 232 | 233 | Returns: 234 | 235 | """ 236 | if libpath is None: 237 | libpath = self.libpath 238 | if nthread is None: 239 | nthread = self.nthread 240 | self.libpath = os.path.abspath(libpath) 241 | self.predictor = tl_run.Predictor(self.libpath, nthread=nthread, verbose=self.verbose) 242 | 243 | def dump(self, filename): 244 | """Dump instance 245 | 246 | Args: 247 | filename: str, path to save 248 | 249 | Returns: 250 | 251 | """ 252 | self.predictor = None 253 | joblib.dump(self, filename) 254 | 255 | @staticmethod 256 | def load(filename): 257 | """Load instance 258 | 259 | Args: 260 | filename: str, filename 261 | 262 | Returns: 263 | TLCompiledPredictor 264 | """ 265 | predictor = joblib.load(filename) 266 | predictor.set_libpath() 267 | 268 | return predictor 269 | 270 | 271 | class TLPredictor: 272 | """ 273 | Treelite predictor. Could be use for inference via built-in treelite utils 274 | or to compilation to get TLCompiledPredictor 275 | """ 276 | 277 | def __init__(self, model, postprocess_fn=None): 278 | """ 279 | 280 | Args: 281 | model: Py-Boost Ensemble 282 | postprocess_fn: Callable or None, postprocessing function 283 | """ 284 | model.to_cpu() 285 | self.tl_model = convert_pb_to_treelite(model).commit() 286 | 287 | self.postprocess_fn = postprocess_fn 288 | if postprocess_fn is None: 289 | self.postprocess_fn = model.postprocess_fn 290 | 291 | def set_tl_model(self, tl_model): 292 | """Update underlying treelite model 293 | 294 | Args: 295 | tl_model: 296 | 297 | Returns: 298 | 299 | """ 300 | self.tl_model = tl_model 301 | 302 | def compile( 303 | self, 304 | toolchain, 305 | libpath, 306 | params=None, 307 | compiler='ast_native', 308 | verbose=False, 309 | nthread=None, 310 | options=None, 311 | predictor_params=None 312 | ): 313 | """Compile model for faster inference. 
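        A typical call (assuming a local gcc toolchain is available) looks like
        `TLPredictor(model).compile('gcc', './model.so')`, which exports a compiled
        shared library and returns a TLCompiledPredictor bound to it.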
For the details please see 314 | https://treelite.readthedocs.io/en/latest/tutorials/first.html 315 | 316 | Args: 317 | toolchain: 318 | libpath: 319 | params: 320 | compiler: 321 | verbose: 322 | nthread: 323 | options: 324 | predictor_params: 325 | 326 | Returns: 327 | 328 | """ 329 | 330 | params = {} if params is None else params 331 | params = {**{'parallel_comp': os.cpu_count(), }, **params} 332 | 333 | self.tl_model.export_lib(toolchain, libpath, 334 | params, compiler, verbose, nthread, options) 335 | 336 | if predictor_params is None: 337 | predictor_params = {} 338 | predictor_params = {**{'nthread': nthread}, **predictor_params} 339 | 340 | predictor = TLCompiledPredictor(libpath, postprocess_fn=self.postprocess_fn, **predictor_params) 341 | return predictor 342 | 343 | def predict(self, X, nthread=None): 344 | """Make prediction 345 | 346 | Args: 347 | X: np.ndarray 348 | nthread: int/None, used for prediction 349 | 350 | Returns: 351 | np.ndarray 352 | """ 353 | if nthread is None: 354 | nthread = os.cpu_count() 355 | pred = treelite.gtil.predict(self.tl_model, X, nthread=nthread) 356 | return self.postprocess_fn(pred) 357 | 358 | def dump(self, dirname, rewrite=False): 359 | """Dump treelite Model and predictor instance 360 | 361 | Args: 362 | dirname: str, path to save 363 | rewrite: bool, possible to overwrite 364 | 365 | Returns: 366 | 367 | """ 368 | os.makedirs(dirname, exist_ok=rewrite) 369 | temp = self.tl_model 370 | self.tl_model = None 371 | temp.serialize(os.path.join(dirname, 'model.mod')) 372 | joblib.dump(self, os.path.join(dirname, 'predictor.pkl')) 373 | self.tl_model = temp 374 | 375 | @staticmethod 376 | def load(dirname): 377 | """Load predictor from folder 378 | 379 | Args: 380 | dirname: str, path 381 | 382 | Returns: 383 | TLPredictor 384 | """ 385 | predictor = joblib.load(os.path.join(dirname, 'predictor.pkl')) 386 | predictor.set_tl_model(treelite.Model.deserialize(os.path.join(dirname, 'model.mod'))) 387 | 388 | return predictor 389 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "Py-Boost" 3 | version = "0.5.1" 4 | description = "Python based GBDT" 5 | 6 | authors = [ 7 | "Vakhrushev Anton ", 8 | "Iosipoi Leonid", 9 | "Sergey Kupriyanov" 10 | ] 11 | 12 | readme = "README.md" 13 | 14 | repository = "https://github.com/sb-ai-lab/Py-Boost" 15 | classifiers = [ 16 | "Programming Language :: Python :: 3.8", 17 | "Programming Language :: Python :: 3.9", 18 | "Programming Language :: Python :: 3.10", 19 | "Programming Language :: Python :: 3.11", 20 | "Operating System :: OS Independent", 21 | "Intended Audience :: Science/Research", 22 | "Development Status :: 3 - Alpha", 23 | "Environment :: Console", 24 | "Natural Language :: English", 25 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 26 | ] 27 | 28 | [tool.poetry.dependencies] 29 | 30 | python = ">=3.8, <3.12" 31 | 32 | scikit-learn = ">=1" 33 | numpy = "*" 34 | joblib = "*" 35 | numba = "*" 36 | ujson = '*' 37 | 38 | pandas = ">=1" 39 | onnx = ">=1.16, <2" 40 | onnxruntime = ">=1.16, <2" 41 | treelite = "^3" 42 | treelite_runtime = "^3" 43 | 44 | tqdm = ">=4.64.1" 45 | 46 | [build-system] 47 | requires = ["poetry-core>=1.0.0"] 48 | build-backend = "poetry.core.masonry.api" -------------------------------------------------------------------------------- /tutorials/Tutorial_3_Custom_features.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## This tutorial shows how to build custom features in py-boost" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Imports" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import os\n", 24 | "# Optional: set the device to run\n", 25 | "os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\"\n", 26 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n", 27 | "\n", 28 | "os.makedirs('../data', exist_ok=True)\n", 29 | "\n", 30 | "import joblib\n", 31 | "from sklearn.datasets import make_regression\n", 32 | "\n", 33 | "# simple case - just one class is used\n", 34 | "from py_boost import GradientBoosting " 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "### Generate dummy regression data" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "CPU times: user 2.34 s, sys: 1.7 s, total: 4.05 s\n", 54 | "Wall time: 849 ms\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "%%time\n", 60 | "X, y = make_regression(150000, 100, n_targets=10, random_state=42)\n", 61 | "\n", 62 | "# we need non negative targets for this example\n", 63 | "y = y - y.min(axis=0)\n", 64 | "\n", 65 | "X_test, y_test = X[:50000], y[:50000]\n", 66 | "X, y = X[-50000:], y[-50000:]" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "### Custom Loss\n", 74 | "\n", 75 | "As it was mentioned in Tutorial_1, not only string alias is valid value for the loss function, but also the instance of Loss class, which is parent class for all loss function\n", 76 | "\n", 77 | "#### Now let's build our own MSLE (https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_log_error.html) loss function\n", 78 | "\n", 79 | "**Note**: Actually we have the built-in MSLE, so you still could use strinng alias for it" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 3, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "import cupy as cp\n", 89 | "from py_boost.gpu.losses import Loss, Metric\n", 90 | "\n", 91 | "class CustomRMSLEMetric(Metric):\n", 92 | " \"\"\"First, let's define eval metric to estimate model quality while training\"\"\"\n", 93 | " \n", 94 | " def error(self, y_true, y_pred):\n", 95 | " \"\"\"\n", 96 | " The simpliest way do define a custom metric is to define .error method\n", 97 | " Just tell py_boost how to calculate error at the each point, for out case it is possible\n", 98 | " If it is not possible (for ex. 
ROC-AUC), you should define __call__ method\n", 99 | " See the Metric class for the details\n", 100 | " \n", 101 | " At that stage y_true is already in GPU memory, so we use CuPy to handle it.\n", 102 | " Usage is the same as NumPy, just replace np with cp\n", 103 | " \n", 104 | " Note: the metric is calculated against processed input (see CustomMSLELoss below)\n", 105 | " \"\"\"\n", 106 | " return (cp.log1p(y_true) - cp.log1p(y_pred)) ** 2\n", 107 | " \n", 108 | " def compare(self, v0 ,v1):\n", 109 | " \"\"\"\n", 110 | " The last required method is .compare\n", 111 | " It should return True if v0 metric value is better than v1, False othewise\n", 112 | " \"\"\"\n", 113 | " return v0 < v1\n", 114 | " \n", 115 | " def __call__(self, y_true, y_pred, sample_weight=None):\n", 116 | " \"\"\"\n", 117 | " We also update __call__ method to redefine default reduction with square\n", 118 | " \"\"\"\n", 119 | " return super().__call__(y_true, y_pred, sample_weight) ** .5\n", 120 | "\n", 121 | "\n", 122 | "class CustomMSLELoss(Loss):\n", 123 | " \"\"\"Custom MSLE Implementation\"\"\"\n", 124 | " \n", 125 | " def preprocess_input(self, y_true):\n", 126 | " \"\"\"\n", 127 | " This method defines, how raw target should be processed before the train starts\n", 128 | " We expect y_true has shape (n_samples, n_outputs)\n", 129 | " \n", 130 | " Here we will not do the actual preprocess, but just check if targets are non negative\n", 131 | " \n", 132 | " At that stage y_true is already in GPU memory, so we use CuPy to handle it.\n", 133 | " Usage is the same as NumPy, just replace np with cp\n", 134 | " \n", 135 | " Note: All metrics and losses will be computed with this preprocess target\n", 136 | " \"\"\"\n", 137 | " assert (y_true >= 0).all()\n", 138 | " return y_true\n", 139 | " \n", 140 | " def postprocess_output(self, y_pred):\n", 141 | " \"\"\"\n", 142 | " Since we modify the target variable, we also need method, that defines \n", 143 | " how to process model prediction\n", 144 | " \"\"\"\n", 145 | " \n", 146 | " return cp.expm1(y_pred)\n", 147 | " \n", 148 | " def get_grad_hess(self, y_true, y_pred):\n", 149 | " \"\"\"\n", 150 | " This method defines how to calculate gradients and hessians for given loss\n", 151 | " Note that training also supports sample_weight, but its applied outside the loss fn,\n", 152 | " so we don't need to handle it here\n", 153 | " \"\"\" \n", 154 | " # grad should have the same shape as y_pred\n", 155 | " grad = y_pred - cp.log1p(y_true)\n", 156 | " # NOTE: Input could be a matrix in multioutput case!\n", 157 | " # But anyway - hessians are ones for all of them\n", 158 | " # So, we just create (n_samples, 1) array of ones \n", 159 | " # and after that is will be broadcasted over all outputs\n", 160 | " # grad should have the same shape as y_pred or (n_samples, 1)\n", 161 | " hess = cp.ones((y_true.shape[0], 1), dtype=cp.float32)\n", 162 | " \n", 163 | " return grad, hess\n", 164 | "\n", 165 | " def base_score(self, y_true):\n", 166 | " \"\"\"\n", 167 | " One last thing we require to define is base score\n", 168 | " This method defines how to initialize an empty ensemble\n", 169 | " In simplies case it could be just an array of zeros\n", 170 | " But usualy it is better to boost from mean values\n", 171 | " Output shape should be (n_outputs, ) \n", 172 | " \n", 173 | " Note: y_true is already processed array here\n", 174 | " \n", 175 | " \"\"\"\n", 176 | " return cp.log1p(y_true).mean(axis=0)\n", 177 | " \n", 178 | " \n" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 
| "execution_count": 4, 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "name": "stdout", 188 | "output_type": "stream", 189 | "text": [ 190 | "[14:33:29] Stdout logging level is INFO.\n", 191 | "[14:33:29] GDBT train starts. Max iter 1000, early stopping rounds 100\n", 192 | "[14:33:30] Iter 0; Sample 0, score = 0.24603539557907483; \n", 193 | "[14:33:32] Iter 100; Sample 0, score = 0.1742483282481912; \n", 194 | "[14:33:35] Iter 200; Sample 0, score = 0.1342659688820449; \n", 195 | "[14:33:38] Iter 300; Sample 0, score = 0.10731344416487074; \n", 196 | "[14:33:40] Iter 400; Sample 0, score = 0.08782596307881492; \n", 197 | "[14:33:43] Iter 500; Sample 0, score = 0.07353079220891415; \n", 198 | "[14:33:46] Iter 600; Sample 0, score = 0.06300246387723472; \n", 199 | "[14:33:48] Iter 700; Sample 0, score = 0.05525294291245993; \n", 200 | "[14:33:51] Iter 800; Sample 0, score = 0.049518312947738174; \n", 201 | "[14:33:53] Iter 900; Sample 0, score = 0.045306569498698365; \n", 202 | "[14:33:56] Iter 999; Sample 0, score = 0.04221849120683567; \n", 203 | "CPU times: user 29.4 s, sys: 4.47 s, total: 33.9 s\n", 204 | "Wall time: 32.2 s\n" 205 | ] 206 | }, 207 | { 208 | "data": { 209 | "text/plain": [ 210 | "" 211 | ] 212 | }, 213 | "execution_count": 4, 214 | "metadata": {}, 215 | "output_type": "execute_result" 216 | } 217 | ], 218 | "source": [ 219 | "%%time\n", 220 | "model = GradientBoosting(CustomMSLELoss(), CustomRMSLEMetric(), lr=0.01, verbose=100, ntrees=1000)\n", 221 | "\n", 222 | "model.fit(X, y, eval_sets=[{'X': X_test, 'y': y_test},])" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 5, 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "data": { 232 | "text/plain": [ 233 | "(50000, 10)" 234 | ] 235 | }, 236 | "execution_count": 5, 237 | "metadata": {}, 238 | "output_type": "execute_result" 239 | } 240 | ], 241 | "source": [ 242 | "model.predict(X_test).shape" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "### Custom colsample strategy\n", 250 | "\n", 251 | "We could also redefine some other things. Let's see the example of creating our bagging strategy. Most of custom things should be done via Callbak. \n", 252 | "\n", 253 | "To create callback we should inherit Callbak class. There are 4 methods, that could be redefined:\n", 254 | " - before_train - outputs None\n", 255 | " - before_iteration - outputs None\n", 256 | " - after_train - outputs None\n", 257 | " - after_iteration - outputs bool - if training should be stopped after iteration\n", 258 | "\n", 259 | " Methods receive build_info - the state dict, that could be accessed and modifier\n", 260 | "\n", 261 | " Basic build info structure:\n", 262 | "\n", 263 | " build_info = {\n", 264 | " 'data': {\n", 265 | " 'train': {\n", 266 | " 'features_cpu': np.ndarray - raw feature matrix,\n", 267 | " 'features_gpu': cp.ndarray - uint8 quantized feature matrix on GPU,\n", 268 | " 'target': y - cp.ndarray - processed target variable on GPU,\n", 269 | " 'sample_weight': cp.ndarray - processed sample_weight on GPU or None,\n", 270 | " 'ensemble': cp.ndarray - current model prediction (with no postprocessing,\n", 271 | " ex. 
before sigmoid for logloss) on GPU,\n", 272 | "                'grad': cp.ndarray of gradients on GPU, before the first iteration - None,\n", 273 | "                'hess': cp.ndarray of hessians on GPU, before the first iteration - None,\n", 274 | "\n", 275 | "                'last_tree': {\n", 276 | "                    'leaves': cp.ndarray - node indices of the last trained tree,\n", 277 | "                    'preds': cp.ndarray - predictions of the last trained tree,\n", 278 | "                }\n", 279 | "\n", 280 | "            },\n", 281 | "            'valid': {\n", 282 | "                'features_cpu': the same as train, but a list, each element corresponds to a validation sample,\n", 283 | "                'features_gpu': ...,\n", 284 | "                'target': ...,\n", 285 | "                'sample_weight': ...,\n", 286 | "                'ensemble': ...,\n", 287 | "\n", 288 | "                'last_tree': {\n", 289 | "                    'leaves': ...,\n", 290 | "                    'preds': ...,\n", 291 | "                }\n", 292 | "\n", 293 | "            }\n", 294 | "        },\n", 295 | "        'borders': list of np.ndarray - list of quantization borders,\n", 296 | "        'model': GradientBoosting - the model that is being trained,\n", 297 | "        'mempool': cp.cuda.MemoryPool - memory pool used for training, can be used to free memory to prevent OOM,\n", 298 | "        'builder': DepthwiseTreeBuilder - the instance of the tree builder, contains training params,\n", 299 | "\n", 300 | "        'num_iter': int, current iteration number,\n", 301 | "        'iter_scores': list of float - metric values for all validation sets at the last iteration,\n", 302 | "    }\n" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 6, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "import cupy as cp\n", 312 | "from py_boost.callbacks.callback import Callback\n", 313 | "\n", 314 | "class ColumnImportanceSampler(Callback):\n", 315 | "    \"\"\"\n", 316 | "    This class implements a sampling strategy \n", 317 | "    that samples columns in proportion to their importance at each step\n", 318 | "    \n", 319 | "    We should implement the __call__ method to use it as a sampler\n", 320 | "    \"\"\"\n", 321 | "    def __init__(self, rate=0.5, smooth=0.1, \n", 322 | "                 update_freq=10, inverse=False):\n", 323 | "        \"\"\"\n", 324 | "        \n", 325 | "        Args:\n", 326 | "            rate: float, sampling rate\n", 327 | "            smooth: float, smoothing parameter\n", 328 | "            update_freq: int, importance update frequency\n", 329 | "            inverse: bool, invert the sampling probabilities\n", 330 | "\n", 331 | "        Returns:\n", 332 | "\n", 333 | "        \"\"\"\n", 334 | "        # Custom column sampler based on feature importance\n", 335 | "        self.rate = rate\n", 336 | "        self.smooth = smooth\n", 337 | "        self.update_freq = update_freq\n", 338 | "        self.inverse = inverse\n", 339 | "    \n", 340 | "    def before_iteration(self, build_info):\n", 341 | "        \"\"\"\n", 342 | "        Define what should be done before each iteration\n", 343 | "        \"\"\"\n", 344 | "        # Update feature importance\n", 345 | "        num_iter = build_info['num_iter']\n", 346 | "        \n", 347 | "        if (num_iter % self.update_freq) == 0:\n", 348 | "            # update probabilities with the actual importance\n", 349 | "            p = build_info['model'].get_feature_importance() + 1e-3\n", 350 | "            p = cp.asarray(p) / (p.sum())\n", 351 | "            # inverse if needed\n", 352 | "            if self.inverse:\n", 353 | "                p = 1 - p\n", 354 | "                p = p / p.sum()\n", 355 | "            # apply smoothing\n", 356 | "            self.p = p * (1 - self.smooth) + cp.ones_like(p) * self.smooth / p.shape[0]\n", 357 | "    \n", 358 | "    def __call__(self):\n", 359 | "        \"\"\"\n", 360 | "        The method should return the array of indices that will be used\n", 361 | "        to grow the tree at the current step\n", 362 | "        \"\"\"\n", 363 | "        # Sample columns\n", 364 | "        n = self.p.shape[0]\n", 365 | "        index = 
cp.random.choice(cp.arange(n, dtype=cp.uint64), \n", 366 | " size=int(self.rate * n), p=self.p)\n", 367 | " \n", 368 | " return index" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 7, 374 | "metadata": {}, 375 | "outputs": [ 376 | { 377 | "name": "stdout", 378 | "output_type": "stream", 379 | "text": [ 380 | "[14:33:57] Stdout logging level is INFO.\n", 381 | "[14:33:57] GDBT train starts. Max iter 1000, early stopping rounds 100\n", 382 | "[14:33:57] Iter 0; Sample 0, score = 0.24644921389665553; \n", 383 | "[14:33:59] Iter 100; Sample 0, score = 0.17590711477798346; \n", 384 | "[14:34:00] Iter 200; Sample 0, score = 0.13484779001390923; \n", 385 | "[14:34:02] Iter 300; Sample 0, score = 0.10826939489014992; \n", 386 | "[14:34:03] Iter 400; Sample 0, score = 0.08943576705705947; \n", 387 | "[14:34:05] Iter 500; Sample 0, score = 0.0753772653073726; \n", 388 | "[14:34:07] Iter 600; Sample 0, score = 0.06446689810058637; \n", 389 | "[14:34:08] Iter 700; Sample 0, score = 0.05593631183289121; \n", 390 | "[14:34:10] Iter 800; Sample 0, score = 0.04973719737069171; \n", 391 | "[14:34:12] Iter 900; Sample 0, score = 0.045194617065396514; \n", 392 | "[14:34:13] Iter 999; Sample 0, score = 0.041950662857688406; \n" 393 | ] 394 | }, 395 | { 396 | "data": { 397 | "text/plain": [ 398 | "" 399 | ] 400 | }, 401 | "execution_count": 7, 402 | "metadata": {}, 403 | "output_type": "execute_result" 404 | } 405 | ], 406 | "source": [ 407 | "# create model with new sampler \n", 408 | "# if we pass new sampler to the colsample argument it will used instead of default\n", 409 | "# it will also be added to the callback pipeline automatically\n", 410 | "# you should not pass samplers to the callbacks argument\n", 411 | "\n", 412 | "model = GradientBoosting(CustomMSLELoss(), CustomRMSLEMetric(), \n", 413 | " colsample=ColumnImportanceSampler(0.5), \n", 414 | " lr=0.01, verbose=100, ntrees=1000 )\n", 415 | "\n", 416 | "model.fit(X, y, eval_sets=[{'X': X_test, 'y': y_test},])" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": 8, 422 | "metadata": {}, 423 | "outputs": [ 424 | { 425 | "data": { 426 | "text/plain": [ 427 | "(50000, 10)" 428 | ] 429 | }, 430 | "execution_count": 8, 431 | "metadata": {}, 432 | "output_type": "execute_result" 433 | } 434 | ], 435 | "source": [ 436 | "model.predict(X_test).shape" 437 | ] 438 | } 439 | ], 440 | "metadata": { 441 | "kernelspec": { 442 | "display_name": "rapids-22.06", 443 | "language": "python", 444 | "name": "rapids-22.06" 445 | }, 446 | "language_info": { 447 | "codemirror_mode": { 448 | "name": "ipython", 449 | "version": 3 450 | }, 451 | "file_extension": ".py", 452 | "mimetype": "text/x-python", 453 | "name": "python", 454 | "nbconvert_exporter": "python", 455 | "pygments_lexer": "ipython3", 456 | "version": "3.9.13" 457 | } 458 | }, 459 | "nbformat": 4, 460 | "nbformat_minor": 2 461 | } 462 | -------------------------------------------------------------------------------- /tutorials/Tutorial_4_Handle_null_targets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## This tutorial shows how to handle NaN targets in multioutput tasks" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Imports" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | 
"import os\n", 24 | "# Optional: set the device to run\n", 25 | "os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\"\n", 26 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"1\"\n", 27 | "\n", 28 | "os.makedirs('../data', exist_ok=True)\n", 29 | "import numpy as np\n", 30 | "import joblib\n", 31 | "from sklearn.datasets import make_regression\n", 32 | "\n", 33 | "# simple case - just one class is used\n", 34 | "from py_boost import GradientBoosting\n", 35 | "from py_boost.multioutput.sketching import *" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "### Generate dummy multilabel task with NaN values in target\n", 43 | "\n", 44 | "Some times it happends that some target values in multioutput task are missing. For example, you are solving multilabel task and some labels are unknown for some of the rows, so acually your target could be one of 0/1/NaN. Normaly you can not using ML algorithms directly in that case, so you can do one of the following:\n", 45 | "\n", 46 | "- Drop NaN rows, but that case you are going to miss some part of the data\n", 47 | "- Train binary models separately, but your model will be more complex and probably overfitted\n", 48 | "- Fill NaNs with 0 or 1, so your labeling will become wrong\n", 49 | "- Use Neural Networks with masked loss function\n", 50 | "\n", 51 | "In Py-Boost you can write the loss wrapper to handle such scenario and train your model directly on known labels ignoring NaNs, and here is shown how.\n", 52 | "\n", 53 | "We will create it as the regression task and then thresholding the target. And then add some random NaNs" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 2, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "name": "stdout", 63 | "output_type": "stream", 64 | "text": [ 65 | "CPU times: user 2.33 s, sys: 1.66 s, total: 3.99 s\n", 66 | "Wall time: 876 ms\n" 67 | ] 68 | } 69 | ], 70 | "source": [ 71 | "%%time\n", 72 | "X, y = make_regression(150000, 100, n_targets=10, random_state=42)\n", 73 | "# binarize\n", 74 | "y = (y > y.mean(axis=0)).astype(np.float32)\n", 75 | "# add some NaNs\n", 76 | "y[np.random.rand(150000, 10) > 0.5] = np.nan\n", 77 | "\n", 78 | "\n", 79 | "X_test, y_test = X[:50000], y[:50000]\n", 80 | "X, y = X[-50000:], y[-50000:]" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "### NaN loss and metric wrappers\n", 88 | "\n", 89 | "Here it is shown how to write loss wrapper ignoring NaNs" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 3, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "import cupy as cp\n", 99 | "from py_boost.gpu.losses import BCELoss\n", 100 | "\n", 101 | "class BCEWithNaNLoss(BCELoss):\n", 102 | " \n", 103 | " def base_score(self, y_true):\n", 104 | " # Replace .mean with nanmean function to calc base score\n", 105 | " means = cp.clip(cp.nanmean(y_true, axis=0), self.clip_value, 1 - self.clip_value)\n", 106 | " return cp.log(means / (1 - means))\n", 107 | " \n", 108 | " def get_grad_hess(self, y_true, y_pred):\n", 109 | " # first, get nan mask for y_true\n", 110 | " mask = cp.isnan(y_true)\n", 111 | " # then, compute loss with any values at nan places just to prevent the exception\n", 112 | " grad, hess = super().get_grad_hess(cp.where(mask, 0, y_true), y_pred)\n", 113 | " # invert mask\n", 114 | " mask = (~mask).astype(cp.float32)\n", 115 | " # multiply grad and hess on inverted mask\n", 116 | " # now grad and hess eq. 
0 on NaN points\n", 117 | " # that actually means that prediction on that place should not be updated\n", 118 | " grad = grad * mask\n", 119 | " hess = hess * mask\n", 120 | " \n", 121 | " return grad, hess\n" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "And here is column-wise roc-auc metric ignoring NaNs" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 4, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "from py_boost.gpu.losses.metrics import Metric, auc\n", 138 | "\n", 139 | "class NaNAucMetric(Metric):\n", 140 | " \n", 141 | " def __call__(self, y_true, y_pred, sample_weight=None):\n", 142 | " \n", 143 | " aucs = []\n", 144 | " mask = ~cp.isnan(y_true)\n", 145 | " \n", 146 | " for i in range(y_true.shape[1]):\n", 147 | " m = mask[:, i]\n", 148 | " w = None if sample_weight is None else sample_weight[:, 0][m]\n", 149 | " aucs.append(\n", 150 | " auc(y_true[:, i][m], y_pred[:, i][m], w)\n", 151 | " )\n", 152 | " \n", 153 | " return np.mean(aucs)\n", 154 | " \n", 155 | " def compare(self, v0 ,v1):\n", 156 | "\n", 157 | " return v0 > v1 \n", 158 | " \n", 159 | " \n", 160 | " " 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 5, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "[08:31:38] Stdout logging level is INFO.\n", 173 | "[08:31:38] GDBT train starts. Max iter 1000, early stopping rounds 200\n", 174 | "[08:31:39] Iter 0; Sample 0, score = 0.7906884535541213; \n", 175 | "[08:31:41] Iter 100; Sample 0, score = 0.9687261163054176; \n", 176 | "[08:31:44] Iter 200; Sample 0, score = 0.9785187659166686; \n", 177 | "[08:31:46] Iter 300; Sample 0, score = 0.9844858052685057; \n", 178 | "[08:31:49] Iter 400; Sample 0, score = 0.9883780152591723; \n", 179 | "[08:31:51] Iter 500; Sample 0, score = 0.9908004122540589; \n", 180 | "[08:31:54] Iter 600; Sample 0, score = 0.9923353340683694; \n", 181 | "[08:31:57] Iter 700; Sample 0, score = 0.9935137491384962; \n", 182 | "[08:31:59] Iter 800; Sample 0, score = 0.9943018456130359; \n", 183 | "[08:32:02] Iter 900; Sample 0, score = 0.9949417958344802; \n", 184 | "[08:32:04] Iter 999; Sample 0, score = 0.9954331107999328; \n", 185 | "CPU times: user 32.1 s, sys: 1.59 s, total: 33.7 s\n", 186 | "Wall time: 31.9 s\n" 187 | ] 188 | }, 189 | { 190 | "data": { 191 | "text/plain": [ 192 | "" 193 | ] 194 | }, 195 | "execution_count": 5, 196 | "metadata": {}, 197 | "output_type": "execute_result" 198 | } 199 | ], 200 | "source": [ 201 | "%%time\n", 202 | "model = GradientBoosting(BCEWithNaNLoss(), NaNAucMetric(), lr=0.01,\n", 203 | " verbose=100, ntrees=1000, es=200, multioutput_sketch=RandomProjectionSketch(1))\n", 204 | "\n", 205 | "model.fit(X, y, eval_sets=[{'X': X_test, 'y': y_test},])" 206 | ] 207 | } 208 | ], 209 | "metadata": { 210 | "kernelspec": { 211 | "display_name": "rapids-22.06", 212 | "language": "python", 213 | "name": "rapids-22.06" 214 | }, 215 | "language_info": { 216 | "codemirror_mode": { 217 | "name": "ipython", 218 | "version": 3 219 | }, 220 | "file_extension": ".py", 221 | "mimetype": "text/x-python", 222 | "name": "python", 223 | "nbconvert_exporter": "python", 224 | "pygments_lexer": "ipython3", 225 | "version": "3.9.13" 226 | } 227 | }, 228 | "nbformat": 4, 229 | "nbformat_minor": 2 230 | } 231 | -------------------------------------------------------------------------------- /tutorials/Tutorial_5_ONNX_inference.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "e75f5c6d", 6 | "metadata": {}, 7 | "source": [ 8 | "## This tutorial shows how to convert your model to ONNX and use for CPU inference" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "8f07b0d0", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import os\n", 19 | "# Optional: set the device to run\n", 20 | "os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\"\n", 21 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n", 22 | "\n", 23 | "os.makedirs('../data', exist_ok=True)\n", 24 | "\n", 25 | "import numpy as np\n", 26 | "import joblib\n", 27 | "import onnxruntime\n", 28 | "\n", 29 | "from sklearn.datasets import make_regression\n", 30 | "\n", 31 | "from py_boost import GradientBoosting\n", 32 | "from py_boost import pb_to_onnx, ONNXPredictor" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "id": "3aa67d64", 38 | "metadata": {}, 39 | "source": [ 40 | "### Generate dummy multilabel task and train the model" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 2, 46 | "id": "12c1d5ec", 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "[15:37:33] Stdout logging level is INFO.\n", 54 | "[15:37:33] GDBT train starts. Max iter 100, early stopping rounds 200\n", 55 | "[15:37:34] Iter 0; \n", 56 | "[15:37:37] Iter 99; \n", 57 | "CPU times: user 15.1 s, sys: 1.85 s, total: 16.9 s\n", 58 | "Wall time: 10.1 s\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "%%time\n", 64 | "X, y = make_regression(150000, 100, n_targets=5, random_state=42)\n", 65 | "# binarize\n", 66 | "y = (y > y.mean(axis=0)).astype(np.float32)\n", 67 | "\n", 68 | "model = GradientBoosting(\n", 69 | " 'bce', lr=0.01, verbose=100, \n", 70 | " ntrees=100, es=200, \n", 71 | ")\n", 72 | "model.fit(X, y)\n", 73 | "pp = model.predict(X)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "id": "49202ae7", 79 | "metadata": {}, 80 | "source": [ 81 | "### Convert the model to ONNX\n", 82 | "\n", 83 | "The simpliest way to convert is using `pb_to_onnx` function. 
Just pass the `py-boost` model and path to store parsed model" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 3, 89 | "id": "21e10069", 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "name": "stderr", 94 | "output_type": "stream", 95 | "text": [ 96 | "100%|██████████| 100/100 [00:00<00:00, 1723.04it/s]\n" 97 | ] 98 | } 99 | ], 100 | "source": [ 101 | "pb_to_onnx(model, '../data/pb_model.onnx')" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "id": "8d385bbc", 107 | "metadata": {}, 108 | "source": [ 109 | "Once the parsing is completed, you can run `onnxruntime` session for inference" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 4, 115 | "id": "7d7d4a0c", 116 | "metadata": {}, 117 | "outputs": [ 118 | { 119 | "name": "stdout", 120 | "output_type": "stream", 121 | "text": [ 122 | "CPU times: user 5.59 s, sys: 131 ms, total: 5.72 s\n", 123 | "Wall time: 395 ms\n" 124 | ] 125 | }, 126 | { 127 | "data": { 128 | "text/plain": [ 129 | "array([[0.6264308 , 0.41568166, 0.5388822 , 0.4261355 , 0.57804173],\n", 130 | " [0.59586126, 0.42369062, 0.56585 , 0.57584757, 0.5392887 ],\n", 131 | " [0.72726965, 0.67056704, 0.49255225, 0.6711969 , 0.635281 ],\n", 132 | " ...,\n", 133 | " [0.5112887 , 0.38028964, 0.4761739 , 0.52265 , 0.4513791 ],\n", 134 | " [0.67362005, 0.54282206, 0.62851644, 0.6090929 , 0.7003519 ],\n", 135 | " [0.56341565, 0.52830017, 0.41594115, 0.43341845, 0.42639387]],\n", 136 | " dtype=float32)" 137 | ] 138 | }, 139 | "execution_count": 4, 140 | "metadata": {}, 141 | "output_type": "execute_result" 142 | } 143 | ], 144 | "source": [ 145 | "%%time\n", 146 | "\n", 147 | "# start session\n", 148 | "sess = onnxruntime.InferenceSession(\n", 149 | " '../data/pb_model.onnx', \n", 150 | " providers=[\"CPUExecutionProvider\"]\n", 151 | ")\n", 152 | "\n", 153 | "# run inference\n", 154 | "preds = sess.run(['Y'], {'X': X.astype(np.float32, copy=False)})[0]\n", 155 | "preds = 1 / (1 + np.exp(-preds))\n", 156 | "preds" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 5, 162 | "id": "50d94691", 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "data": { 167 | "text/plain": [ 168 | "2.3841858e-07" 169 | ] 170 | }, 171 | "execution_count": 5, 172 | "metadata": {}, 173 | "output_type": "execute_result" 174 | } 175 | ], 176 | "source": [ 177 | "np.abs(preds - pp).max()" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "id": "37a4c052", 183 | "metadata": {}, 184 | "source": [ 185 | "***Note*** : by default, parser only collect the trees and base score info. So, it knows nothing about the postprocessing function, for example `sigmoid` in this case. That's why we apply sigmoid after inference part. But we can pass one of built-in `ONNX` post transforms: 'NONE', 'SOFTMAX', 'LOGISTIC', 'SOFTMAX_ZERO', or 'PROBIT' to avoid this step. 
Probably it is going to be more efficient:" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 6, 191 | "id": "9d32635e", 192 | "metadata": {}, 193 | "outputs": [ 194 | { 195 | "name": "stderr", 196 | "output_type": "stream", 197 | "text": [ 198 | "100%|██████████| 100/100 [00:00<00:00, 1670.84it/s]\n" 199 | ] 200 | }, 201 | { 202 | "name": "stdout", 203 | "output_type": "stream", 204 | "text": [ 205 | "CPU times: user 5.58 s, sys: 178 ms, total: 5.76 s\n", 206 | "Wall time: 583 ms\n" 207 | ] 208 | }, 209 | { 210 | "data": { 211 | "text/plain": [ 212 | "array([[0.62643087, 0.41568172, 0.5388822 , 0.42613554, 0.57804173],\n", 213 | " [0.5958613 , 0.42369062, 0.56584996, 0.57584757, 0.5392887 ],\n", 214 | " [0.72726965, 0.67056704, 0.49255228, 0.6711969 , 0.6352811 ],\n", 215 | " ...,\n", 216 | " [0.5112887 , 0.3802896 , 0.47617394, 0.52265 , 0.45137918],\n", 217 | " [0.67362005, 0.54282206, 0.6285165 , 0.6090929 , 0.7003519 ],\n", 218 | " [0.56341565, 0.5283001 , 0.41594112, 0.43341845, 0.42639393]],\n", 219 | " dtype=float32)" 220 | ] 221 | }, 222 | "execution_count": 6, 223 | "metadata": {}, 224 | "output_type": "execute_result" 225 | } 226 | ], 227 | "source": [ 228 | "%%time\n", 229 | "pb_to_onnx(model, '../data/pb_model.onnx', post_transform='LOGISTIC') # pass built-in post transform\n", 230 | "\n", 231 | "# start session\n", 232 | "sess = onnxruntime.InferenceSession(\n", 233 | " '../data/pb_model.onnx', \n", 234 | " providers=[\"CPUExecutionProvider\"]\n", 235 | ")\n", 236 | "\n", 237 | "# run inference\n", 238 | "preds = sess.run(['Y'], {'X': X.astype(np.float32, copy=False)})[0]\n", 239 | "preds" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 7, 245 | "id": "6efcaaa3", 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "data": { 250 | "text/plain": [ 251 | "2.3841858e-07" 252 | ] 253 | }, 254 | "execution_count": 7, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | "source": [ 260 | "np.abs(preds - pp).max()" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "id": "bdcc1486", 266 | "metadata": {}, 267 | "source": [ 268 | "***Filter outputs*** . Another option is to convert just a part of outputs to `ONNX`, for the case when we need only particular outputs for inference. 
For example, we want to keep only 0 and 2 outputs for inference and we don't want to compute the parts of model that doesn't affect the result:" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 8, 274 | "id": "080f1139", 275 | "metadata": {}, 276 | "outputs": [ 277 | { 278 | "name": "stderr", 279 | "output_type": "stream", 280 | "text": [ 281 | "100%|██████████| 100/100 [00:00<00:00, 2039.98it/s]\n" 282 | ] 283 | }, 284 | { 285 | "name": "stdout", 286 | "output_type": "stream", 287 | "text": [ 288 | "CPU times: user 5.31 s, sys: 178 ms, total: 5.48 s\n", 289 | "Wall time: 528 ms\n" 290 | ] 291 | }, 292 | { 293 | "data": { 294 | "text/plain": [ 295 | "array([[0.62643087, 0.5388822 ],\n", 296 | " [0.5958613 , 0.56584996],\n", 297 | " [0.72726965, 0.49255228],\n", 298 | " ...,\n", 299 | " [0.5112887 , 0.47617394],\n", 300 | " [0.67362005, 0.6285165 ],\n", 301 | " [0.56341565, 0.41594112]], dtype=float32)" 302 | ] 303 | }, 304 | "execution_count": 8, 305 | "metadata": {}, 306 | "output_type": "execute_result" 307 | } 308 | ], 309 | "source": [ 310 | "%%time\n", 311 | "pb_to_onnx(model, '../data/pb_model.onnx', fltr=[0, 2], post_transform='LOGISTIC') # pass array to filter outputs\n", 312 | "\n", 313 | "# start session\n", 314 | "sess = onnxruntime.InferenceSession(\n", 315 | " '../data/pb_model.onnx', \n", 316 | " providers=[\"CPUExecutionProvider\"]\n", 317 | ")\n", 318 | "\n", 319 | "# run inference\n", 320 | "preds = sess.run(['Y'], {'X': X.astype(np.float32, copy=False)})[0]\n", 321 | "preds" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 9, 327 | "id": "cbf704d7", 328 | "metadata": {}, 329 | "outputs": [ 330 | { 331 | "data": { 332 | "text/plain": [ 333 | "1.937151e-07" 334 | ] 335 | }, 336 | "execution_count": 9, 337 | "metadata": {}, 338 | "output_type": "execute_result" 339 | } 340 | ], 341 | "source": [ 342 | "np.abs(preds - pp[:, [0, 2]]).max()" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "id": "735bb0d8", 348 | "metadata": {}, 349 | "source": [ 350 | "### Built-in wrapper\n", 351 | "\n", 352 | "As an alternative you can use wrapper that hide all the manipulations with `ONNX` and let you just call fit and predict. 
You can build wrapper from model:" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 10, 358 | "id": "5e5b46e7", 359 | "metadata": {}, 360 | "outputs": [ 361 | { 362 | "name": "stderr", 363 | "output_type": "stream", 364 | "text": [ 365 | "100%|██████████| 100/100 [00:00<00:00, 1909.37it/s]\n" 366 | ] 367 | } 368 | ], 369 | "source": [ 370 | "onnx_predictor = ONNXPredictor(\n", 371 | " model, '../data/pb_model.onnx', \n", 372 | " fltr=[0, 2], \n", 373 | ")" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": 11, 379 | "id": "c506b652", 380 | "metadata": {}, 381 | "outputs": [ 382 | { 383 | "name": "stdout", 384 | "output_type": "stream", 385 | "text": [ 386 | "CPU times: user 4.71 s, sys: 156 ms, total: 4.86 s\n", 387 | "Wall time: 328 ms\n" 388 | ] 389 | }, 390 | { 391 | "data": { 392 | "text/plain": [ 393 | "array([[0.6264308 , 0.5388822 ],\n", 394 | " [0.59586126, 0.56585 ],\n", 395 | " [0.72726965, 0.49255225],\n", 396 | " ...,\n", 397 | " [0.5112887 , 0.4761739 ],\n", 398 | " [0.67362005, 0.62851644],\n", 399 | " [0.56341565, 0.41594115]], dtype=float32)" 400 | ] 401 | }, 402 | "execution_count": 11, 403 | "metadata": {}, 404 | "output_type": "execute_result" 405 | } 406 | ], 407 | "source": [ 408 | "%%time\n", 409 | "preds = onnx_predictor.predict(X)\n", 410 | "preds" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": 12, 416 | "id": "13f372d7", 417 | "metadata": {}, 418 | "outputs": [ 419 | { 420 | "data": { 421 | "text/plain": [ 422 | "1.7881393e-07" 423 | ] 424 | }, 425 | "execution_count": 12, 426 | "metadata": {}, 427 | "output_type": "execute_result" 428 | } 429 | ], 430 | "source": [ 431 | "np.abs(preds - pp[:, [0, 2]]).max()" 432 | ] 433 | }, 434 | { 435 | "cell_type": "markdown", 436 | "id": "2e6ddf9f", 437 | "metadata": {}, 438 | "source": [ 439 | "***Note*** : You can not save `ONNXPredictor` object, since `onnxruntime.InferenceSession` is not pickable. Instead, to use it in the other session, you can restore it from `ONNX` model file. 
But note that in this case you will lose the information about the postprocessing function if it was not provided as `post_transform` to `ONNXPredictor`.\n", 440 | "\n", 441 | "The first option is to provide the post_transform to `ONNXPredictor`:" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 13, 447 | "id": "3bb457f2", 448 | "metadata": {}, 449 | "outputs": [ 450 | { 451 | "name": "stderr", 452 | "output_type": "stream", 453 | "text": [ 454 | "100%|██████████| 100/100 [00:00<00:00, 2116.98it/s]\n" 455 | ] 456 | }, 457 | { 458 | "data": { 459 | "text/plain": [ 460 | "1.937151e-07" 461 | ] 462 | }, 463 | "execution_count": 13, 464 | "metadata": {}, 465 | "output_type": "execute_result" 466 | } 467 | ], 468 | "source": [ 469 | "# build the predictor and save the parsed model as ../data/pb_model.onnx\n", 470 | "onnx_predictor = ONNXPredictor(\n", 471 | "    model, '../data/pb_model.onnx', \n", 472 | "    fltr=[0, 2], \n", 473 | "    post_transform='LOGISTIC' # provide the ONNX post_transform manually\n", 474 | ")\n", 475 | "\n", 476 | "# create a new instance from ../data/pb_model.onnx\n", 477 | "onnx_predictor = ONNXPredictor.from_onnx('../data/pb_model.onnx')\n", 478 | "preds = onnx_predictor.predict(X)\n", 479 | "np.abs(preds - pp[:, [0, 2]]).max()" 480 | ] 481 | }, 482 | { 483 | "cell_type": "markdown", 484 | "id": "f1aef69c", 485 | "metadata": {}, 486 | "source": [ 487 | "The second is to provide the Python postprocessing function in the new session:" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": 14, 493 | "id": "88d0e7eb", 494 | "metadata": {}, 495 | "outputs": [ 496 | { 497 | "name": "stderr", 498 | "output_type": "stream", 499 | "text": [ 500 | "100%|██████████| 100/100 [00:00<00:00, 2232.89it/s]\n" 501 | ] 502 | }, 503 | { 504 | "data": { 505 | "text/plain": [ 506 | "1.7881393e-07" 507 | ] 508 | }, 509 | "execution_count": 14, 510 | "metadata": {}, 511 | "output_type": "execute_result" 512 | } 513 | ], 514 | "source": [ 515 | "# build the predictor and save the parsed model as ../data/pb_model.onnx\n", 516 | "onnx_predictor = ONNXPredictor(\n", 517 | "    model, '../data/pb_model.onnx', \n", 518 | "    fltr=[0, 2], \n", 519 | ")\n", 520 | "\n", 521 | "# create a new instance from ../data/pb_model.onnx\n", 522 | "onnx_predictor = ONNXPredictor.from_onnx(\n", 523 | "    '../data/pb_model.onnx', \n", 524 | "    postprocess_fn=lambda x: 1 / (1 + np.exp(-x)) # provide the py-boost postprocess_fn manually\n", 525 | ")\n", 526 | "preds = onnx_predictor.predict(X)\n", 527 | "np.abs(preds - pp[:, [0, 2]]).max()" 528 | ] 529 | } 530 | ], 531 | "metadata": { 532 | "kernelspec": { 533 | "display_name": "rapids-env", 534 | "language": "python", 535 | "name": "rapids-env" 536 | }, 537 | "language_info": { 538 | "codemirror_mode": { 539 | "name": "ipython", 540 | "version": 3 541 | }, 542 | "file_extension": ".py", 543 | "mimetype": "text/x-python", 544 | "name": "python", 545 | "nbconvert_exporter": "python", 546 | "pygments_lexer": "ipython3", 547 | "version": "3.10.14" 548 | } 549 | }, 550 | "nbformat": 4, 551 | "nbformat_minor": 5 552 | } 553 | --------------------------------------------------------------------------------