├── .gitignore ├── LICENSE ├── README.md ├── build_package.sh ├── py_boost ├── __init__.py ├── callbacks │ ├── __init__.py │ └── callback.py ├── cv │ ├── __init__.py │ ├── adaptive_es.py │ ├── base.py │ └── cluster_tree.py ├── gpu │ ├── __init__.py │ ├── base.py │ ├── boosting.py │ ├── losses │ │ ├── __init__.py │ │ ├── losses.py │ │ ├── metrics.py │ │ └── multiclass_metrics.py │ ├── serialization.py │ ├── sketch_boost.py │ ├── tree.py │ └── utils.py ├── multioutput │ ├── __init__.py │ ├── sketching.py │ └── target_splitter.py ├── quantization │ ├── __init__.py │ ├── base.py │ └── utils.py ├── sampling │ ├── __init__.py │ └── bagging.py └── utils │ ├── __init__.py │ ├── logging.py │ ├── onnx_wrapper.py │ └── tl_wrapper.py ├── pyproject.toml └── tutorials ├── Tutorial_1_Basics.ipynb ├── Tutorial_2_Advanced_multioutput.ipynb ├── Tutorial_3_Custom_features.ipynb ├── Tutorial_4_Handle_null_targets.ipynb └── Tutorial_5_ONNX_inference.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | multioutpout_benchmark/ 4 | tutorials/Experiment.ipynb 5 | data/ 6 | catboost_info/ 7 | py_boost_venv 8 | publish_package.sh 9 | reinstall.sh 10 | poetry.lock 11 | .idea 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | share/python-wheels/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg 36 | MANIFEST 37 | 38 | # PyInstaller 39 | # Usually these files are written by a python script from a template 40 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 41 | *.manifest 42 | *.spec 43 | 44 | # Installer logs 45 | pip-log.txt 46 | pip-delete-this-directory.txt 47 | 48 | # Unit test / coverage reports 49 | htmlcov/ 50 | .tox/ 51 | .nox/ 52 | .coverage 53 | .coverage.* 54 | .cache 55 | nosetests.xml 56 | coverage.xml 57 | *.cover 58 | *.py,cover 59 | .hypothesis/ 60 | .pytest_cache/ 61 | cover/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | db.sqlite3 71 | db.sqlite3-journal 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | .pybuilder/ 85 | target/ 86 | 87 | # Jupyter Notebook 88 | .ipynb_checkpoints 89 | 90 | # IPython 91 | profile_default/ 92 | ipython_config.py 93 | 94 | # pyenv 95 | # For a library or package, you might want to ignore these files since the code is 96 | # intended to run in multiple environments; otherwise, check them in: 97 | # .python-version 98 | 99 | # pipenv 100 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 101 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 102 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 103 | # install all needed dependencies. 104 | #Pipfile.lock 105 | 106 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 107 | __pypackages__/ 108 | 109 | # Celery stuff 110 | celerybeat-schedule 111 | celerybeat.pid 112 | 113 | # SageMath parsed files 114 | *.sage.py 115 | 116 | # Environments 117 | .env 118 | .venv 119 | env/ 120 | venv/ 121 | ENV/ 122 | env.bak/ 123 | venv.bak/ 124 | 125 | # Spyder project settings 126 | .spyderproject 127 | .spyproject 128 | 129 | # Rope project settings 130 | .ropeproject 131 | 132 | # mkdocs documentation 133 | /site 134 | 135 | # mypy 136 | .mypy_cache/ 137 | .dmypy.json 138 | dmypy.json 139 | 140 | # Pyre type checker 141 | .pyre/ 142 | 143 | # pytype static type analyzer 144 | .pytype/ 145 | 146 | # Cython debug symbols 147 | cython_debug/ 148 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2021 Vakhrusev Anton, Iosipoi Leonid 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Py-boost: a research tool for exploring GBDTs 2 | 3 | Modern gradient boosting toolkits are very complex and are written in low-level programming languages. 
As a result, 4 | 5 | * It is hard to customize them to suit one’s needs 6 | * New ideas and methods are not easy to implement 7 | * It is difficult to understand how they work 8 | 9 | Py-boost is a Python-based gradient boosting library that aims to overcome the aforementioned problems. 10 | 11 | **Authors**: [Anton Vakhrushev](https://kaggle.com/btbpanda), [Leonid Iosipoi](http://iosipoi.com/) 12 | , [Sergey Kupriyanov](https://www.linkedin.com/in/sergeykupriyanov). 13 | 14 | ## Py-boost Key Features 15 | 16 | **Simple**. Py-boost is a simplified gradient boosting library, but it supports all main features and hyperparameters 17 | available in other implementations. 18 | 19 | **Fast with GPU**. Although Py-boost is written in Python, it works only on GPU and uses Python GPU 20 | libraries such as `CuPy` and `Numba`. 21 | 22 | **Efficient inference**. Since v0.4 Py-Boost can perform efficient inference of tree ensembles on GPU. 23 | Moreover, once your model is trained on GPU, it can be converted to run inference on a CPU-only machine by 24 | converting it to the [treelite](https://treelite.readthedocs.io/) format with the built-in wrapper (limitation: the model should 25 | be trained with `target_splitter='Single'`, which is the default). 26 | 27 | **ONNX compatible**. Since v0.5 Py-Boost is compatible with the ONNX format, which allows more options for CPU inference and 28 | model deployment. 29 | 30 | **Easy to customize**. Py-boost can be easily customized even if one is not familiar with GPU programming (just replace 31 | np with cp). What can be customized? Almost everything, via custom callbacks: row/column sampling strategies, 32 | training control, losses/metrics, and the multioutput handling strategy. 33 | 34 | ## SketchBoost [paper](https://openreview.net/forum?id=WSxarC8t-T) 35 | 36 | **Multioutput training**. Current state-of-the-art boosting toolkits provide very limited support for multioutput training. 37 | And even when this option is available, training on tasks such as multiclass/multilabel classification and multitask 38 | regression is quite slow because the training complexity scales linearly with the number of outputs. To overcome 39 | these limitations we created the **SketchBoost** algorithm, which uses an approximate tree structure search. As we show 40 | in the [paper](https://openreview.net/forum?id=WSxarC8t-T), this strategy at least does not degrade performance and 41 | often improves accuracy. 42 | 43 | **SketchBoost**. You can try our sketching strategies by using the `SketchBoost` class, or you can implement your 44 | own and pass it to the `GradientBoosting` constructor as the `multioutput_sketch` parameter. For the details please 45 | see [Tutorial_2_Advanced_multioutput](https://github.com/AILab-MLTools/Py-Boost/blob/master/tutorials/Tutorial_2_Advanced_multioutput.ipynb) 46 | 47 | ## Installation 48 | 49 | Before installing py-boost via pip you should have CuPy installed. You can use: 50 | 51 | `pip install -U cupy-cuda110 py-boost` 52 | 53 | **Note**: replace the suffix with your CUDA version! For the details see [this guide](https://docs.cupy.dev/en/stable/install.html) 54 | 55 | ## Quick tour 56 | 57 | Py-boost is easy to use since it has a scikit-learn-like interface.
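A minimal sketch of a typical workflow is shown below. It assumes a CUDA-capable environment with CuPy installed, that `X`, `y`, `X_test`, `y_test` are NumPy arrays you provide, and that the `'mse'` loss alias is used; the tutorials listed next contain complete, tested examples.

```python
from py_boost import GradientBoosting

# hypothetical data: X, y, X_test, y_test are numpy arrays
model = GradientBoosting('mse', ntrees=1000, lr=0.01, es=200, verbose=100)

# eval_sets is a list of dicts with 'X' and 'y' keys, used for early stopping
model.fit(X, y, eval_sets=[{'X': X_test, 'y': y_test}])

pred = model.predict(X_test)
```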
For usage example please see: 58 | 59 | * [Tutorial_1_Basics](https://github.com/sb-ai-lab/Py-Boost/blob/master/tutorials/Tutorial_1_Basics.ipynb) for simple 60 | usage examples 61 | * [Tutorial_2_Advanced_multioutput](https://github.com/sb-ai-lab/Py-Boost/blob/master/tutorials/Tutorial_2_Advanced_multioutput.ipynb) 62 | for advanced multioutput features 63 | * [Tutorial_3_Custom_features](https://github.com/sb-ai-lab/Py-Boost/blob/master/tutorials/Tutorial_3_Custom_features.ipynb) 64 | for examples of customization 65 | * [Tutorial_4_Handle_null_targets](https://github.com/sb-ai-lab/Py-Boost/blob/master/tutorials/Tutorial_4_Handle_null_targets.ipynb) 66 | for the case when multioutput target contains NaNs 67 | * [Tutorial_5_ONNX_inference](https://github.com/sb-ai-lab/Py-Boost/blob/master/tutorials/Tutorial_5_ONNX_inference.ipynb) 68 | examples of parsing and inference on CPU with ONNX 69 | 70 | More examples are coming soon 71 | -------------------------------------------------------------------------------- /build_package.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm -rf py_boost_venv 4 | python -m venv py_boost_venv 5 | source ./py_boost_venv/bin/activate 6 | 7 | pip install -U pip 8 | pip install -U poetry 9 | pip install -U cupy-cuda110 10 | 11 | poetry lock 12 | poetry install 13 | poetry build 14 | -------------------------------------------------------------------------------- /py_boost/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import subprocess 3 | import sys 4 | import warnings 5 | 6 | _root_logger = logging.getLogger() 7 | _logger = logging.getLogger(__name__) 8 | _logger.setLevel(logging.WARNING) 9 | 10 | # if root logger has handlers, propagate messages up and let root logger process them 11 | if not _root_logger.hasHandlers(): 12 | _logger.addHandler(logging.StreamHandler(sys.stdout)) 13 | _logger.propagate = False 14 | 15 | try: 16 | subprocess.check_output('nvidia-smi') 17 | CUDA_FOUND = True 18 | except Exception: 19 | CUDA_FOUND = False 20 | 21 | from .utils.tl_wrapper import TLPredictor, TLCompiledPredictor 22 | from .utils.onnx_wrapper import pb_to_onnx, ONNXPredictor 23 | 24 | if CUDA_FOUND: 25 | from .gpu.boosting import GradientBoosting 26 | from .gpu.sketch_boost import SketchBoost 27 | from .gpu.losses.losses import Loss 28 | from .gpu.losses.metrics import Metric 29 | from .callbacks.callback import Callback 30 | 31 | __all__ = [ 32 | 33 | 'GradientBoosting', 34 | 'SketchBoost', 35 | 'Callback', 36 | 'Loss', 37 | 'Metric', 38 | 'callbacks', 39 | 'gpu', 40 | 'multioutput', 41 | 'sampling', 42 | 'utils', 43 | 'pb_to_onnx', 44 | 45 | ] 46 | 47 | else: 48 | warnings.warn('No Nvidia GPU detected! 
Only treelite inference on CPU is available') 49 | __all__ = [] 50 | 51 | __all__.extend([ 52 | 53 | 'TLPredictor', 54 | 'TLCompiledPredictor', 55 | 'ONNXPredictor' 56 | 57 | ]) 58 | 59 | try: 60 | import importlib.metadata as importlib_metadata 61 | except ModuleNotFoundError: 62 | import importlib_metadata 63 | 64 | __version__ = importlib_metadata.version(__name__) 65 | -------------------------------------------------------------------------------- /py_boost/callbacks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sb-ai-lab/Py-Boost/1bb23905a90793dbf8bb6e50b9bc4a26b1f556c4/py_boost/callbacks/__init__.py -------------------------------------------------------------------------------- /py_boost/callbacks/callback.py: -------------------------------------------------------------------------------- 1 | """Default callbacks""" 2 | import logging 3 | from ..utils.logging import verbosity_to_loglevel, set_stdout_level 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class Callback: 9 | """Abstract class for callbacks. All Callback methods define the actions that should be performed between training stages. 10 | There are 4 methods that can be redefined: 11 | - before_train - outputs None 12 | - before_iteration - outputs None 13 | - after_train - outputs None 14 | - after_iteration - outputs bool - whether training should be stopped after the iteration 15 | 16 | Methods receive build_info - the state dict that can be accessed and modified 17 | 18 | Basic build info structure: 19 | 20 | build_info = { 21 | 'data': { 22 | 'train': { 23 | 'features_cpu': np.ndarray - raw feature matrix, 24 | 'features_gpu': cp.ndarray - uint8 quantized feature matrix on GPU, 25 | 'target': y - cp.ndarray - processed target variable on GPU, 26 | 'sample_weight': cp.ndarray - processed sample_weight on GPU or None, 27 | 'ensemble': cp.ndarray - current model prediction (with no postprocessing, ex.
before sigmoid for logloss) on GPU, 29 | 'grad': cp.ndarray of gradients on GPU, before first iteration - None, 30 | 'hess': cp.ndarray of hessians on GPU, before first iteration - None, 31 | 32 | 'last_tree': { 33 | 'nodes': cp.ndarray - nodes indices of the last trained tree, 34 | 'preds': cp.ndarray - predictions of the last trained tree, 35 | } 36 | 37 | }, 38 | 'valid': { 39 | 'features_cpu' the same as train, but list, each element corresponds each validation sample, 40 | 'features_gpu': ..., 41 | 'target': ..., 42 | 'sample_weight': ..., 43 | 'ensemble': ..., 44 | 45 | 'last_tree': { 46 | 'nodes': ..., 47 | 'preds': ..., 48 | } 49 | 50 | } 51 | }, 52 | 'borders': list of np.ndarray - list or quantization borders, 53 | 'model': GradientBoosting - model, that is trained, 54 | 'mempool': cp.cuda.MemoryPool - memory pool used for train, could be used to clean memory to prevent OOM, 55 | 'builder': DepthwiseTreeBuilder - the instance of tree builder, contains training params, 56 | 57 | 'num_iter': int, current number of iteration, 58 | 'iter_scores': list of float - list of metric values for all validation sets for the last iteration, 59 | } 60 | 61 | """ 62 | 63 | def before_train(self, build_info): 64 | """Actions to be made before train starts 65 | 66 | Args: 67 | build_info: dict 68 | 69 | Returns: 70 | 71 | """ 72 | return 73 | 74 | def before_iteration(self, build_info): 75 | """Actions to be made before each iteration starts 76 | 77 | Args: 78 | build_info: dict 79 | 80 | Returns: 81 | 82 | """ 83 | return 84 | 85 | def after_iteration(self, build_info): 86 | """Actions to be made after each iteration finishes 87 | 88 | Args: 89 | build_info: dict 90 | 91 | Returns: 92 | bool, if train process should be terminated 93 | """ 94 | return False 95 | 96 | def after_train(self, build_info): 97 | """Actions to be made before train finishes 98 | 99 | Args: 100 | build_info: 101 | 102 | Returns: 103 | 104 | """ 105 | return 106 | 107 | 108 | class CallbackPipeline: 109 | """Sequential pipeline of callbacks""" 110 | 111 | def __init__(self, *callbacks): 112 | self.callbacks = callbacks 113 | 114 | def after_iteration(self, build_info): 115 | stop = False 116 | 117 | for callback in self.callbacks: 118 | stop = stop or callback.after_iteration(build_info) 119 | 120 | return stop 121 | 122 | def after_train(self, build_info): 123 | 124 | for callback in self.callbacks: 125 | callback.after_train(build_info) 126 | 127 | def before_train(self, build_info): 128 | 129 | for callback in self.callbacks: 130 | callback.before_train(build_info) 131 | 132 | def before_iteration(self, build_info): 133 | 134 | for callback in self.callbacks: 135 | callback.before_iteration(build_info) 136 | 137 | 138 | class EvalHistory(Callback): 139 | """Callback for history evaluation""" 140 | 141 | def __init__(self, history, verbose=0): 142 | 143 | self.history = history 144 | self.verbose = verbose 145 | self.metric = None 146 | self.postprocess_fn = None 147 | self.ntrees = None 148 | 149 | def before_train(self, build_info): 150 | """Init params and logger 151 | 152 | Args: 153 | build_info: dict 154 | 155 | Returns: 156 | 157 | """ 158 | self.metric = build_info['model'].metric 159 | self.postprocess_fn = build_info['model'].loss.postprocess_output 160 | self.ntrees = build_info['model'].ntrees 161 | 162 | self.set_verbosity_level(int(self.verbose > 0) * 1) 163 | 164 | msg = 'GDBT train starts. 
Max iter {0}, early stopping rounds {1}'.format( 165 | build_info['model'].ntrees, build_info['model'].es) 166 | 167 | logger.info(msg) 168 | 169 | def after_iteration(self, build_info): 170 | """Save the iteration results and output log 171 | 172 | Args: 173 | build_info: dict 174 | 175 | Returns: 176 | 177 | """ 178 | valid = build_info['data']['valid'] 179 | y_val, val_ens, w_val = valid['target'], valid['ensemble'], valid['sample_weight'] 180 | 181 | num_iter = build_info['num_iter'] 182 | 183 | msg = 'Iter {0}; '.format(num_iter) 184 | 185 | if self.metric is None: 186 | return 187 | 188 | alias = self.metric.alias 189 | 190 | if len(y_val) > 0: 191 | val_metric = [float(self.metric(y, self.postprocess_fn(x), w)) for (y, x, w) in zip(y_val, val_ens, w_val)] 192 | self.history.append(val_metric) 193 | 194 | msg += ' '.join(['Sample {0}, {1} = {2}; '.format(n, alias, x) for (n, x) in enumerate(val_metric)]) 195 | 196 | build_info['iter_score'] = val_metric 197 | 198 | if ((num_iter % self.verbose) == 0) or (num_iter == (self.ntrees - 1)): 199 | logger.info(msg) 200 | 201 | @staticmethod 202 | def set_verbosity_level(verbose): 203 | """Verbosity level setter. 204 | 205 | Args: 206 | verbose: Controls the verbosity: the higher, the more messages. 207 | <1 : messages are not displayed; 208 | >=1 : the computation process for layers is displayed; 209 | >=2 : the information about folds processing is also displayed; 210 | >=3 : the hyperparameters optimization process is also displayed; 211 | >=4 : the training process for every algorithm is displayed; 212 | """ 213 | level = verbosity_to_loglevel(verbose) 214 | set_stdout_level(level) 215 | 216 | logger.info(f"Stdout logging level is {logging._levelToName[level]}.") 217 | 218 | 219 | class EarlyStopping(Callback): 220 | """Callback for early stopping""" 221 | 222 | def __init__(self, num_rounds=100): 223 | 224 | self.num_rounds = num_rounds 225 | self.best_round = 1 226 | self.no_upd_rounds = 0 227 | self.best_score = None 228 | self.metric = None 229 | 230 | def before_train(self, build_info): 231 | """Init params 232 | 233 | Args: 234 | build_info: dict 235 | 236 | Returns: 237 | 238 | """ 239 | self.metric = build_info['model'].metric 240 | 241 | def after_iteration(self, build_info): 242 | """Check early stopping condition and update the state 243 | 244 | Args: 245 | build_info: dict 246 | 247 | Returns: 248 | bool, if early stopping condition was met 249 | """ 250 | if ('iter_score' not in build_info) or (self.num_rounds == 0): 251 | return False 252 | 253 | num_iter = build_info['num_iter'] 254 | # if multiple valid sets passed - use the last one 255 | score = build_info['iter_score'][-1] 256 | 257 | if num_iter == 0: 258 | self.best_score = score 259 | return False 260 | 261 | if self.metric.compare(score, self.best_score): 262 | self.best_score = score 263 | self.best_round = num_iter + 1 264 | self.no_upd_rounds = 0 265 | return False 266 | 267 | self.no_upd_rounds += 1 268 | 269 | stop = self.no_upd_rounds >= self.num_rounds 270 | 271 | if stop: 272 | msg = 'Early stopping at iter {0}, best iter {1}, best_score {2}'.format( 273 | num_iter + 1, self.best_round, self.best_score) 274 | logger.info(msg) 275 | 276 | return stop 277 | 278 | def after_train(self, build_info): 279 | """Prune the model to the best iteration 280 | 281 | Args: 282 | build_info: dict 283 | 284 | Returns: 285 | 286 | """ 287 | if self.best_score is not None: 288 | model = build_info['model'] 289 | model.models = model.models[:self.best_round] 290 | 
model.best_round = self.best_round 291 | -------------------------------------------------------------------------------- /py_boost/cv/__init__.py: -------------------------------------------------------------------------------- 1 | """Tools for cross validation""" 2 | 3 | from .base import CrossValidation 4 | from .adaptive_es import AdaptiveESCV 5 | from .cluster_tree import ClusterCandidates 6 | 7 | __all__ = [ 8 | 9 | 'CrossValidation', 10 | 'AdaptiveESCV', 11 | 'ClusterCandidates' 12 | 13 | ] 14 | -------------------------------------------------------------------------------- /py_boost/cv/adaptive_es.py: -------------------------------------------------------------------------------- 1 | """Adaptive early stopping""" 2 | 3 | import numpy as np 4 | try: 5 | import cupy as cp 6 | except Exception: 7 | pass 8 | from copy import deepcopy 9 | from numba import njit 10 | 11 | from ..gpu.losses import MSELoss, CrossEntropyLoss, BCELoss, loss_alias 12 | from ..gpu.utils import validate_input 13 | 14 | from .base import CrossValidation 15 | 16 | 17 | def check_input(y_true, sample_weight): 18 | if len(y_true.shape) == 1: 19 | y_true = y_true[:, np.newaxis] 20 | 21 | y_true = y_true[np.newaxis, :, :] 22 | 23 | if sample_weight is not None and len(sample_weight.shape) == 1: 24 | sample_weight = sample_weight[:, np.newaxis] 25 | 26 | return y_true, sample_weight 27 | 28 | 29 | def bce_scorer(y_true, y_pred, sample_weight=None): 30 | """ 31 | 32 | Args: 33 | y_true: (nobj, nout) 34 | y_pred: (niter, nobj, nout) 35 | sample_weight: (nobj, 1) 36 | 37 | Returns: 38 | 39 | """ 40 | y_true, sample_weight = check_input(y_true, sample_weight) 41 | 42 | path = -np.log(y_true * y_pred + (1 - y_true) * (1 - y_pred)) 43 | path = path.sum(axis=-1).T 44 | 45 | if sample_weight is not None: 46 | path *= sample_weight 47 | 48 | return path 49 | 50 | 51 | def mse_scorer(y_true, y_pred, sample_weight=None): 52 | """ 53 | 54 | Args: 55 | y_true: (nobj, nout) 56 | y_pred: (niter, nobj, nout) 57 | sample_weight: (nobj, 1) 58 | 59 | Returns: 60 | 61 | """ 62 | y_true, sample_weight = check_input(y_true, sample_weight) 63 | 64 | path = (y_true - y_pred) ** 2 65 | path = path.sum(axis=-1).T 66 | 67 | if sample_weight is not None: 68 | path *= sample_weight 69 | 70 | return path 71 | 72 | 73 | def cent_scorer(y_true, y_pred, sample_weight=None): 74 | """ 75 | 76 | Args: 77 | y_true: (nobj, nout) 78 | y_pred: (niter, nobj, nout) 79 | sample_weight: (nobj, 1) 80 | 81 | Returns: 82 | 83 | """ 84 | y_true, sample_weight = check_input(y_true, sample_weight) 85 | 86 | path = -np.log(np.take_along_axis(y_pred, y_true, axis=2)[..., 0].T) 87 | 88 | if sample_weight is not None: 89 | path *= sample_weight 90 | 91 | return path 92 | 93 | 94 | @njit 95 | def select_preds(arr, leaves, order): 96 | """Select corresponding to cluster prediction 97 | 98 | Args: 99 | arr: np.ndarray, predictions 100 | leaves: np.ndarray, clusters 101 | order: np.ndarray, maps cluster label with position in prediction array 102 | 103 | Returns: 104 | np.ndarray, pruned prediction 105 | """ 106 | res = np.empty(arr.shape[1:], dtype=arr.dtype) 107 | 108 | for i in range(leaves.shape[0]): 109 | res[i] = arr[order[leaves[i]], i, :] 110 | 111 | return res 112 | 113 | 114 | class AdaptiveESCV(CrossValidation): 115 | """ 116 | Cross validation wrapper with built-in adaptive early stopping 117 | """ 118 | 119 | def __init__(self, base_learner, cluster, iters_to_fit, metric=None, random_state=42, batch_size=10000): 120 | super().__init__(deepcopy(base_learner), 
random_state) 121 | self._base_learner.params['es'] = 0 122 | self.cluster = cluster 123 | self.iters_to_fit = iters_to_fit 124 | self.metric = metric 125 | self.batch_size = batch_size 126 | 127 | self.best_split = None 128 | self.best_trees = None 129 | self.best_oof_trees = None 130 | 131 | def get_es_metric(self): 132 | 133 | if self.metric: 134 | return self.metric 135 | 136 | loss = self._base_learner.params['loss'] 137 | if type(loss) is str: 138 | loss = loss_alias[loss] 139 | 140 | if type(loss) is MSELoss: 141 | return mse_scorer 142 | 143 | if type(loss) is BCELoss: 144 | return bce_scorer 145 | 146 | if type(loss) is CrossEntropyLoss: 147 | return cent_scorer 148 | 149 | raise ValueError('Unknown loss func. Please specify metric manually') 150 | 151 | def fit_predict(self, X, y, sample_weight=None, cv=5, stratify=False, random_state=42): 152 | """ 153 | 154 | Args: 155 | X: 156 | y: 157 | sample_weight: 158 | cv: 159 | stratify: 160 | random_state: 161 | 162 | Returns: 163 | 164 | """ 165 | assert self.models is None, 'Is already trained' 166 | 167 | self.models = [] 168 | 169 | X, y, sample_weight, eval_sets = validate_input(X, y, sample_weight, {}) 170 | self._base_learner._infer_params() 171 | X_enc, max_bin, borders, eval_enc = self._base_learner.quantize(X, eval_sets) 172 | 173 | # create validation 174 | cv_iter = self.get_cv_iter(cv, stratify, random_state) 175 | 176 | # fit and free memory 177 | mempool = cp.cuda.MemoryPool() 178 | 179 | oof_pred, folds = self._fit_predict(mempool, X, X_enc, y, sample_weight, max_bin, borders, cv_iter) 180 | self.fit_cluster_tree(X, X_enc, y, sample_weight, max_bin, borders, folds) 181 | self.search_for_best_cluster(X, y, sample_weight, folds) 182 | 183 | # create out of fold pruned prediction 184 | 185 | for f in range(folds.max() + 1): 186 | idx = np.arange(X_enc.shape[0])[folds == f] 187 | X_test = X[idx] 188 | pred = self._get_stages([self.models[f]], self.best_oof_trees[f], X_test, batch_size=self.batch_size) 189 | oof_pred[idx] = self._prune_preds(self.best_oof_trees[f], X_test, pred, batch_size=self.batch_size) 190 | 191 | return oof_pred 192 | 193 | def fit_cluster_tree(self, X, X_enc, y, sample_weight, max_bin, borders, folds): 194 | """Fit cluster tree 195 | 196 | Args: 197 | X: 198 | X_enc: 199 | y: 200 | sample_weight: 201 | max_bin: 202 | borders: 203 | folds: 204 | 205 | Returns: 206 | 207 | """ 208 | paths = np.zeros((X_enc.shape[0], len(self.iters_to_fit)), dtype=np.float32) 209 | scorer = self.get_es_metric() 210 | 211 | for f in range(folds.max() + 1): 212 | idx = np.arange(X_enc.shape[0])[folds == f] 213 | val_pred = self.models[f].predict_staged(X[idx], iterations=self.iters_to_fit) 214 | paths[idx] = scorer(y[idx], val_pred, None if sample_weight is None else sample_weight[idx]) 215 | 216 | self.cluster.fit_quantized(X_enc, paths, max_bin, borders) 217 | self.cluster.to_cpu() 218 | 219 | def search_for_best_cluster(self, X, y, sample_weight, folds): 220 | """Search for the best cluster tree 221 | 222 | Args: 223 | X: 224 | y: 225 | sample_weight: 226 | folds: 227 | 228 | Returns: 229 | 230 | """ 231 | # predict cluster trees 232 | cl_ = self.cluster.predict(X) 233 | # zero clustering is a simple early stopping 234 | clusters = np.zeros((cl_.shape[0], cl_.shape[1] + 1), dtype=np.uint32) 235 | clusters[:, 1:] = cl_ 236 | 237 | scorer = self.get_es_metric() 238 | n_cand = clusters.shape[1] 239 | clust_per_split = clusters.max(axis=0) + 1 240 | nfolds = folds.max() + 1 241 | max_clust = clust_per_split.max() 242 | 
iter_num = self._base_learner.params['ntrees'] 243 | batch_size = 1000 244 | 245 | folds_stats = np.zeros((nfolds, n_cand, max_clust, iter_num), dtype=np.float32) 246 | 247 | # calculate oof errors 248 | for f in range(nfolds): 249 | idx = np.arange(X.shape[0])[folds == f] 250 | X_test, y_test, cl_test = X[idx], y[idx], clusters[idx] 251 | 252 | for i in range(0, X_test.shape[0], batch_size): 253 | 254 | val_pred = self.models[f].predict_staged(X_test[i:i + batch_size]) 255 | err = scorer(y_test[i:i + batch_size], val_pred, 256 | None if sample_weight is None else sample_weight[i:i + batch_size]) 257 | 258 | for j in range(n_cand): 259 | np.add.at(folds_stats[f, j], (cl_test[i:i + batch_size, j],), err) 260 | 261 | # select best by oof 262 | stats = folds_stats.sum(axis=0) # shape (nsplits, max_clust, niters) 263 | oof_stats = stats[np.newaxis, ...] - folds_stats # shape (nfolds, nsplits, max_clust, niters) 264 | 265 | best_iters = oof_stats.argmin(axis=-1) # shape (nfolds, nsplits, max_clust) 266 | best_errs = np.take_along_axis(folds_stats, best_iters[..., np.newaxis], axis=3)[..., 0].sum( 267 | axis=0) # shape (nsplits, max_clust) 268 | self.best_split = best_errs.sum(axis=1).argmin() # scalar 269 | best_oof_trees = best_iters[:, self.best_split] # shape (nfolds, max_clust) 270 | self.best_oof_trees = best_oof_trees[:, :clust_per_split[self.best_split]] 271 | 272 | # select best in total 273 | best_trees = stats[self.best_split].argmin(axis=-1) # shape (max_clust, ) 274 | self.best_trees = best_trees[:clust_per_split[self.best_split]] 275 | 276 | def _get_stages(self, models, iters, X, batch_size=100000): 277 | """ 278 | 279 | Args: 280 | models: 281 | iters: 282 | X: 283 | batch_size: 284 | 285 | Returns: 286 | 287 | """ 288 | sorted_iters = np.sort(np.unique(iters)) 289 | pred = models[0].predict_staged(X, iterations=sorted_iters, batch_size=batch_size) 290 | 291 | for i in range(1, len(models)): 292 | pred += models[i].predict_staged(X, iterations=sorted_iters, batch_size=batch_size) 293 | 294 | pred /= len(models) 295 | 296 | return pred 297 | 298 | def _prune_preds(self, iters, X, pred, batch_size=100000): 299 | """ 300 | 301 | Args: 302 | iters: 303 | X: 304 | pred: 305 | batch_size: 306 | 307 | Returns: 308 | 309 | """ 310 | if self.best_split == 0: 311 | cluster = np.zeros((X.shape[0],), dtype=np.uint32) 312 | else: 313 | cluster = self.cluster.predict(X, iterations=[self.best_split - 1], batch_size=batch_size)[:, 0] 314 | 315 | sorted_iters = np.sort(np.unique(iters)) 316 | order = np.searchsorted(sorted_iters, iters) 317 | 318 | return select_preds(pred, cluster, order) 319 | 320 | def predict(self, X, batch_size=100000): 321 | """ 322 | 323 | Args: 324 | X: 325 | batch_size: 326 | 327 | Returns: 328 | 329 | """ 330 | pred = self._get_stages(self.models, self.best_trees, X, batch_size=batch_size) 331 | return self._prune_preds(self.best_trees, X, pred, batch_size=batch_size) 332 | -------------------------------------------------------------------------------- /py_boost/cv/base.py: -------------------------------------------------------------------------------- 1 | """Gradient Boosting with built-in cross validation""" 2 | 3 | import numpy as np 4 | try: 5 | import cupy as cp 6 | except Exception: 7 | pass 8 | from copy import deepcopy 9 | 10 | from sklearn.model_selection import KFold, StratifiedKFold 11 | from ..gpu.utils import validate_input 12 | 13 | 14 | class CustomFolds: 15 | """ 16 | Class to imitate sklearn cv for custom folds 17 | """ 18 | 19 | def __init__(self, 
folds): 20 | self.folds = folds 21 | 22 | def split(self, *args, **kwargs): 23 | nfolds = int(self.folds.max() + 1) 24 | idx = np.arange(len(self.folds)) 25 | 26 | splits = [] 27 | 28 | for i in range(nfolds): 29 | splits.append((idx[self.folds != i], idx[self.folds == i])) 30 | 31 | return splits 32 | 33 | 34 | class CrossValidation: 35 | """ 36 | Cross validation wrapper for gradient boosting 37 | """ 38 | 39 | def __init__(self, base_learner, random_state=42): 40 | """ 41 | 42 | Args: 43 | base_learner: 44 | random_state: 45 | """ 46 | self._base_learner = base_learner 47 | self.random_state = random_state 48 | self.models = None 49 | 50 | def _fit_predict(self, mempool, X, X_enc, y, sample_weight, max_bin, borders, cv_iter): 51 | 52 | oof_pred = None 53 | folds = np.zeros(X.shape[0], dtype=np.int32) 54 | 55 | with cp.cuda.using_allocator(allocator=mempool.malloc): 56 | 57 | for n, (f0, f1) in enumerate(cv_iter.split(X, y)): 58 | 59 | # split data 60 | 61 | X_tr, X_enc_tr, y_tr, = X[f0], X_enc[f0], y[f0] 62 | 63 | sample_weight_tr = None 64 | if sample_weight is not None: 65 | sample_weight_tr = sample_weight[f0] 66 | 67 | eval_sets = [{ 68 | 69 | 'X': X[f1], 70 | 'y': y[f1], 71 | 'sample_weight': None if sample_weight is None else sample_weight[f1] 72 | 73 | }] 74 | 75 | eval_enc = [X_enc[f1]] 76 | 77 | # fit model 78 | model = deepcopy(self._base_learner) 79 | model._infer_params() 80 | builder, build_info = model._create_build_info(mempool, X_tr, X_enc_tr, y_tr, sample_weight_tr, 81 | max_bin, borders, eval_sets, eval_enc) 82 | model._fit(builder, build_info) 83 | 84 | # predict 85 | 86 | val_pred = model.predict(eval_sets[0]['X']) 87 | model.to_cpu() 88 | 89 | if oof_pred is None: 90 | oof_pred = np.zeros((X.shape[0], val_pred.shape[1]), dtype=np.float32) 91 | 92 | oof_pred[f1] = val_pred 93 | folds[f1] = n 94 | self.models.append(model) 95 | 96 | mempool.free_all_blocks() 97 | 98 | return oof_pred, folds 99 | 100 | def get_cv_iter(self, cv, stratify, random_state): 101 | 102 | if type(cv) in [int, float]: 103 | cv = int(cv) 104 | if stratify: 105 | folds = StratifiedKFold(cv, shuffle=True, random_state=random_state) 106 | else: 107 | folds = KFold(cv, shuffle=True) 108 | 109 | else: 110 | folds = CustomFolds(cv) 111 | 112 | return folds 113 | 114 | def fit_predict(self, X, y, sample_weight=None, cv=5, stratify=False, random_state=42): 115 | """ 116 | 117 | Args: 118 | X: 119 | y: 120 | sample_weight: 121 | cv: 122 | stratify: 123 | random_state: 124 | 125 | Returns: 126 | 127 | """ 128 | assert self.models is None, 'Is already trained' 129 | 130 | self.models = [] 131 | 132 | X, y, sample_weight, eval_sets = validate_input(X, y, sample_weight, {}) 133 | self._base_learner._infer_params() 134 | X_enc, max_bin, borders, eval_enc = self._base_learner.quantize(X, eval_sets) 135 | 136 | # create validation 137 | cv_iter = self.get_cv_iter(cv, stratify, random_state) 138 | 139 | # fit and free memory 140 | mempool = cp.cuda.MemoryPool() 141 | 142 | oof_pred, folds = self._fit_predict(mempool, X, X_enc, y, sample_weight, max_bin, borders, cv_iter) 143 | 144 | return oof_pred 145 | 146 | def predict(self, X): 147 | """ 148 | 149 | Args: 150 | X: 151 | 152 | Returns: 153 | 154 | """ 155 | res = None 156 | 157 | for model in self.models: 158 | 159 | pred = model.predict(X) 160 | if res is None: 161 | res = pred 162 | else: 163 | res += pred 164 | 165 | res /= len(self.models) 166 | 167 | return res 168 | -------------------------------------------------------------------------------- 
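A minimal usage sketch for the `CrossValidation` wrapper defined above (not part of the original sources). It assumes `X`, `y`, `X_test` are NumPy arrays you provide and that the `'mse'` loss alias is used:

```python
from py_boost import GradientBoosting
from py_boost.cv import CrossValidation

# wrap any configured booster; fit_predict returns out-of-fold predictions,
# predict averages the predictions of the per-fold models
cv = CrossValidation(GradientBoosting('mse', ntrees=200))
oof_pred = cv.fit_predict(X, y, cv=5)
test_pred = cv.predict(X_test)
```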
/py_boost/cv/cluster_tree.py: -------------------------------------------------------------------------------- 1 | from ..gpu.utils import * 2 | from ..gpu.tree import * 3 | from ..gpu.base import Ensemble 4 | 5 | from ..quantization.base import QuantileQuantizer 6 | 7 | 8 | def cluster_grow_tree(tree, group, arr, grad, hess, row_indexer, col_indexer, params): 9 | """Graw tree for advanced pruning 10 | 11 | Args: 12 | tree: 13 | group: 14 | arr: 15 | grad: 16 | hess: 17 | row_indexer: 18 | col_indexer: 19 | params: 20 | 21 | Returns: 22 | 23 | """ 24 | # create gh 25 | gh = cp.concatenate((grad, hess), axis=1) 26 | out_indexer = cp.arange(gh.shape[1], dtype=cp.uint64) 27 | 28 | # init nodes with single zero node 29 | unique_nodes = np.zeros(1, dtype=np.int32) 30 | # count unique nodes in active rows 31 | nodes_count = cp.ones(1, dtype=cp.uint64) * row_indexer.shape[0] 32 | # nodes for all rows 33 | nodes = cp.zeros(arr.shape[0], dtype=cp.int32) 34 | # index of node in unique array 35 | node_indexes = nodes 36 | prev_hist, small_index, big_index = [None] * 3 37 | 38 | for niter in range(params['max_depth']): 39 | 40 | nnodes = len(unique_nodes) 41 | gh_hist = histogram(arr, gh, node_indexes, 42 | col_indexer=col_indexer, 43 | row_indexer=row_indexer, 44 | out_indexer=out_indexer, 45 | nnodes=nnodes, 46 | max_bins=params['max_bin'], 47 | prev_hist=prev_hist, 48 | small_index=small_index, 49 | big_index=big_index) 50 | 51 | # assume hess is the last output 52 | 53 | hist, counts = gh_hist[:-1], gh_hist[-1] 54 | total = hist[..., :1, -1:] 55 | curr = total.min(axis=0) 56 | gain = cp.zeros(hist.shape[1:] + (2,), dtype=cp.float32) 57 | 58 | # NAN to left 59 | gain[..., 0] = curr - hist.min(axis=0) - (total - hist).min(axis=0) 60 | gain[..., 0] *= cp.minimum(counts, counts[..., -1:] - counts) >= params['min_data_in_leaf'] 61 | 62 | # NAN to right 63 | gain[..., 1] = curr - (hist - hist[..., :1]).min(axis=0) - (total - hist + hist[..., :1]).min(axis=0) 64 | gain[..., 1] *= cp.minimum(counts - counts[..., :1:], counts[..., -1:] - counts + counts[..., :1]) >= params[ 65 | 'min_data_in_leaf'] 66 | 67 | best_feat, best_gain, best_split, best_nan_left = get_best_split(gain, col_indexer) 68 | 69 | # move to CPU and apply min_gain_to_split condition 70 | unique_nodes, new_nodes_id, best_feat, best_gain, best_split, best_nan_left, is_valid_node = \ 71 | get_cpu_splitters(unique_nodes, best_feat, best_gain, best_split, best_nan_left, 72 | params['min_gain_to_split']) 73 | # if all nodes are not valid to split - exit 74 | if len(unique_nodes) == 0: 75 | break 76 | # write node info to the Tree 77 | tree.set_nodes(group, unique_nodes, new_nodes_id, best_feat, best_gain, best_split, best_nan_left) 78 | # get args back on gpu 79 | split_args, unique_nodes = get_gpu_splitters(unique_nodes, new_nodes_id, 80 | best_feat, best_split, best_nan_left) 81 | 82 | # perform split for train set 83 | nodes, node_indexes = make_split(nodes, arr, *split_args, return_pos=True) 84 | 85 | # update info for the next step 86 | if niter < (params['max_depth'] - 1): 87 | # update counts 88 | nodes_count = cp.zeros((unique_nodes.shape[0] + 1,), dtype=np.uint64) 89 | nodes_count.scatter_add(node_indexes[row_indexer], 1) 90 | nodes_count = nodes_count[:-1] 91 | 92 | cpu_counts = nodes_count.get() 93 | 94 | # remove unused rows from indexer 95 | if cpu_counts.sum() < row_indexer.shape[0]: 96 | row_indexer = row_indexer[isin(nodes, split_args[1].ravel(), index=row_indexer)] 97 | 98 | # save histogram for the subs trick 99 | prev_hist, 
small_index, big_index = get_prev_hist(cpu_counts, 100 | gh_hist, cp.asarray(is_valid_node)) 101 | 102 | return nodes 103 | 104 | 105 | class ClusterTreeBuilder: 106 | """Tree builder for early stopping clusters""" 107 | 108 | def __init__(self, borders, 109 | **tree_params 110 | ): 111 | """ 112 | 113 | Args: 114 | borders: list of np.ndarray, actual split borders for quantized features 115 | **tree_params: other tree building parameters 116 | """ 117 | self.borders = borders 118 | 119 | self.params = {**{ 120 | 121 | 'max_bin': 256, 122 | 'max_depth': 6, 123 | 'min_data_in_leaf': 10, 124 | 'min_gain_to_split': 0 125 | 126 | }, **tree_params} 127 | 128 | def build_tree(self, X, y): 129 | """Build tree 130 | 131 | Args: 132 | X: cp.ndarray, quantized feature matrix 133 | y: cp.ndarray, loss path matrix 134 | 135 | 136 | Returns: 137 | tree, Tree, constructed tree 138 | """ 139 | 140 | col_indexer = cp.arange(X.shape[1], dtype=cp.uint64) 141 | row_indexer = cp.arange(X.shape[0], dtype=cp.uint64) 142 | max_nodes = int((2 ** np.arange(self.params['max_depth'] + 1)).sum()) 143 | tree = Tree(max_nodes, y.shape[1], 1) 144 | # grow single group of the tree and get nodes index 145 | cluster_grow_tree(tree, 0, X, y, cp.ones((y.shape[0], 1), dtype=cp.float32), 146 | row_indexer, col_indexer, self.params) 147 | 148 | tree.set_borders(self.borders) 149 | tree.set_leaves() 150 | tree.set_node_values(np.zeros((max_nodes, 1), dtype=np.float32), np.zeros((1,), dtype=np.uint64)) 151 | 152 | return tree 153 | 154 | 155 | class ClusterCandidates(Ensemble): 156 | """ 157 | Ensemble of cluster candidates 158 | """ 159 | 160 | def __init__(self, depth_range=range(1, 7), min_data_in_leaf=100, debug=False): 161 | super().__init__() 162 | 163 | self._debug = debug 164 | self.depth_range = depth_range 165 | self.min_data_in_leaf = min_data_in_leaf 166 | self.max_clust = 2 ** max(depth_range) 167 | 168 | def fit(self, X, y): 169 | X, y, sample_weight, eval_sets = validate_input(X, y, None, []) 170 | mempool = cp.cuda.MemoryPool() 171 | with cp.cuda.using_allocator(allocator=mempool.malloc): 172 | # TODO: move quantizer to the Ensemble 173 | quantizer = QuantileQuantizer(sample=self.quant_sample, max_bin=self.max_bin) 174 | X_enc, max_bin, borders, eval_enc = self.quantize(X, eval_sets) 175 | 176 | self.fit_quantized(X_enc, y, max_bin, borders) 177 | mempool.free_all_blocks() 178 | 179 | return self 180 | 181 | def fit_quantized(self, X_enc, y, max_bin, borders): 182 | y = cp.array(y, order='C', dtype=cp.float32) 183 | X_cp = pad_and_move(X_enc) 184 | self.models = [] 185 | 186 | for d in self.depth_range: 187 | builder = ClusterTreeBuilder(borders, max_depth=d, min_data_in_leaf=self.min_data_in_leaf, max_bin=max_bin) 188 | 189 | tree = builder.build_tree(X_cp, y) 190 | tree.reformat(nfeats=X_cp.shape[1], debug=self._debug) 191 | self.models.append(tree) 192 | 193 | self.base_score = np.zeros((1,), dtype=np.float32) 194 | 195 | return self 196 | 197 | def predict(self, X, iterations=None, batch_size=100000): 198 | return self.predict_leaves(X, iterations=iterations, batch_size=batch_size)[..., 0].T 199 | -------------------------------------------------------------------------------- /py_boost/gpu/__init__.py: -------------------------------------------------------------------------------- 1 | """Contains the core functions and classes""" 2 | 3 | from .boosting import GradientBoosting 4 | 5 | # __all__ = ['GradientBoosting'] 6 | -------------------------------------------------------------------------------- 
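The `GradientBoosting` class defined next accepts user-defined callbacks through its `callbacks` argument. Below is a minimal illustrative sketch of such a callback; it relies only on the documented `build_info['num_iter']` field and the `Callback.after_iteration` contract, and the printed message and class name are hypothetical:

```python
from py_boost import GradientBoosting, Callback


class IterationPrinter(Callback):
    """Illustrative callback: print a message every `period` iterations."""

    def __init__(self, period=25):
        self.period = period

    def after_iteration(self, build_info):
        # build_info['num_iter'] holds the current iteration index
        num_iter = build_info['num_iter']
        if num_iter % self.period == 0:
            print(f'iteration {num_iter} finished')
        return False  # never request early termination


model = GradientBoosting('mse', ntrees=100, callbacks=[IterationPrinter(10)])
```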
/py_boost/gpu/boosting.py: -------------------------------------------------------------------------------- 1 | """Gradient boosting builder""" 2 | 3 | try: 4 | import cupy as cp 5 | except Exception: 6 | pass 7 | from .base import Ensemble 8 | from .losses import loss_alias 9 | from .tree import DepthwiseTreeBuilder 10 | from .utils import pad_and_move, validate_input 11 | from ..callbacks.callback import EarlyStopping, EvalHistory, CallbackPipeline 12 | from ..multioutput.sketching import GradSketch 13 | from ..multioutput.target_splitter import SingleSplitter, OneVsAllSplitter 14 | from ..sampling.bagging import BaseSampler 15 | 16 | 17 | class GradientBoosting(Ensemble): 18 | """Basic Gradient Boosting on depthwise trees""" 19 | 20 | def __init__(self, loss, 21 | metric=None, 22 | ntrees=100, 23 | lr=0.05, 24 | min_gain_to_split=0, 25 | lambda_l2=1, 26 | gd_steps=1, 27 | 28 | max_depth=6, 29 | min_data_in_leaf=10, 30 | colsample=1., 31 | subsample=1., 32 | target_splitter='Single', 33 | multioutput_sketch=None, 34 | use_hess=True, 35 | 36 | quantization='Quantile', 37 | quant_sample=2000000, 38 | max_bin=256, 39 | min_data_in_bin=3, 40 | 41 | es=100, 42 | seed=42, 43 | verbose=10, 44 | callbacks=None, 45 | 46 | debug=False 47 | ): 48 | """ 49 | 50 | Args: 51 | loss: str or Loss, loss function 52 | metric: None or str or Metric, metric 53 | ntrees: int, maximum number of trees 54 | lr: float, learning rate 55 | min_gain_to_split: float >=0, minimal gain to split 56 | lambda_l2: float > 0, l2 leaf regularization 57 | gd_steps: int > 0, number of gradient steps while computing leaf values 58 | 59 | max_depth: int > 0, maximum tree depth. Setting it to large values (>12) may cause OOM for wide datasets 60 | min_data_in_leaf: int, minimal leaf size. Note - for some loss fn leaf size is approximated 61 | with hessian values to speed up training 62 | colsample: float or Callable, subsample of columns used to construct trees, or a callable for custom sampling 63 | subsample: float or Callable, subsample of rows used to construct trees, or a callable for custom sampling 64 | 65 | target_splitter: str or Callable, target splitter, defines the multioutput strategy: 66 | 'Single', 'OneVsAll' or custom 67 | multioutput_sketch: None or Callable. Defines the sketching strategy to simplify the scoring function 68 | in the multioutput case. If None, the full scoring function is used 69 | use_hess: If True, hessians will be used in the tree structure search 70 | 71 | quantization: str or Quantizer, method for quantization. One of 'Quantile', 'Uniform', 72 | 'Uniquant' or custom implementation 73 | quant_sample: int, subsample to quantize features 74 | max_bin: int in [2, 256] maximum number of bins to quantize features 75 | min_data_in_bin: int in [2, 256] minimal bin size. NOTE: currently ignored 76 | 77 | es: int, early stopping rounds.
If 0, no early stopping 78 | seed: int, random state 79 | verbose: int, verbosity freq 80 | callbacks: list of Callback, callbacks to customize training are passed here 81 | 82 | debug: bool, if debug mode is enabled (it removes ability to use deprecated functions, 83 | thus optimizing memory usage) 84 | """ 85 | 86 | super().__init__() 87 | 88 | self.params = { 89 | 90 | 'loss': loss, 91 | 'metric': metric, 92 | 'ntrees': ntrees, 93 | 'lr': lr, 94 | 'min_gain_to_split': min_gain_to_split, 95 | 'lambda_l2': lambda_l2, 96 | 'gd_steps': gd_steps, 97 | 98 | 'max_depth': max_depth, 99 | 'min_data_in_leaf': min_data_in_leaf, 100 | 'colsample': colsample, 101 | 'subsample': subsample, 102 | 103 | 'target_splitter': target_splitter, 104 | 'multioutput_sketch': multioutput_sketch, 105 | 'use_hess': use_hess, 106 | 107 | 'quantization': quantization, 108 | 'quant_sample': quant_sample, 109 | 'max_bin': max_bin, 110 | 'min_data_in_bin': min_data_in_bin, 111 | 112 | 'es': es, 113 | 'seed': seed, 114 | 'verbose': verbose, 115 | 'callbacks': callbacks, 116 | 117 | 'debug': debug 118 | } 119 | 120 | def _infer_params(self): 121 | 122 | self.ntrees = self.params['ntrees'] 123 | self.lr = self.params['lr'] 124 | 125 | assert self.params['min_gain_to_split'] >= 0, 'Param min_gain_to_split should be >= 0' 126 | 127 | self.min_gain_to_split = self.params['min_gain_to_split'] 128 | self.lambda_l2 = self.params['lambda_l2'] 129 | self.gd_steps = self.params['gd_steps'] 130 | 131 | self.max_depth = self.params['max_depth'] 132 | self.min_data_in_leaf = self.params['min_data_in_leaf'] 133 | self.use_hess = self.params['use_hess'] 134 | 135 | self.colsample = self.params['colsample'] 136 | if type(self.params['colsample']) in [float, int]: 137 | self.colsample = BaseSampler(self.params['colsample'], axis=1) 138 | 139 | self.subsample = self.params['subsample'] 140 | if type(self.params['subsample']) in [float, int]: 141 | self.subsample = BaseSampler(self.params['subsample'], axis=0) 142 | 143 | if self.params['target_splitter'] == 'Single': 144 | splitter = SingleSplitter() 145 | elif self.params['target_splitter'] == 'OneVsAll': 146 | splitter = OneVsAllSplitter() 147 | else: 148 | splitter = self.params['target_splitter'] 149 | 150 | self.target_splitter = splitter 151 | 152 | self.multioutput_sketch = self.params['multioutput_sketch'] 153 | if self.params['multioutput_sketch'] is None: 154 | self.multioutput_sketch = GradSketch() 155 | 156 | self.quantization = self.params['quantization'] 157 | self.quant_sample = self.params['quant_sample'] 158 | self.max_bin = self.params['max_bin'] 159 | self.min_data_in_bin = self.params['min_data_in_bin'] 160 | 161 | self.es = self.params['es'] 162 | self.verbose = self.params['verbose'] 163 | 164 | self.loss = self.params['loss'] 165 | if type(self.params['loss']) is str: 166 | self.loss = loss_alias[self.params['loss']] 167 | 168 | self.postprocess_fn = self.loss.postprocess_output 169 | 170 | self.metric = self.params['metric'] 171 | if self.params['metric'] is None or type(self.params['metric']) is str: 172 | self.metric = self.loss.get_metric_from_string(self.params['metric']) 173 | self.seed = self.params['seed'] 174 | 175 | self.history = [] 176 | 177 | self.callbacks = CallbackPipeline( 178 | 179 | self.subsample, 180 | self.colsample, 181 | self.target_splitter, 182 | self.multioutput_sketch, 183 | *([] if self.params['callbacks'] is None else self.params['callbacks']), 184 | EvalHistory(self.history, verbose=self.params['verbose']), 185 | 
EarlyStopping(self.params['es']), 186 | ) 187 | 188 | def _fit(self, builder, build_info): 189 | """Fit with tree builder and build info 190 | 191 | Args: 192 | builder: DepthwiseTreeBuilder 193 | build_info: build info state dict 194 | 195 | Returns: 196 | 197 | """ 198 | train, valid = build_info['data']['train'], build_info['data']['valid'] 199 | self.callbacks.before_train(build_info) 200 | 201 | for i in range(self.ntrees): 202 | 203 | build_info['num_iter'] = i 204 | train['grad'], train['hess'] = self.loss(train['target'], train['ensemble']) 205 | 206 | self.callbacks.before_iteration(build_info) 207 | 208 | tree, leaves, preds, val_leaves, val_preds = \ 209 | builder.build_tree(train['features_gpu'], 210 | train['grad'], 211 | train['hess'], 212 | train['sample_weight'], 213 | lambda x: self.loss(train['target'], train['ensemble'] + x), 214 | *valid['features_gpu']) 215 | 216 | # update ensemble 217 | train['ensemble'] += preds 218 | for vp, tp in zip(valid['ensemble'], val_preds): 219 | vp += tp 220 | 221 | train['last_tree'] = { 222 | 223 | 'leaves': leaves, 224 | 'preds': preds 225 | 226 | } 227 | valid['last_tree'] = { 228 | 229 | 'leaves': val_leaves, 230 | 'preds': val_preds 231 | 232 | } 233 | 234 | self.models.append(tree) 235 | # check exit info 236 | if self.callbacks.after_iteration(build_info): 237 | tree.reformat(nfeats=self.nfeats, debug=self.params['debug']) 238 | break 239 | tree.reformat(nfeats=self.nfeats, debug=self.params['debug']) 240 | 241 | self.callbacks.after_train(build_info) 242 | self.base_score = self.base_score.get() 243 | 244 | def fit(self, X, y, sample_weight=None, eval_sets=None): 245 | """Fit model 246 | 247 | Args: 248 | X: np.ndarray feature matrix 249 | y: np.ndarray of target 250 | sample_weight: np.ndarray of sample weights 251 | eval_sets: list of dict of eval sets. Ex [{'X':X0, 'y': y0, 'sample_weight': w0}, ...}] 252 | 253 | Returns: 254 | trained instance 255 | """ 256 | self._infer_params() 257 | 258 | X, y, sample_weight, eval_sets = validate_input(X, y, sample_weight, eval_sets) 259 | # fit and free memory 260 | mempool = cp.cuda.MemoryPool() 261 | with cp.cuda.using_allocator(allocator=mempool.malloc): 262 | # quantize 263 | X_enc, max_bin, borders, eval_enc = self.quantize(X, eval_sets) 264 | # create build info 265 | builder, build_info = self._create_build_info(mempool, X, X_enc, y, sample_weight, 266 | max_bin, borders, eval_sets, eval_enc) 267 | self._fit(builder, build_info) 268 | mempool.free_all_blocks() 269 | 270 | return self 271 | 272 | def _create_build_info(self, mempool, X, X_enc, y, sample_weight, max_bin, borders, eval_sets, eval_enc): 273 | """Quantize data, create tree builder and build_info 274 | 275 | Args: 276 | mempool: cp.cuda.MemoryPool, memory pool to use 277 | X: np.ndarray feature matrix 278 | y: np.ndarray of target 279 | sample_weight: np.ndarray of sample weights 280 | eval_sets: list of dict of eval sets. 
Ex [{'X':X0, 'y': y0, 'sample_weight': w0}, ...}] 281 | 282 | Returns: 283 | DepthwiseTreeBuilder, build_info 284 | """ 285 | # quantization 286 | 287 | y = cp.array(y, order='C', dtype=cp.float32) 288 | 289 | if sample_weight is not None: 290 | sample_weight = cp.array(sample_weight, order='C', dtype=cp.float32) 291 | 292 | X_cp = pad_and_move(X_enc) 293 | 294 | X_val = [cp.array(x, order='C') for x in eval_enc] 295 | y_val = [cp.array(x['y'], order='C', dtype=cp.float32) for x in eval_sets] 296 | w_val = [None if x['sample_weight'] is None else cp.array(x['sample_weight'], order='C', dtype=cp.float32) 297 | for x in eval_sets] 298 | 299 | # save nfeatures for the feature importances 300 | self.nfeats = X.shape[1] 301 | 302 | builder = DepthwiseTreeBuilder(borders, 303 | use_hess=self.use_hess, 304 | colsampler=self.colsample, 305 | subsampler=self.subsample, 306 | target_splitter=self.target_splitter, 307 | multioutput_sketch=self.multioutput_sketch, 308 | gd_steps=self.gd_steps, 309 | lr=self.lr, 310 | min_gain_to_split=self.min_gain_to_split, 311 | min_data_in_leaf=self.min_data_in_leaf, 312 | lambda_l2=self.lambda_l2, 313 | max_depth=self.max_depth, 314 | max_bin=max_bin, 315 | ) 316 | cp.random.seed(self.seed) 317 | 318 | y = self.loss.preprocess_input(y) 319 | y_val = [self.loss.preprocess_input(x) for x in y_val] 320 | self.base_score = self.loss.base_score(y) 321 | 322 | # init ensembles 323 | ens = cp.empty((y.shape[0], self.base_score.shape[0]), order='C', dtype=cp.float32) 324 | ens[:] = self.base_score 325 | # init val ens 326 | val_ens = [cp.empty((x.shape[0], self.base_score.shape[0]), order='C') for x in y_val] 327 | for ve in val_ens: 328 | ve[:] = self.base_score 329 | 330 | self.models = [] 331 | 332 | build_info = { 333 | 'data': { 334 | 'train': { 335 | 'features_cpu': X, 336 | 'features_gpu': X_cp, 337 | 'target': y, 338 | 'sample_weight': sample_weight, 339 | 'ensemble': ens, 340 | 'grad': None, 341 | 'hess': None 342 | }, 343 | 'valid': { 344 | 'features_cpu': [dat['X'] for dat in eval_sets], 345 | 'features_gpu': X_val, 346 | 'target': y_val, 347 | 'sample_weight': w_val, 348 | 'ensemble': val_ens, 349 | } 350 | }, 351 | 'borders': borders, 352 | 'model': self, 353 | 'mempool': mempool, 354 | 'builder': builder 355 | } 356 | 357 | return builder, build_info 358 | 359 | def load(self, file): 360 | """Load weights fromm file 361 | 362 | Args: 363 | file: str, file path 364 | 365 | Returns: 366 | Py-Boost GradientBoosting 367 | """ 368 | self._infer_params() 369 | 370 | return super().load(file) 371 | -------------------------------------------------------------------------------- /py_boost/gpu/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .losses import * 2 | from .metrics import * 3 | from .multiclass_metrics import * 4 | 5 | __all__ = [ 6 | 7 | 'loss_alias', 8 | 'Loss', 9 | 'MSELoss', 10 | 'MSLELoss', 11 | 'BCELoss', 12 | 'CrossEntropyLoss', 13 | 14 | 'Metric', 15 | 'RMSEMetric', 16 | 'RMSLEMetric', 17 | 'R2Score', 18 | 'BCEMetric', 19 | 'AccuracyMetric', 20 | 'RocAucMetric', 21 | 22 | 'Precision', 23 | 'Recall', 24 | 'F1Score', 25 | 26 | 'MultiAccuracyMetric', 27 | 'MultiPrecision', 28 | 'MultiRecall', 29 | 'MultiF1Score' 30 | 31 | ] 32 | -------------------------------------------------------------------------------- /py_boost/gpu/losses/losses.py: -------------------------------------------------------------------------------- 1 | """Common losses""" 2 | 3 | import numpy as np 4 | try: 5 | import 
cupy as cp 6 | CUDA_FOUND = True 7 | except Exception: 8 | CUDA_FOUND = False 9 | 10 | from .metrics import metric_alias, RMSEMetric, RMSLEMetric, BCEMetric 11 | from .multiclass_metrics import multiclass_metric_alias, CrossEntropyMetric 12 | 13 | 14 | class Loss: 15 | """Base class to define loss function""" 16 | 17 | def get_grad_hess(self, y_true, y_pred): 18 | """Method implements how to calculate gradients and hessians. 19 | Output gradient should have the shape (n_samples, n_outputs) 20 | Output hessian should have the shape (n_samples, n_outputs) or (n_samples, 1) 21 | if the same hess is used for all outputs (for ex. MSELoss) 22 | 23 | The definition doesn't use sample_weight, because it is applied later, at the tree building stage 24 | 25 | Args: 26 | y_true: cp.ndarray, target 27 | y_pred: cp.ndarray, current prediction 28 | 29 | Returns: 30 | 31 | """ 32 | raise NotImplementedError 33 | 34 | def __call__(self, y_true, y_pred): 35 | grad, hess = self.get_grad_hess(y_true, y_pred) 36 | return grad, hess 37 | 38 | def preprocess_input(self, y_true): 39 | """Method defines how the raw input target variable should be processed before training starts 40 | (ex. applying log1p for MSLELoss) 41 | 42 | Args: 43 | y_true: cp.ndarray, raw target 44 | 45 | Returns: 46 | 47 | """ 48 | return y_true 49 | 50 | def postprocess_output(self, y_pred): 51 | """Method defines how to postprocess the sum of trees to the output prediction (ex. apply sigmoid for BCELoss) 52 | 53 | Args: 54 | y_pred: cp.ndarray, predictions 55 | 56 | Returns: 57 | 58 | """ 59 | return y_pred 60 | 61 | def base_score(self, y_true): 62 | """Method defines how to initialize an empty ensemble (ex. initialize with mean values for MSELoss) 63 | 64 | Args: 65 | y_true: cp.ndarray, processed target (after applying preprocess_input) 66 | 67 | Returns: 68 | 69 | """ 70 | raise NotImplementedError 71 | 72 | def get_metric_from_string(self, name=None): 73 | """Method defines how to interpret an eval metric given as a string or None. 74 | For example, you can define the default metric to use here if name is None 75 | 76 | Args: 77 | name: None or str, metric name alias 78 | 79 | Returns: 80 | 81 | """ 82 | return metric_alias[name] 83 | 
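# --- Illustrative example (documentation sketch, not part of the original losses.py) ---
# A minimal custom loss that follows the Loss interface above: Poisson regression with a log link.
# With mu = exp(pred) the gradient is mu - y and the hessian is mu; base_score starts the ensemble
# at log(mean(y)). Falling back to RMSEMetric as the default metric is an assumption of this sketch.

class PoissonLossExample(Loss):
    """Sketch of a user-defined loss: Poisson deviance with a log link"""

    def get_grad_hess(self, y_true, y_pred):
        mu = cp.exp(y_pred)
        return mu - y_true, mu

    def base_score(self, y_true):
        # start from the log of the per-output target mean, clipped away from zero
        return cp.log(cp.clip(y_true.mean(axis=0), 1e-10, None))

    def postprocess_output(self, y_pred):
        xp = np if type(y_pred) is np.ndarray else cp
        return xp.exp(y_pred)

    def get_metric_from_string(self, name=None):
        if name is None:
            return RMSEMetric()
        return metric_alias[name]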
84 | 85 | class MSELoss(Loss): 86 | """MSE Loss function for regression/multiregression""" 87 | 88 | def get_grad_hess(self, y_true, y_pred): 89 | return (y_pred - y_true), cp.ones((y_true.shape[0], 1), dtype=cp.float32) 90 | 91 | def base_score(self, y_true): 92 | return y_true.mean(axis=0) 93 | 94 | def get_metric_from_string(self, name=None): 95 | if name is None: 96 | return RMSEMetric() 97 | return metric_alias[name] 98 | 99 | 100 | class MSLELoss(Loss): 101 | """MSLE Loss function for regression/multiregression""" 102 | 103 | def preprocess_input(self, y_true): 104 | assert (y_true >= 0).all(), 'Inputs for msle should be non-negative' 105 | 106 | return y_true 107 | 108 | def get_grad_hess(self, y_true, y_pred): 109 | y_true = cp.log1p(y_true) 110 | 111 | return (y_pred - y_true), cp.ones((y_true.shape[0], 1), dtype=cp.float32) 112 | 113 | def postprocess_output(self, y_pred): 114 | return cp.expm1(y_pred) 115 | 116 | def base_score(self, y_true): 117 | y_true = cp.log1p(y_true) 118 | return y_true.mean(axis=0) 119 | 120 | def get_metric_from_string(self, name=None): 121 | if name is None: 122 | return RMSLEMetric() 123 | return metric_alias[name] 124 | 125 | 126 | class BCELoss(Loss): 127 | """LogLoss for binary/multilabel classification""" 128 | 129 | def __init__(self, clip_value=1e-7): 130 | self.clip_value = clip_value 131 | 132 | def base_score(self, y_true): 133 | means = cp.clip(y_true.mean(axis=0), self.clip_value, 1 - self.clip_value) 134 | return cp.log(means / (1 - means)) 135 | 136 | def get_grad_hess(self, y_true, y_pred): 137 | pred = 1 / (1 + cp.exp(-y_pred)) 138 | pred = cp.clip(pred, self.clip_value, 1 - self.clip_value) 139 | grad = pred - y_true 140 | hess = pred * (1 - pred) 141 | 142 | return grad, hess 143 | 144 | def postprocess_output(self, y_pred): 145 | xp = np if type(y_pred) is np.ndarray else cp 146 | pred = 1 / (1 + xp.exp(-y_pred)) 147 | pred = xp.clip(pred, self.clip_value, 1 - self.clip_value) 148 | 149 | return pred 150 | 151 | def get_metric_from_string(self, name=None): 152 | if name is None: 153 | return BCEMetric() 154 | return metric_alias[name] 155 | 156 | 157 | def softmax(x, clip_val=1e-5): 158 | 159 | xp = np if type(x) is np.ndarray else cp 160 | exp_p = xp.exp(x - x.max(axis=1, keepdims=True)) 161 | 162 | return xp.clip(exp_p / exp_p.sum(axis=1, keepdims=True), clip_val, 1 - clip_val) 163 | 164 | 165 | # multiclass losses 166 | 167 | ce_grad_kernel = cp.ElementwiseKernel( 168 | 'T pred, raw S label, raw S nlabels, T factor', 169 | 'T grad, T hess', 170 | 171 | """ 172 | int y_pr = i % nlabels; 173 | int y_tr = label[i / nlabels]; 174 | 175 | grad = pred - (float) (y_pr == y_tr); 176 | hess = pred * (1. 
- pred) * factor; 177 | 178 | """, 179 | "ce_grad_kernel" 180 | ) if CUDA_FOUND else None 181 | 182 | 183 | def ce_grad(y_true, y_pred): 184 | factor = y_pred.shape[1] / (y_pred.shape[1] - 1) 185 | grad, hess = ce_grad_kernel(y_pred, y_true, y_pred.shape[1], factor) 186 | 187 | return grad, hess 188 | 189 | 190 | class CrossEntropyLoss(Loss): 191 | """CrossEntropy for multiclass classification""" 192 | 193 | def __init__(self, clip_value=1e-6): 194 | self.clip_value = clip_value 195 | 196 | def base_score(self, y_true): 197 | num_classes = int(y_true.max() + 1) 198 | hist = cp.zeros((num_classes,), dtype=cp.float32) 199 | 200 | return hist 201 | 202 | def get_grad_hess(self, y_true, y_pred): 203 | pred = softmax(y_pred, self.clip_value) 204 | grad, hess = ce_grad(y_true, pred) 205 | return grad, hess 206 | 207 | def postprocess_output(self, y_pred): 208 | 209 | return softmax(y_pred, self.clip_value) 210 | 211 | def preprocess_input(self, y_true): 212 | return y_true[:, 0].astype(cp.int32) 213 | 214 | def get_metric_from_string(self, name=None): 215 | if name is None: 216 | return CrossEntropyMetric() 217 | return multiclass_metric_alias[name] 218 | 219 | 220 | loss_alias = { 221 | 222 | # for bce 223 | 'binary': BCELoss(), 224 | 'bce': BCELoss(), 225 | 'multilabel': BCELoss(), 226 | 'logloss': BCELoss(), 227 | 228 | # for multiclass 229 | 'multiclass': CrossEntropyLoss(), 230 | 'crossentropy': CrossEntropyLoss(), 231 | 232 | # for regression 233 | 'mse': MSELoss(), 234 | 'regression': MSELoss(), 235 | 'l2': MSELoss(), 236 | 'multitask': MSELoss(), 237 | 'msle': MSLELoss() 238 | 239 | } 240 | -------------------------------------------------------------------------------- /py_boost/gpu/losses/metrics.py: -------------------------------------------------------------------------------- 1 | """Common metrics""" 2 | 3 | try: 4 | import cupy as cp 5 | except Exception: 6 | pass 7 | 8 | 9 | class Metric: 10 | """Base class to define eval metric function. 11 | Metric could be defined in 2 ways: 12 | 13 | - redefine .error method. Preferred if possible. Simplified metric definition by calculating error function 14 | for each point (ex. see RMSEMetric). If metric is defined via .error it also could be used with AdvancedES 15 | 16 | - redefine __call__ method. Used for more complex functions, like ROC-AUC. Handling sample_weight 17 | should be done manually here if needed 18 | 19 | 20 | """ 21 | alias = 'score' # defines how metric will be named in the output log 22 | 23 | def error(self, y_true, y_pred): 24 | """Simplified metric definition via individual objects error 25 | 26 | Args: 27 | y_true: cp.array, targets 28 | y_pred: cp.array, predictions 29 | 30 | Returns: 31 | float, metric value 32 | """ 33 | raise ValueError('Pointwise error is not implemented for this metric') 34 | 35 | def __call__(self, y_true, y_pred, sample_weight=None): 36 | """Full metric definition. 
Default is just weighted aggregation of pointwise errors 37 | 38 | Args: 39 | y_true: cp.array, targets 40 | y_pred: cp.array, predictions 41 | sample_weight: None or cp.ndarray, weights 42 | 43 | Returns: 44 | float, metric value 45 | """ 46 | err = self.error(y_true, y_pred) 47 | shape = err.shape 48 | assert shape[0] == y_true.shape[0], 'Error shape should match target shape at first dim' 49 | 50 | if len(shape) == 1: 51 | err = err[:, cp.newaxis] 52 | 53 | if sample_weight is None: 54 | return err.mean() 55 | 56 | err = (err.mean(axis=1, keepdims=True) * sample_weight).sum() / sample_weight.sum() 57 | return err 58 | 59 | def compare(self, v0, v1): 60 | """Method defines how to decide if metric was improved. Commonly it should be one of 'v0 > v1' or 'v0 < v1 ' 61 | 62 | Args: 63 | v0: float, metric value 64 | v1: float, metric value 65 | 66 | Returns: 67 | bool, if v0 improves score against v1 68 | """ 69 | raise NotImplementedError 70 | 71 | def argmax(self, arr): 72 | """Select best metric from list of metrics based on .compare method 73 | 74 | Args: 75 | arr: list of float, metric values 76 | 77 | Returns: 78 | int, position of the best metric value 79 | """ 80 | best = arr[0] 81 | best_n = 0 82 | 83 | for n, val in enumerate(arr[1:], 1): 84 | if self.compare(val, best): 85 | best = val 86 | best_n = n 87 | 88 | return best_n 89 | 90 | 91 | class RMSEMetric(Metric): 92 | """RMSE Metric for the regression/multiregression task""" 93 | alias = 'rmse' 94 | 95 | def error(self, y_true, y_pred): 96 | return (y_true - y_pred) ** 2 97 | 98 | def __call__(self, y_true, y_pred, sample_weight=None): 99 | return super().__call__(y_true, y_pred, sample_weight) ** .5 100 | 101 | def compare(self, v0, v1): 102 | return v0 < v1 103 | 104 | 105 | class R2Score(RMSEMetric): 106 | """R2 Score Metric for the regression/multiregression task""" 107 | alias = 'R2_score' 108 | 109 | def __call__(self, y_true, y_pred, sample_weight=None): 110 | 111 | if sample_weight is not None: 112 | err = ((y_true - y_pred) ** 2 * sample_weight).sum(axis=0) / sample_weight.sum() 113 | std = ((y_true - y_true.mean(axis=0)) ** 2 * sample_weight).sum(axis=0) / sample_weight.sum() 114 | else: 115 | err = ((y_true - y_pred) ** 2).mean(axis=0) 116 | std = y_true.var(axis=0) 117 | 118 | return (1 - err / std).mean() 119 | 120 | def compare(self, v0, v1): 121 | return v0 > v1 122 | 123 | 124 | class RMSLEMetric(RMSEMetric): 125 | """RMSLE Metric for the regression/multiregression classification task""" 126 | alias = 'rmsle' 127 | 128 | def error(self, y_true, y_pred): 129 | return super().error(cp.log1p(y_true), cp.log1p(y_pred)) 130 | 131 | 132 | class BCEMetric(Metric): 133 | """LogLoss for the binary/multilabel classification task""" 134 | alias = 'BCE' 135 | 136 | def error(self, y_true, y_pred): 137 | return -cp.log(y_true * y_pred + (1 - y_pred) * (1 - y_true)) 138 | 139 | def compare(self, v0, v1): 140 | return v0 < v1 141 | 142 | 143 | def auc(y, x, sample_weight=None): 144 | """Roc-auc score via cupy 145 | 146 | Args: 147 | y: cp.ndarray, 1d prediction 148 | x: cp.ndarray, 1d binary target 149 | sample_weight: optional 1d array of sample weights 150 | 151 | Returns: 152 | float, roc-auc metric value 153 | """ 154 | unique_x = cp.unique(x) 155 | 156 | if unique_x.shape[0] <= 1: 157 | return 0.5 158 | 159 | if sample_weight is None: 160 | sample_weight = cp.ones_like(y) 161 | 162 | rank_x = cp.searchsorted(unique_x, x) 163 | 164 | sum_1 = cp.zeros_like(unique_x, dtype=cp.float64) 165 | sum_1.scatter_add(rank_x, 
sample_weight * y) 166 | 167 | sum_0 = cp.zeros_like(unique_x, dtype=cp.float64) 168 | sum_0.scatter_add(rank_x, sample_weight * (1 - y)) 169 | 170 | cs_0 = sum_0.cumsum() 171 | auc_ = (cs_0 - sum_0 / 2) * sum_1 172 | 173 | tot = cs_0[-1] * sum_1.sum() 174 | 175 | return float(auc_.sum() / tot) 176 | 177 | 178 | class RocAucMetric(Metric): 179 | """Roc-auc metric for the binary classification task""" 180 | alias = 'AUC' 181 | 182 | def __call__(self, y_true, y_pred, sample_weight=None): 183 | """ 184 | 185 | Args: 186 | y_true: cp.ndarray of targets 187 | y_pred: cp.ndarray of predictions 188 | sample_weight: None or cp.ndarray of sample_weights 189 | 190 | Returns: 191 | 192 | """ 193 | assert y_pred.shape[1] == 1, 'Multioutput is not supported' 194 | 195 | if sample_weight is not None: 196 | sample_weight = sample_weight[:, 0] 197 | 198 | return auc(y_true[:, 0], y_pred[:, 0], sample_weight) 199 | 200 | def compare(self, v0, v1): 201 | return v0 > v1 202 | 203 | 204 | class ThresholdMetrics(Metric): 205 | """Basic class to handle metrics, that accept class label prediction as input""" 206 | 207 | def __init__(self, threshold=0.5, q=None): 208 | """Define binarization rule. If quantile is given, threshold defined with quantile 209 | 210 | Args: 211 | threshold: float, threshold value 212 | q: None or float, quantile threshold 213 | """ 214 | self.threshold = threshold 215 | self.q = q 216 | 217 | def get_label(self, y_pred): 218 | """Get labels from probabilities 219 | 220 | Args: 221 | y_pred: cp.ndarray, predictions 222 | 223 | Returns: 224 | cp.ndarray, predicted class labels 225 | """ 226 | threshold = self.threshold 227 | if self.q is not None: 228 | threshold = cp.quantile(y_pred, self.q, axis=0, interpolation='higher') 229 | 230 | return y_pred >= threshold 231 | 232 | def get_stats(self, y_true, y_pred, sample_weight=None, mode='f1'): 233 | """Helpful utils to calc Precision/Recall/F1 234 | 235 | Args: 236 | y_true: cp.ndarray, target 237 | y_pred: cp.ndarray, predicted class label 238 | sample_weight: None or cp.ndarray, weights 239 | mode: 240 | 241 | Returns: 242 | 243 | """ 244 | 245 | y_pred = self.get_label(y_pred) 246 | true = y_pred == y_true 247 | 248 | tp = true * y_pred 249 | if sample_weight is not None: 250 | tp = tp * sample_weight 251 | tp = tp.sum(axis=0) 252 | 253 | if mode == 'p': 254 | if sample_weight is not None: 255 | return tp, (y_pred * sample_weight).sum(axis=0) 256 | return tp, y_pred.sum(axis=0) 257 | 258 | if sample_weight is not None: 259 | tot = (y_true * sample_weight).sum(axis=0) 260 | else: 261 | tot = y_true.sum(axis=0) 262 | if mode == 'r': 263 | return tp, tot 264 | 265 | if sample_weight is not None: 266 | tot_p = (y_pred * sample_weight).sum(axis=0) 267 | else: 268 | tot_p = y_pred.sum(axis=0) 269 | 270 | return tp, tot, tot_p 271 | 272 | def compare(self, v0, v1): 273 | return v0 > v1 274 | 275 | 276 | class AccuracyMetric(ThresholdMetrics): 277 | """Accuracy Metric for the binary/multilabel classification task""" 278 | alias = 'Accuracy' 279 | 280 | def error(self, y_true, y_pred): 281 | y_pred = self.get_label(y_pred) 282 | return (y_true == y_pred).mean(axis=1) 283 | 284 | 285 | class Precision(ThresholdMetrics): 286 | """Precision Metric for the binary/multilabel classification task""" 287 | alias = 'Precision' 288 | 289 | def __call__(self, y_true, y_pred, sample_weight=None): 290 | tp, tot = self.get_stats(y_true, y_pred, sample_weight, mode='p') 291 | tot = cp.clip(tot, 1e-5, None) 292 | return (tp / tot).mean() 293 | 294 | 295 | class 
Recall(ThresholdMetrics): 296 | """Recall Metric for the binary/multilabel classification task""" 297 | alias = 'Recall' 298 | 299 | def __call__(self, y_true, y_pred, sample_weight=None): 300 | tp, tot = self.get_stats(y_true, y_pred, sample_weight, mode='r') 301 | tot = cp.clip(tot, 1e-5, None) 302 | return (tp / tot).mean() 303 | 304 | 305 | class F1Score(ThresholdMetrics): 306 | """F1 Score Metric for the binary/multilabel classification task""" 307 | alias = 'F1_score' 308 | 309 | def __call__(self, y_true, y_pred, sample_weight=None): 310 | tp, tot, tot_p = self.get_stats(y_true, y_pred, sample_weight, mode='f1') 311 | precision = tp / cp.clip(tot_p, 1e-5, None) 312 | recall = tp / cp.clip(tot, 1e-5, None) 313 | 314 | return (2 * (precision * recall) / cp.clip(precision + recall, 1e-5, None)).mean() 315 | 316 | 317 | metric_alias = { 318 | 319 | # for bce 320 | 'bce': BCEMetric(), 321 | 'logloss': BCEMetric(), 322 | 323 | 'precision': Precision(), 324 | 'recall': Recall(), 325 | 'f1_score': F1Score(), 326 | 'f1': F1Score(), 327 | 328 | 'accuracy': AccuracyMetric(), 329 | 'acc': AccuracyMetric(), 330 | 331 | 'auc': RocAucMetric(), 332 | 'roc': RocAucMetric(), 333 | 334 | # for regression 335 | 'rmse': RMSEMetric(), 336 | 'l2': RMSEMetric(), 337 | 'rmsle': RMSLEMetric(), 338 | 'r2': R2Score(), 339 | 'r2_score': R2Score(), 340 | 341 | } 342 | -------------------------------------------------------------------------------- /py_boost/gpu/losses/multiclass_metrics.py: -------------------------------------------------------------------------------- 1 | """Common multiclass metrics""" 2 | 3 | try: 4 | import cupy as cp 5 | except Exception: 6 | pass 7 | 8 | from .metrics import Metric, metric_alias 9 | 10 | 11 | class CrossEntropyMetric(Metric): 12 | """CrossEntropy Metric for the multiclassification task""" 13 | alias = 'Crossentropy' 14 | 15 | def error(self, y_true, y_pred): 16 | return -cp.log(cp.take_along_axis(y_pred, y_true[:, cp.newaxis], axis=1)) 17 | 18 | def compare(self, v0, v1): 19 | return v0 < v1 20 | 21 | 22 | class MultiAccuracyMetric(Metric): 23 | """Accuracy Metric for the multiclassification task""" 24 | alias = 'Accuracy' 25 | 26 | def error(self, y_true, y_pred): 27 | cl_pred = y_pred.argmax(axis=1) 28 | return (cl_pred == y_true).astype(cp.float32) 29 | 30 | def compare(self, v0, v1): 31 | return v0 > v1 32 | 33 | 34 | class MultiMetric(Metric): 35 | """Basic class to handle metrics, that accept class label prediction as input for the multiclassificationn task""" 36 | 37 | def __init__(self, average='macro'): 38 | """ 39 | 40 | Args: 41 | average: str, one of 'micro' / 'macro' 42 | """ 43 | self.average = average 44 | 45 | @staticmethod 46 | def get_stats(y_true, y_pred, sample_weight=None, mode='f1'): 47 | """Helpful utils to calc Precision/Recall/F1 48 | 49 | Args: 50 | y_true: cp.ndarray, target 51 | y_pred: cp.ndarray, predicted class label 52 | sample_weight: None or cp.ndarray, weights 53 | mode: 54 | 55 | Returns: 56 | 57 | """ 58 | 59 | if sample_weight is None: 60 | sample_weight = cp.ones(y_true.shape, dtype=cp.float32) 61 | else: 62 | sample_weight = sample_weight[:, 0] 63 | 64 | cl_pred = y_pred.argmax(axis=1) 65 | true = y_true == cl_pred 66 | 67 | tp = cp.zeros(y_pred.shape[1], dtype=cp.float64) 68 | tp.scatter_add(cl_pred, true * sample_weight) 69 | 70 | tot = cp.zeros(y_pred.shape[1], dtype=cp.float64) 71 | if mode == 'p': 72 | tot.scatter_add(cl_pred, sample_weight) 73 | return tp, tot 74 | 75 | tot.scatter_add(y_true, sample_weight) 76 | if mode == 
'r': 77 | return tp, tot 78 | 79 | tot_p = cp.zeros(y_pred.shape[1], dtype=cp.float64) 80 | tot_p.scatter_add(cl_pred, sample_weight) 81 | 82 | return tp, tot, tot_p 83 | 84 | def get_metric(self, tp, tot): 85 | 86 | tot = cp.clip(tot, 1e-5, None) 87 | 88 | if self.average == 'micro': 89 | return float(tp.sum() / tot.sum()) 90 | 91 | return float((tp / tot).mean()) 92 | 93 | def compare(self, v0, v1): 94 | return v0 > v1 95 | 96 | 97 | class MultiPrecision(MultiMetric): 98 | """Precision Metric for the multiclassification classification task""" 99 | alias = 'Precision' 100 | 101 | def __call__(self, y_true, y_pred, sample_weight=None): 102 | tp, tot = self.get_stats(y_true, y_pred, sample_weight=sample_weight, mode='p') 103 | return self.get_metric(tp, tot) 104 | 105 | 106 | class MultiRecall(MultiMetric): 107 | """Recall Metric for the multiclassification classification task""" 108 | alias = 'Recall' 109 | 110 | def __call__(self, y_true, y_pred, sample_weight=None): 111 | tp, tot = self.get_stats(y_true, y_pred, sample_weight=sample_weight, mode='r') 112 | return self.get_metric(tp, tot) 113 | 114 | 115 | class MultiF1Score(MultiMetric): 116 | """F1 Score Metric for the multiclassification classification task""" 117 | alias = 'F1_score' 118 | 119 | def __call__(self, y_true, y_pred, sample_weight=None): 120 | tp, tot, tot_p = self.get_stats(y_true, y_pred, sample_weight=sample_weight, mode='f1') 121 | precision = self.get_metric(tp, tot_p) 122 | recall = self.get_metric(tp, tot) 123 | return 2 * (precision * recall) / (precision + recall) 124 | 125 | 126 | multiclass_metric_alias = {**metric_alias, **{ 127 | 128 | 'crossentropy': CrossEntropyMetric(), 129 | 130 | 'precision': MultiPrecision(), 131 | 'recall': MultiRecall(), 132 | 'f1_score': MultiF1Score(), 133 | 'f1': MultiF1Score(), 134 | 135 | 'accuracy': MultiAccuracyMetric(), 136 | 'acc': MultiAccuracyMetric(), 137 | 138 | }} 139 | -------------------------------------------------------------------------------- /py_boost/gpu/serialization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tqdm 3 | import ujson as json 4 | 5 | from .tree import Tree 6 | 7 | 8 | def nested_float_cast(arr): 9 | if type(arr[0]) is list: 10 | for i in range(len(arr)): 11 | arr[i] = nested_float_cast(arr[i]) 12 | 13 | else: 14 | for i in range(len(arr)): 15 | arr[i] = float(arr[i]) 16 | 17 | return arr 18 | 19 | 20 | def handle_float(arr): 21 | arr = arr.astype(str) 22 | arr = arr.tolist() 23 | arr = nested_float_cast(arr) 24 | 25 | return arr 26 | 27 | 28 | def parse_tree(tree): 29 | """Parse single tree 30 | 31 | Args: 32 | tree: Py-Boost Tree 33 | 34 | Returns: 35 | dict 36 | """ 37 | D = {} 38 | 39 | for key in tree.__dict__: 40 | 41 | value = tree.__dict__[key] 42 | if value is None: 43 | continue 44 | 45 | if type(value) is np.ndarray: 46 | 47 | if np.issubdtype(value.dtype, np.floating): 48 | value = handle_float(value) 49 | else: 50 | value = value.tolist() 51 | 52 | D[key] = value 53 | 54 | return D 55 | 56 | 57 | def parse_model(model): 58 | """Parse model 59 | 60 | Args: 61 | model: Py-Boost Ensemble 62 | 63 | Returns: 64 | dict 65 | """ 66 | model.to_cpu() 67 | 68 | D = {'base_score': handle_float(model.base_score)} 69 | 70 | for n, tree in enumerate(tqdm.tqdm(model.models)): 71 | D[n] = parse_tree(tree) 72 | 73 | return D 74 | 75 | 76 | def dump(model, file): 77 | """Parse model and save the results 78 | 79 | Args: 80 | model: Py-Boost Ensemble 81 | file: str, path to file 82 
| 83 | Returns: 84 | 85 | """ 86 | with open(file, 'w') as f: 87 | json.dump(parse_model(model), f) 88 | 89 | return 90 | 91 | 92 | attr_types = { 93 | 94 | 'values': np.float32, 95 | 'group_index': np.uint64, 96 | 'feature_importance_gain': np.float32, 97 | 'feature_importance_split': np.float32, 98 | 'test_format': np.float32, 99 | 'test_format_offsets': np.int32 100 | 101 | } 102 | 103 | 104 | def load_tree(D): 105 | """Create single tree from dict 106 | 107 | Args: 108 | D: dict 109 | 110 | Returns: 111 | Py-Boost Tree 112 | """ 113 | tree = Tree(1, 1, 1) 114 | # delete unused attrs 115 | for key in ['gains', 'feats', 'bin_splits', 'nans', 'split', 'val_splits', 'leaves']: 116 | setattr(tree, key, None) 117 | 118 | # set new attrs 119 | for key in D: 120 | value = D[key] 121 | 122 | if type(value) is list: 123 | value = np.asarray(value) 124 | 125 | if key in attr_types: 126 | value = value.astype(attr_types[key]) 127 | 128 | setattr(tree, key, value) 129 | 130 | return tree 131 | 132 | 133 | def load_model(D, model): 134 | """Update model data with dict values 135 | 136 | Args: 137 | D: dict 138 | model: Py-Boost Ensemble 139 | 140 | Returns: 141 | Py-Boost Ensemble 142 | """ 143 | model.base_score = np.asarray(D.pop('base_score')).astype(np.float32) 144 | 145 | trees = [None] * len(D) 146 | 147 | for key in D: 148 | trees[int(key)] = load_tree(D[key]) 149 | 150 | model.models = trees 151 | 152 | return model 153 | 154 | 155 | def load(model, file): 156 | """Read data from json and update Py-Boost model data 157 | 158 | Args: 159 | model: Py-Boost Ensemble 160 | file: str, file path 161 | 162 | Returns: 163 | Py-Boost Ensemble 164 | """ 165 | with open(file, 'r') as f: 166 | load_model(json.load(f), model) 167 | 168 | return model 169 | -------------------------------------------------------------------------------- /py_boost/gpu/sketch_boost.py: -------------------------------------------------------------------------------- 1 | """Implements SketchBoost for multioutput class""" 2 | 3 | from .boosting import GradientBoosting 4 | from ..multioutput.sketching import FilterSketch, TopOutputsSketch, SVDSketch, RandomSamplingSketch, \ 5 | RandomProjectionSketch 6 | 7 | 8 | class SketchBoost(GradientBoosting): 9 | """ 10 | Gradient Boosting with built-in FilterSketch to handle multioutput tasks. If single output is passed, 11 | it is handled as usual 12 | """ 13 | 14 | def __init__(self, loss, 15 | metric=None, 16 | ntrees=100, 17 | lr=0.05, 18 | min_gain_to_split=0, 19 | lambda_l2=1, 20 | gd_steps=1, 21 | max_depth=6, 22 | min_data_in_leaf=10, 23 | colsample=1., 24 | subsample=1., 25 | 26 | quantization='Quantile', 27 | quant_sample=2000000, 28 | max_bin=256, 29 | min_data_in_bin=3, 30 | 31 | es=100, 32 | seed=42, 33 | verbose=10, 34 | 35 | sketch_outputs=1, 36 | sketch_method='proj', 37 | use_hess=False, 38 | 39 | callbacks=None, 40 | sketch_params=None 41 | ): 42 | """ 43 | 44 | Args: 45 | loss: str or Loss, loss function 46 | metric: None or str or Metric, metric 47 | ntrees: int, maximum number of trees 48 | lr: float, learning rate 49 | min_gain_to_split: float >=0, minimal gain to split 50 | lambda_l2: float > 0, l2 leaf regularization 51 | gd_steps: int > 0, number of gradient steps 52 | max_depth: int > 0, maximum tree depth. Setting it to large values (>12) may cause OOM for wide datasets 53 | min_data_in_leaf: int, minimal leaf size. 
Note - for some loss fn leaf size is approximated 54 | with hessian values to speed up training 55 | colsample: float or Callable, sumsample of columns to construct trees or callable - custom sampling 56 | subsample: float or Callable, sumsample of rows to construct trees or callable - custom sampling 57 | 58 | quantization: str or Quantizer, method for quantizatrion. One of 'Quantile', 'Uniform', 59 | 'Uniquant' or custom implementation 60 | quant_sample: int, subsample to quantize features 61 | max_bin: int in [2, 256] maximum number of bins to quantize features 62 | min_data_in_bin: int in [2, 256] minimal bin size. NOTE: currently ignored 63 | 64 | es: int, early stopping rounds. If 0, no early stopping 65 | seed: int, random state 66 | verbose: int, verbosity freq 67 | sketch_outputs: int, number of outputs to keep 68 | sketch_method: str, name of the sketching strategy 69 | use_hess: bool, use hessians in multioutput training 70 | callbacks: list of Callback, callbacks to customize training are passed here 71 | sketch_params: dict, optional kwargs for sketching strategy 72 | """ 73 | if sketch_params is None: 74 | sketch_params = {} 75 | 76 | if sketch_method == 'filter': 77 | sketch = FilterSketch(sketch_outputs, **sketch_params) 78 | 79 | elif sketch_method == 'svd': 80 | sketch = SVDSketch(sketch_outputs, **sketch_params) 81 | 82 | elif sketch_method == 'topk': 83 | sketch = TopOutputsSketch(sketch_outputs) 84 | 85 | elif sketch_method == 'rand': 86 | sketch = RandomSamplingSketch(sketch_outputs, **sketch_params) 87 | 88 | elif sketch_method == 'proj': 89 | sketch = RandomProjectionSketch(sketch_outputs, **sketch_params) 90 | 91 | elif sketch_method is None: 92 | sketch = None 93 | 94 | else: 95 | raise ValueError('Unknown sketching strategy') 96 | 97 | super().__init__(loss=loss, 98 | metric=metric, 99 | ntrees=ntrees, 100 | lr=lr, 101 | min_gain_to_split=min_gain_to_split, 102 | lambda_l2=lambda_l2, 103 | gd_steps=gd_steps, 104 | max_depth=max_depth, 105 | min_data_in_leaf=min_data_in_leaf, 106 | colsample=colsample, 107 | subsample=subsample, 108 | 109 | quantization=quantization, 110 | quant_sample=quant_sample, 111 | max_bin=max_bin, 112 | min_data_in_bin=min_data_in_bin, 113 | 114 | target_splitter='Single', 115 | multioutput_sketch=sketch, 116 | use_hess=use_hess, 117 | es=es, 118 | seed=seed, 119 | verbose=verbose, 120 | callbacks=callbacks) 121 | -------------------------------------------------------------------------------- /py_boost/gpu/tree.py: -------------------------------------------------------------------------------- 1 | """Decision trees building and inference""" 2 | 3 | try: 4 | import cupy as cp 5 | except Exception: 6 | pass 7 | import numpy as np 8 | 9 | from .utils import apply_values, depthwise_grow_tree, get_tree_node, set_leaf_values, calc_node_values 10 | from .utils import tree_prediction_leaves_typed_kernels, tree_prediction_leaves_typed_kernels_f 11 | from .utils import tree_prediction_values_kernel 12 | 13 | 14 | class Tree: 15 | """This class initializes an empty tree structure, implements methods to set tree values and single tree inference. 16 | The instance of this object represents the actual boosting step, but not the single tree! 17 | Actual amount of trees in the instance (at each boosting step) is defined by ngroups argument. What it means: 18 | Assume you have 5 class classification task, so you model output size equals 5. Possible cases here: 19 | - Build single decision tree that outputs a vector of 5 values. In this case ngroups eq. 
1 20 | - Build 5 decision trees, each tree predict a value for its own class (one-vs-all). 21 | In this case ngroups eq. 5 22 | - Create custom target split strategy. For ex. you can build 2 trees, first will predict [0, 1, 2] classes, 23 | second - [3, 4]. In this case ngroups eq. 2 24 | 25 | Grouped trees structure is defined by arrays: 26 | feats, shape (ngroups, max_nodes) - feature index to use for the split in each group/node. 27 | If -1, the node is terminal (leaf) 28 | val_splits, shape (ngroups, max_nodes) - threshold to compare when choosing the next node 29 | if feature value is not NaN 30 | nans, shape (ngroups, max_nodes) - bool values, if True, NaN feature values objects moves left, else - right 31 | split, shape (ngroups, max_nodes, 2) - node indices corresponding left/right split for the current node 32 | 33 | Trees structure defines single node id value for each object 34 | Values assigned to the outputs are defined by arrays: 35 | group_index, shape (nout, ). Defines the group id for predicting each output 36 | values, shape (max_nodes, nout). Define output value for each node/output 37 | leaves, shape (max_leaves, ngroups). Assigns the leaf index to the terminal nodes 38 | 39 | During the fit stage, the format described above is used. 40 | After fitting, additional reformatting occurs that converts the tree to another format to achieve faster inference: 41 | - Sub-trees for each group are stored in one array named "test_format": 42 | [gr0_node0, ..., gr0_nodeN, gr1_node0, ..., gr1_nodeM, gr2_node0, ..., gr2_nodeK, gr3_node0, ...] 43 | - Each node in new formatted tree consists of 4 fields: 44 | [feature_index, split_value, left_node_index, right_node_index], 45 | feature_index - feature index to use for the split in each node. 46 | split_value - threshold to compare when choosing the next node if feature value is not NaN 47 | left_node_index - index of the left child in "test_format" array 48 | right_node_index - index of the right child in "test_format" array 49 | - The size of "test_format" array equals to the sum of all nodes in all subtrees except leaves multiplied by 4. 50 | Multiplication by 4 occurs because each node consists of the 4 fields described above. 51 | Examples: 52 | test_format[0 * 4] == test_format[0] - yields feature_index for node with index 0. 53 | test_format[0 * 4 + 1] == test_format[1] - yields split_value for node with index 0. 54 | test_format[0 * 4 + 2] == test_format[2] - yields left_node_index for node with index 0. 55 | test_format[0 * 4 + 3] == test_format[3] - yields right_node_index for node with index 0. 56 | test_format[1 * 4] == test_format[4] - yields feature_index for node with index 1. 57 | test_format[1 * 4 + 1] == test_format[5] - yields split_value for node with index 1. 58 | test_format[1 * 4 + 2] == test_format[6] - yields left_node_index for node with index 1. 59 | test_format[1 * 4 + 3] == test_format[7] - yields right_node_index for node with index 1. 60 | test_format[2 * 4] == test_format[8] - yields feature_index for node with index 2. 61 | ... 62 | test_format[79 * 4] == test_format[316] - yields feature_index for node with index 79. 63 | test_format[79 * 4 + 1] == test_format[317] - yields split_value for node with index 79. 64 | test_format[79 * 4 + 2] == test_format[318] - yields left_node_index for node with index 79. 65 | test_format[79 * 4 + 3] == test_format[319] - yields right_node_index for node with index 79. 66 | ... 
67 | - The sign of the feature_index value shows the behavior in case of feature == NaN (split to the left/right), 68 | to the value written in feature_index an extra "1" is added to deal with zero. 69 | Examples: 70 | feature_index == 8, positive value means that tree follows to the left in case of NaN in feature, 71 | the real feature index is calculated as follows: abs(8) - 1 = 7. 72 | feature_index == -19, negative value means that tree follows to the right in case of NaN in feature, 73 | the real feature index is calculated as follows: abs(-19) - 1 = 18. 74 | feature_index == 0, impossible due to construction algorithm. 75 | - If left_node_index/right_node_index is negative, it means that it shows index in the values array; 76 | In case of a negative value, an extra "1" is added to deal with zero. 77 | Examples: 78 | left_node_index == 8, non-negative value means that left child node is stored in "test_format" with index 8; 79 | left_node_index == -13, means that left child is a leaf, the index in "values" array for that leaf can 80 | be calculated as follows: abs(-13) - 1 = 12. Thus, index in "values" array is 12. 81 | - All subtrees are stored in one array, so an additional array of indexes where each subtree is starting 82 | is required (index of the subtree roots), array "gr_subtree_offsets" stores these indexes, 83 | size of "gr_subtree_offsets" equals to number of groups in the tree (number of subtrees). 84 | Example: 85 | gr_subtree_offsets == [0, 56, 183], means that tree has 3 subtrees (3 groups). 86 | The first subtree has its root as node with index 0; 87 | The second subtree has its root as node with index 56; 88 | The third subtree has its root as node with index 183. 89 | Example how to access the values of the root node in the second subtree: 90 | test_format[56 * 4] == test_format[224] - yields feature_index for the root of the second subtree; 91 | test_format[56 * 4 + 1] == test_format[225] - yields split_value for the root of the second subtree; 92 | test_format[56 * 4 + 2] == test_format[226] - yields left_node_index for the root of the second subtree; 93 | test_format[56 * 4 + 3] == test_format[227] - yields right_node_index for the root of the second subtree 94 | - Two fields, 'feature_importance_gain' and 'feature_importance_split', store feature importance arrays 95 | and describe the fitted tree accordingly. 
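Illustrative traversal sketch (added for documentation; not a method of this class). A hypothetical
pure-numpy helper that decodes one row for one group, assuming .to_cpu() has been called; the '<='
comparison is an assumption made for illustration - the authoritative split/NaN convention is the one
implemented by the inference kernels in utils.py:

    def trace_leaf(tree, x, g):
        node = int(tree.test_format_offsets[g])           # root of the g-th subtree
        while True:
            f = tree.test_format[4 * node]                # signed feature index (+1 offset)
            feat = int(abs(f)) - 1                        # real feature index
            if np.isnan(x[feat]):
                go_left = f > 0                           # positive sign: NaN goes left
            else:
                go_left = x[feat] <= tree.test_format[4 * node + 1]
            child = tree.test_format[4 * node + 2] if go_left else tree.test_format[4 * node + 3]
            if child < 0:                                 # negative child encodes a leaf
                return int(abs(child)) - 1                # index into the "values" array
            node = int(child)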
96 | """ 97 | 98 | def __init__(self, max_nodes, nout, ngroups): 99 | """Initialize empty tree 100 | 101 | Args: 102 | max_nodes: int, maximum number of nodes in tree 103 | nout: int, number of outputs in tree 104 | ngroups: int, number of groups 105 | """ 106 | self.nout = nout 107 | self.ngroups = ngroups 108 | self.max_nodes = max_nodes 109 | 110 | self.gains = np.zeros((ngroups, max_nodes,), dtype=np.float32) 111 | self.feats = np.zeros((ngroups, max_nodes,), dtype=np.int64) - 1 112 | self.bin_splits = np.zeros((ngroups, max_nodes,), dtype=np.int32) 113 | self.nans = np.zeros((ngroups, max_nodes,), dtype=np.bool_) 114 | 115 | self.split = np.zeros((ngroups, max_nodes, 2), dtype=np.int32) 116 | 117 | self.val_splits = None 118 | self.values = None 119 | self.group_index = None 120 | self.leaves = None 121 | self.max_leaves = None 122 | 123 | self.feature_importance_gain = None 124 | self.feature_importance_split = None 125 | 126 | self._debug = None 127 | self.test_format = None 128 | self.test_format_offsets = None 129 | 130 | def set_nodes(self, group, unique_nodes, new_nodes_id, best_feat, best_gain, best_split, best_nan_left): 131 | """Write info about new nodes 132 | 133 | Args: 134 | group: int, group id to write 135 | unique_nodes: np.ndarray, nodes id to set info 136 | new_nodes_id: np.ndarray, nodes id to left/right split current node 137 | best_feat: np.ndarray, feature value to perform a split 138 | best_gain: np.ndarray, gain from the split 139 | best_split: np.ndarray, quantized threshold to compare when split 140 | best_nan_left: np.ndarray, bool if True, nans moved in the left node, else right 141 | 142 | Returns: 143 | 144 | """ 145 | 146 | self.gains[group, unique_nodes] = best_gain 147 | self.feats[group, unique_nodes] = best_feat 148 | self.bin_splits[group, unique_nodes] = best_split 149 | self.nans[group, unique_nodes] = best_nan_left 150 | self.split[group, unique_nodes] = new_nodes_id 151 | 152 | def set_node_values(self, values, group_index): 153 | """Assign output values for each nodes 154 | 155 | Args: 156 | values: np.ndarray, node values 157 | group_index: np.ndarray, group id of each output 158 | 159 | Returns: 160 | 161 | """ 162 | self.values = values 163 | self.group_index = group_index 164 | 165 | def set_borders(self, borders): 166 | """Assign actual feature values based on quantized 167 | 168 | Args: 169 | borders: list of np.ndarray, actual node values 170 | 171 | Returns: 172 | 173 | """ 174 | # borders - list of arrays. 
Array is borderlines 175 | val_splits = [0 if x == -1 else borders[x][min(y, len(borders[x]) - 1)] 176 | for (x, y) in zip(self.feats.ravel(), self.bin_splits.ravel())] 177 | self.val_splits = np.array(val_splits, dtype=np.float32).reshape(self.feats.shape) 178 | 179 | def set_leaves(self): 180 | """Assign leaf id to the terminal nodes 181 | 182 | Returns: 183 | 184 | """ 185 | self.leaves, self.max_leaves = set_leaf_values(self.feats, self.split) 186 | 187 | def to_device(self): 188 | """Move tree data to the current GPU memory 189 | 190 | Returns: 191 | 192 | """ 193 | for attr in ['gains', 'feats', 'bin_splits', 'nans', 'split', 'val_splits', 'values', 'group_index', 'leaves', 194 | 'test_format', 'test_format_offsets', 'feature_importance_gain', 'feature_importance_split']: 195 | arr = getattr(self, attr) 196 | 197 | if type(arr) is np.ndarray: 198 | setattr(self, attr, cp.asarray(arr)) 199 | 200 | def to_cpu(self): 201 | """Move tree data to the CPU memory 202 | 203 | Returns: 204 | 205 | """ 206 | for attr in ['gains', 'feats', 'bin_splits', 'nans', 'split', 'val_splits', 'values', 'group_index', 'leaves', 207 | 'test_format', 'test_format_offsets', 'feature_importance_gain', 'feature_importance_split']: 208 | arr = getattr(self, attr) 209 | 210 | if type(arr) is cp.ndarray: 211 | setattr(self, attr, arr.get()) 212 | 213 | def _predict_node_deprecated(self, X): 214 | """(DEPRECATED) Predict node id from the feature matrix X 215 | 216 | Args: 217 | X: cp.ndarray of features 218 | 219 | Returns: 220 | 221 | """ 222 | if self.feats is None: 223 | raise Exception('To use _deprecated funcs pass debug=True to .reformat') 224 | 225 | assert type(self.feats) is cp.ndarray, 'Should be moved to GPU first. Call .to_device()' 226 | nodes = get_tree_node(X, self.feats, self.val_splits, self.split, self.nans) 227 | return nodes 228 | 229 | def _predict_from_nodes_deprecated(self, nodes): 230 | """(DEPRECATED) Predict outputs from the nodes indices 231 | 232 | Args: 233 | nodes: cp.ndarray of predicted nodes 234 | 235 | Returns: 236 | cp.ndarray of nodes 237 | """ 238 | return apply_values(nodes, self.group_index, self.values) 239 | 240 | def _predict_leaf_from_nodes_deprecated(self, nodes): 241 | """Predict leaf indices from the nodes indices (Use predict_leaf() method if you need to predict leaves) 242 | 243 | Args: 244 | nodes: cp.ndarray of predicted nodes 245 | 246 | Returns: 247 | cp.ndarray of leaves 248 | """ 249 | return apply_values(nodes, cp.arange(self.ngroups, dtype=cp.uint64), self.leaves) 250 | 251 | def _predict_deprecated(self, X): 252 | """(DEPRECATED) Predict from the feature matrix X 253 | 254 | Args: 255 | X: cp.ndarray of features 256 | 257 | Returns: 258 | cp.ndarray of predictions 259 | """ 260 | return self._predict_from_nodes_deprecated( 261 | self._predict_leaf_from_nodes_deprecated(self._predict_node_deprecated(X))) 262 | 263 | def _predict_leaf_deprecated(self, X): 264 | """(DEPRECATED) Predict leaf indices from the feature matrix X 265 | 266 | Args: 267 | X: cp.ndarray of features 268 | 269 | Returns: 270 | cp.ndarray of leaves 271 | """ 272 | return self._predict_leaf_from_nodes_deprecated(self._predict_node_deprecated(X)) 273 | 274 | def predict_leaf(self, X, pred_leaves=None): 275 | """Predict leaf indexes from the feature matrix X 276 | 277 | Args: 278 | X: cp.ndarray, array of features 279 | pred_leaves: cp.ndarray, buffer for predictions 280 | 281 | Returns: 282 | pred_leaves: leaf predictions 283 | 284 | """ 285 | # check if buffer is None and X on GPU 286 | 
assert type(X) is cp.ndarray, "X must be type of cp.ndarray (located on gpu)" 287 | 288 | dt = str(X.dtype) 289 | 290 | assert dt in tree_prediction_leaves_typed_kernels, \ 291 | f"X array must be of type: {list(tree_prediction_leaves_typed_kernels.keys())}" 292 | 293 | if pred_leaves is None: 294 | pred_leaves = cp.empty((X.shape[0], self.ngroups), dtype=cp.int32) 295 | 296 | # CUDA parameters initialization 297 | threads = 128 # threads in one CUDA block 298 | sz = X.shape[0] * self.ngroups 299 | blocks = sz // threads 300 | if sz % threads != 0: 301 | blocks += 1 302 | 303 | if X.flags["C_CONTIGUOUS"]: 304 | tree_prediction_leaves_typed_kernels[dt]((blocks,), (threads,), ((X, 305 | self.test_format, 306 | self.test_format_offsets, 307 | X.shape[1], 308 | X.shape[0], 309 | self.ngroups, 310 | pred_leaves.shape[1], 311 | pred_leaves))) 312 | elif X.flags["F_CONTIGUOUS"]: 313 | tree_prediction_leaves_typed_kernels_f[dt]((blocks,), (threads,), ((X, 314 | self.test_format, 315 | self.test_format_offsets, 316 | X.shape[1], 317 | X.shape[0], 318 | self.ngroups, 319 | pred_leaves.shape[1], 320 | pred_leaves))) 321 | else: 322 | raise Exception("X must be 'C_CONTIGUOUS' or 'F_CONTIGUOUS'") 323 | return pred_leaves 324 | 325 | def predict(self, X, pred=None, pred_leaves=None): 326 | """Predict from the feature matrix X 327 | 328 | Args: 329 | X: cp.ndarray, array of features 330 | pred: cp.ndarray, buffer for predictions on GPU, if None - created automatically 331 | pred_leaves: cp.ndarray, buffer for internal leaf predictions on GPU, if None - created automatically 332 | 333 | Returns: 334 | pred: cp.ndarray, prediction array 335 | 336 | """ 337 | # check if buffers are None 338 | if pred is None: 339 | pred = cp.zeros((X.shape[0], self.nout), dtype=cp.float32) 340 | if pred_leaves is None: 341 | pred_leaves = cp.empty((X.shape[0], self.ngroups), dtype=cp.int32) 342 | 343 | # first step - leaves predictions, actually prediction of indexes in values 344 | self.predict_leaf(X, pred_leaves) 345 | 346 | # CUDA parameters initialization 347 | threads = 128 # threads in one CUDA block 348 | sz = X.shape[0] * self.nout 349 | blocks = sz // threads 350 | if sz % threads != 0: 351 | blocks += 1 352 | 353 | # second step, prediction of actual values 354 | tree_prediction_values_kernel((blocks,), (threads,), ((pred_leaves, 355 | self.group_index, 356 | self.values, 357 | self.nout, 358 | X.shape[0], 359 | pred_leaves.shape[1], 360 | pred))) 361 | return pred 362 | 363 | def reformat(self, nfeats, debug): 364 | """Creates new internal format of the tree for faster inference 365 | 366 | Args: 367 | nfeats: int, number of features in X (train set) 368 | debug: bool, if in debug mode 369 | 370 | Returns: 371 | 372 | """ 373 | n_gr = self.ngroups 374 | 375 | # memory allocation for new tree array 376 | gr_subtree_offsets = np.zeros(n_gr, dtype=np.int32) 377 | check_empty = [] 378 | total_size = 0 379 | for i in range(n_gr): 380 | curr_size = int((self.feats[i] >= 0).sum()) 381 | # add special case handling - single leaf, no splits 382 | check_empty.append(curr_size == 0) 383 | curr_size = max(1, curr_size) 384 | total_size += curr_size 385 | 386 | if i < n_gr - 1: 387 | gr_subtree_offsets[i + 1] = total_size 388 | nf = np.zeros(total_size * 4, dtype=np.float32) 389 | 390 | # reformatting the tree 391 | for i in range(n_gr): 392 | # handle special case - single leaf, no splits - make a pseudo split node 393 | if check_empty[i]: 394 | nf[4 * gr_subtree_offsets[i]] = 1. 395 | nf[4 * gr_subtree_offsets[i] + 1] = 0. 
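# (descriptive note) this pseudo split uses feature 0 (stored as +1, so NaN goes left) with a 0.
# threshold; the two assignments below point both children to leaf 0 (stored as -1), so every row
# of this group ends up in the same single leaf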
396 | nf[4 * gr_subtree_offsets[i] + 2] = -1. 397 | nf[4 * gr_subtree_offsets[i] + 3] = -1. 398 | 399 | continue 400 | 401 | q = [(0, 0)] 402 | 403 | while len(q) != 0: # BFS in tree 404 | n_old, n_new = q[0] 405 | if not self.nans[i][n_old]: 406 | nf[4 * (gr_subtree_offsets[i] + n_new)] = float(self.feats[i][n_old] + 1) 407 | else: 408 | nf[4 * (gr_subtree_offsets[i] + n_new)] = float(-(self.feats[i][n_old] + 1)) 409 | nf[4 * (gr_subtree_offsets[i] + n_new) + 1] = float(self.val_splits[i][n_old]) 410 | ln = self.split[i][n_old][0] 411 | rn = self.split[i][n_old][1] 412 | 413 | if self.feats[i][ln] < 0: 414 | nf[4 * (gr_subtree_offsets[i] + n_new) + 2] = float(-(self.leaves[ln][i] + 1)) 415 | else: 416 | new_node_number = q[-1][1] + 1 417 | nf[4 * (gr_subtree_offsets[i] + n_new) + 2] = float(new_node_number) 418 | q.append((ln, new_node_number)) 419 | 420 | if self.feats[i][rn] < 0: 421 | nf[4 * (gr_subtree_offsets[i] + n_new) + 3] = float(-(self.leaves[rn][i] + 1)) 422 | else: 423 | new_node_number = q[-1][1] + 1 424 | nf[4 * (gr_subtree_offsets[i] + n_new) + 3] = float(new_node_number) 425 | q.append((rn, new_node_number)) 426 | q.pop(0) 427 | 428 | self.test_format = nf 429 | self.test_format_offsets = gr_subtree_offsets 430 | 431 | # feature_ importance with gain 432 | self.feature_importance_gain = np.zeros(nfeats, dtype=np.float32) 433 | sl = self.feats >= 0 434 | np.add.at(self.feature_importance_gain, self.feats[sl], self.gains[sl]) 435 | 436 | # feature_ importance with split 437 | self.feature_importance_split = np.zeros(nfeats, dtype=np.float32) 438 | sl = self.feats >= 0 439 | np.add.at(self.feature_importance_split, self.feats[sl], 1) 440 | 441 | if not debug: 442 | for attr in ['gains', 'feats', 'bin_splits', 'nans', 'split', 'val_splits', 'leaves']: 443 | setattr(self, attr, None) 444 | 445 | 446 | class DepthwiseTreeBuilder: 447 | """This class builds decision tree with given parameters""" 448 | 449 | def __init__(self, borders, 450 | use_hess=True, 451 | colsampler=None, 452 | subsampler=None, 453 | target_splitter=None, 454 | multioutput_sketch=None, 455 | gd_steps=1, 456 | **tree_params 457 | ): 458 | """ 459 | 460 | Args: 461 | borders: list of np.ndarray, actual split borders for quantized features 462 | colsampler: Callable or None, column sampling strategy 463 | subsampler: Callable or None, rows sampling strategy 464 | target_splitter: Callable or None, target grouping strategy 465 | multioutput_sketch: Callable or None, multioutput sketching strategy 466 | **tree_params: other tree building parameters 467 | """ 468 | self.borders = borders 469 | self.use_hess = use_hess 470 | self.params = {**{ 471 | 472 | 'lr': 1., 473 | 'lambda_l2': .01, 474 | 'max_bin': 256, 475 | 'max_depth': 6, 476 | 'min_data_in_leaf': 10, 477 | 'min_gain_to_split': 0 478 | }, **tree_params} 479 | 480 | self.colsampler = colsampler 481 | self.subsampler = subsampler 482 | self.target_grouper = target_splitter 483 | self.multioutput_sketch = multioutput_sketch 484 | self.gd_steps = gd_steps 485 | 486 | def build_tree(self, X, grad, hess, sample_weight=None, grad_fn=None, *val_arrays): 487 | """Build tree and return nodes/values predictions for train and validation sets 488 | 489 | Args: 490 | X: cp.ndarray, quantized feature matrix 491 | grad: cp.ndarray, gradient matrix 492 | hess: cp.ndarray, hessian matrix 493 | sample_weight: cp.ndarray or None, sample's weights 494 | grad_fn: gradient fn 495 | *val_arrays: list of cp.ndarray, list of quantized features for validation sets 496 | 497 | 
Returns: 498 | tree, Tree, constructed tree 499 | nodes_group, cp.ndarray, nodes id for the train set 500 | pred, cp.ndarray, prediction for the train set 501 | valid_nodes_group, list of cp.ndarray, list of predicted nodes for valid sets 502 | val_preds, list of cp.ndarray, list of predictions for valid sets 503 | """ 504 | if self.colsampler is None: 505 | col_indexer = cp.arange(X.shape[1], dtype=cp.uint64) 506 | else: 507 | col_indexer = self.colsampler() 508 | 509 | if self.subsampler is None: 510 | row_indexer = cp.arange(X.shape[0], dtype=cp.uint64) 511 | else: 512 | row_indexer = self.subsampler() 513 | 514 | if self.target_grouper is None: 515 | output_groups = [cp.arange(grad.shape[1], dtype=cp.uint64)] 516 | else: 517 | output_groups = self.target_grouper() 518 | 519 | if sample_weight is not None: 520 | grad = grad * sample_weight 521 | hess = hess * sample_weight 522 | 523 | max_nodes = int((2 ** np.arange(self.params['max_depth'] + 1)).sum()) 524 | tree = Tree(max_nodes, grad.shape[1], len(output_groups)) 525 | 526 | nodes_group = cp.empty((grad.shape[0], len(output_groups)), dtype=cp.int32) 527 | valid_nodes_group = [cp.empty((x.shape[0], len(output_groups)), dtype=cp.int32) for x in val_arrays] 528 | 529 | group_index = cp.zeros(grad.shape[1], dtype=cp.uint64) 530 | 531 | for n_grp, grp_indexer in enumerate(output_groups): 532 | G = grad[:, grp_indexer] 533 | # if output group len eq. 1, we have single output tree, use hess for structure search 534 | if G.shape[1] == 1: 535 | H = hess if hess.shape[1] == 1 else hess[:, grp_indexer] 536 | # else we can decide: should we use hess for tree structure search or 537 | # assume hess eq. sample weight for all outputs, and then we can use proxy for tree structure search 538 | else: 539 | if self.use_hess: 540 | H = hess[:, grp_indexer] 541 | else: 542 | H = sample_weight if sample_weight is not None else cp.ones((G.shape[0], 1), dtype=cp.float32) 543 | if self.multioutput_sketch is not None: 544 | G, H = self.multioutput_sketch(G, H) 545 | 546 | group_index[grp_indexer] = n_grp 547 | # grow single group of the tree and get nodes index 548 | train_nodes, valid_nodes = depthwise_grow_tree(tree, n_grp, X, G, H, 549 | row_indexer, col_indexer, self.params, 550 | valid_arrs=val_arrays) 551 | # update nodes group 552 | nodes_group[:, n_grp] = train_nodes 553 | for vn, vp in zip(valid_nodes_group, valid_nodes): 554 | vn[:, n_grp] = vp 555 | 556 | # transform nodes to leaves 557 | tree.set_leaves() 558 | leaves_idx, max_leaves, leaves_grp = cp.asarray(tree.leaves, dtype=cp.int32), tree.max_leaves, \ 559 | cp.arange(len(output_groups), dtype=cp.uint64) 560 | 561 | leaves = apply_values(nodes_group, leaves_grp, leaves_idx) 562 | val_leaves = [apply_values(x, leaves_grp, leaves_idx) for x in valid_nodes_group] 563 | 564 | # perform multiple grad steps 565 | values = calc_node_values(grad, hess, leaves, row_indexer, group_index, max_leaves, self.params['lr'], 566 | lambda_l2=self.params['lambda_l2']) 567 | pred = apply_values(leaves, group_index, values) 568 | 569 | tree.set_borders(self.borders) 570 | 571 | for i in range(1, self.gd_steps): 572 | grad, hess = grad_fn(pred) 573 | values += calc_node_values(grad, hess, leaves, row_indexer, group_index, max_leaves, self.params['lr'], 574 | lambda_l2=self.params['lambda_l2']) 575 | pred = apply_values(leaves, group_index, values) 576 | 577 | # transform leaves to values 578 | val_preds = [apply_values(x, group_index, values) for x in val_leaves] 579 | tree.set_node_values(values.get(), 
group_index.get()) 580 | 581 | return tree, leaves, pred, val_leaves, val_preds 582 | -------------------------------------------------------------------------------- /py_boost/multioutput/__init__.py: -------------------------------------------------------------------------------- 1 | """Provides tools to multioutput models handling""" 2 | -------------------------------------------------------------------------------- /py_boost/multioutput/sketching.py: -------------------------------------------------------------------------------- 1 | """Defines sketching strategies to simplify multioutput scoring function calculation""" 2 | 3 | try: 4 | import cupy as cp 5 | except Exception: 6 | pass 7 | 8 | try: 9 | from cuml import TruncatedSVD 10 | except ImportError: 11 | pass 12 | 13 | from ..callbacks.callback import Callback 14 | 15 | 16 | class GradSketch(Callback): 17 | """Basic class for sketching strategy. 18 | It should implement __call__ method. 19 | """ 20 | 21 | def __call__(self, grad, hess): 22 | """Method receive raw grad and hess matrices and output new ones that will be used in the tree structure search 23 | 24 | Args: 25 | grad: cp.ndarray, gradients 26 | hess: cp.ndarray, hessians 27 | 28 | Returns: 29 | cp.ndarray, sketched grad 30 | cp.ndarray, sketched hess 31 | """ 32 | return grad, hess 33 | 34 | 35 | class TopOutputsSketch(GradSketch): 36 | """TopOutputs sketching. Use only gradient columns with the highest L2 norm""" 37 | 38 | def __init__(self, topk=1): 39 | """ 40 | 41 | Args: 42 | topk: int, top outputs to use 43 | """ 44 | self.topk = topk 45 | 46 | def __call__(self, grad, hess): 47 | best_idx = (grad ** 2).mean(axis=0).argsort()[-self.topk:] 48 | grad = grad[:, best_idx] 49 | 50 | if hess.shape[1] > 1: 51 | hess = hess[:, best_idx] 52 | 53 | return grad, hess 54 | 55 | 56 | class SVDSketch(GradSketch): 57 | """SVD Sketching. Truncated SVD is used to reduce grad dimensions.""" 58 | 59 | def __init__(self, sample=None, **svd_params): 60 | """ 61 | 62 | Args: 63 | sample: int, subsample to speed up SVD fitting 64 | **svd_params: dict, SVD params, see cuml.TruncatedSVD docs 65 | """ 66 | self.svd_params = {**{'algorithm': 'jacobi', 'n_components': 5, 'n_iter': 5}, **svd_params} 67 | self.sample = sample 68 | self.svd = None 69 | 70 | def before_train(self, build_info): 71 | self.svd = TruncatedSVD(output_type='cupy', **self.svd_params) 72 | 73 | def __call__(self, grad, hess): 74 | 75 | sub_grad = grad 76 | if (self.sample is not None) and (grad.shape[0] > self.sample): 77 | idx = cp.arange(grad.shape[0], dtype=cp.int32) 78 | cp.random.shuffle(idx) 79 | sub_grad = grad[idx[:self.sample]] 80 | 81 | self.svd.fit(sub_grad) 82 | grad = self.svd.transform(grad) 83 | 84 | if hess.shape[1] > 1: 85 | hess = self.svd.transform(hess) 86 | hess = cp.clip(hess, 0.01, None) 87 | 88 | return grad, hess 89 | 90 | def after_iteration(self, build_info): 91 | """Free memory to avoid OOM. 92 | 93 | Args: 94 | build_info: dict 95 | 96 | Returns: 97 | 98 | """ 99 | build_info['mempool'].free_all_blocks() 100 | 101 | def after_train(self, build_info): 102 | self.svd = None 103 | 104 | 105 | class RandomSamplingSketch(GradSketch): 106 | """RandomSampling Sketching. Gradient columns are randomly sampled with probabilities.""" 107 | 108 | def __init__(self, n=1, smooth=0.1, replace=True): 109 | """ 110 | 111 | Args: 112 | n: int, n outputs to select 113 | smooth: float, 0 stands for probabilities proportionally to the sum of squares, 1 stands for uniform. 
114 | (0, 1) stands for tradeoff 115 | """ 116 | self.n = n 117 | self.smooth = smooth 118 | self.replace = replace 119 | 120 | def __call__(self, grad, hess): 121 | best_idx = (grad ** 2).mean(axis=0) + 1e-3 122 | pi = best_idx / best_idx.sum() 123 | pi = self.smooth * cp.ones_like(pi) / grad.shape[1] + (1 - self.smooth) * pi 124 | 125 | gg = grad / cp.sqrt(self.n * pi) 126 | rand_idx = cp.random.choice(cp.arange(grad.shape[1]), size=self.n, replace=self.replace, p=pi) 127 | grad = gg[:, rand_idx] 128 | 129 | if hess.shape[1] > 1: 130 | hess = hess[:, rand_idx] 131 | 132 | return grad, hess 133 | 134 | 135 | class RandomProjectionSketch(GradSketch): 136 | """Random projection sketch""" 137 | 138 | def __init__(self, n=1, norm=True): 139 | """ 140 | 141 | Args: 142 | n: int, number of output dimensions 143 | norm: if True use normal distribution, otherwise +1/-1 144 | """ 145 | self.k = n 146 | self.norm = norm 147 | 148 | def __call__(self, grad, hess): 149 | 150 | if self.norm: 151 | P = cp.random.randn(grad.shape[1], self.k, dtype=cp.float32) 152 | else: 153 | P = (cp.random.rand(grad.shape[1], self.k, dtype=cp.float32) > .5).astype(cp.float32) * 2 - 1 154 | 155 | P /= cp.sqrt(1 / self.k) 156 | 157 | grad = cp.dot(grad, P) 158 | 159 | if hess.shape[1] > 1: 160 | hess = cp.dot(hess, P) 161 | hess = cp.clip(hess, 0.01, None) 162 | 163 | return grad, hess 164 | 165 | 166 | class FilterSketch(GradSketch): 167 | """Filter Gradient and Hessian outputs for the tree structure search using previously built trees""" 168 | 169 | def __init__(self, k=1, sample=True, smooth=0.1, ntrees=1): 170 | """ 171 | 172 | Args: 173 | k: int, number of outputs to keep 174 | sample: bool, if True random sampling is used, else keep top K 175 | smooth: float, smoothing parameter for sampling 176 | ntrees: int, number of previously built trees to evaluate weights 177 | """ 178 | self.k = k 179 | self.sample = sample 180 | self.smooth = smooth 181 | self.ntrees = ntrees 182 | 183 | self.queue = None 184 | self.max_trees = 0 185 | self.max_nodes = 0 186 | self.nrows = None 187 | self.lambda_l2 = None 188 | 189 | def before_train(self, build_info): 190 | """Extract metadata before train 191 | 192 | Args: 193 | build_info: dict 194 | 195 | Returns: 196 | 197 | """ 198 | # length of train 199 | self.nrows = build_info['data']['train']['features_gpu'].shape[0] 200 | # lambda_l2 of tree builder 201 | self.lambda_l2 = build_info['builder'].params['lambda_l2'] 202 | self.queue = [] 203 | self.max_nodes = 0 204 | self.max_trees = 0 205 | 206 | def before_iteration(self, build_info): 207 | """Method to extract leaf indices from the last tree 208 | 209 | Args: 210 | build_info: dict 211 | 212 | Returns: 213 | 214 | """ 215 | # num iter 216 | num_iter = build_info['num_iter'] 217 | 218 | # if first iter - assume single node 219 | # if smooth eq 1 - assume totally random choice 220 | if num_iter == 0 or self.smooth >= 1 or self.ntrees == 0: 221 | return 222 | 223 | if len(self.queue) >= self.ntrees: 224 | self.queue.pop(0) 225 | 226 | # leaf values 227 | last_leaves = build_info['data']['train']['last_tree']['leaves'] 228 | self.queue.append(last_leaves) 229 | self.max_nodes = max(int(last_leaves.max()), self.max_nodes) 230 | self.max_trees = max(last_leaves.shape[1], self.max_trees) 231 | 232 | def after_train(self, build_info): 233 | """Clean trees 234 | 235 | Args: 236 | build_info: dict 237 | 238 | Returns: 239 | 240 | """ 241 | self.queue = None 242 | 243 | def _calc_weights(self, grad, hess): 244 | 245 | if self.smooth >= 
1: 246 | return 0 247 | 248 | if len(self.queue) == 0: 249 | loss = (grad ** 2).sum(axis=0) 250 | loss /= hess.sum(axis=0) 251 | 252 | return loss 253 | 254 | grad_sum = cp.zeros((len(self.queue), self.max_trees, self.max_nodes, 255 | grad.shape[1],), dtype=cp.float32) 256 | hess_sum = cp.zeros((len(self.queue), self.max_trees, self.max_nodes, 257 | hess.shape[1],), dtype=cp.float32) 258 | 259 | for n, prev_iter in enumerate(self.queue): 260 | for i in range(prev_iter.shape[1]): 261 | grad_sum[n, i].scatter_add(prev_iter[:, i], grad) 262 | hess_sum[n, i].scatter_add(prev_iter[:, i], hess) 263 | 264 | loss = (grad_sum ** 2 / (hess_sum + self.lambda_l2)).reshape((-1, grad.shape[1])) 265 | 266 | return loss.sum(axis=0) 267 | 268 | def _select(self, pi): 269 | 270 | if self.sample: 271 | idx = cp.random.choice(cp.arange(pi.shape[0]), size=self.k, replace=True, p=pi) 272 | else: 273 | idx = pi.argsort()[-self.k:] 274 | 275 | return idx 276 | 277 | def __call__(self, grad, hess): 278 | 279 | pi = self._calc_weights(grad, hess) 280 | pi = pi / pi.sum() 281 | 282 | if self.smooth > 0: 283 | pi = self.smooth * cp.ones_like(pi) / grad.shape[1] + (1 - self.smooth) * pi 284 | 285 | idx = self._select(pi) 286 | 287 | grad = grad[:, idx] 288 | 289 | if hess.shape[1] > 1: 290 | hess = hess[:, idx] 291 | 292 | return grad, hess 293 | -------------------------------------------------------------------------------- /py_boost/multioutput/target_splitter.py: -------------------------------------------------------------------------------- 1 | """Strategies to splitting multiple outputs by different trees""" 2 | 3 | try: 4 | import cupy as cp 5 | except Exception: 6 | pass 7 | 8 | from ..callbacks.callback import Callback 9 | 10 | 11 | class SingleSplitter(Callback): 12 | """Basic splitter, means no split. Single tree will be built at each boosting step""" 13 | 14 | def __init__(self): 15 | self.ensemble_indexer = None 16 | self.indexer = None 17 | 18 | def before_iteration(self, build_info): 19 | """Initialize indexers 20 | 21 | Args: 22 | build_info: dict 23 | 24 | Returns: 25 | 26 | """ 27 | if build_info['num_iter'] == 0: 28 | nout = build_info['data']['train']['grad'].shape[1] 29 | self.indexer = cp.arange(nout, dtype=cp.uint64) 30 | 31 | def __call__(self): 32 | """Get list of indexers for each group 33 | 34 | Returns: 35 | list of cp.ndarrays of indexers 36 | """ 37 | return [self.indexer] 38 | 39 | def after_train(self, build_info): 40 | """Clean state not to keep the indexer in trained model 41 | 42 | Args: 43 | build_info: 44 | 45 | Returns: 46 | 47 | """ 48 | self.__init__() 49 | 50 | 51 | class RandomGroupsSplitter(SingleSplitter): 52 | """Random Groups Splitter, means all outputs will be randomly grouped at each iteration. 53 | Single tree will be created for each group. 
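    For example, with 10 outputs and ngroups=2, each boosting iteration shuffles the output
    indexer and builds two trees, each fitted on a random group of roughly five outputs
    (see __call__ below, which simply applies cp.array_split to the shuffled indexer).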
54 | """ 55 | 56 | def __init__(self, ngroups=2): 57 | """ 58 | 59 | Args: 60 | ngroups: int, maximum number of groups to split outputs 61 | """ 62 | super().__init__() 63 | self.ngroups = ngroups 64 | self._ngroups = None 65 | 66 | def before_iteration(self, build_info): 67 | """Update groups count with the actual target shape if needed 68 | 69 | Args: 70 | build_info: dict 71 | 72 | Returns: 73 | 74 | """ 75 | super().before_iteration(build_info) 76 | if build_info['num_iter'] == 0: 77 | self._ngroups = min(self.ngroups, build_info['data']['train']['grad'].shape[1]) 78 | 79 | def __call__(self): 80 | """ 81 | 82 | Returns: 83 | list of cp.ndarrays of indexers 84 | """ 85 | cp.random.shuffle(self.indexer) 86 | return cp.array_split(self.indexer, self._ngroups) 87 | 88 | 89 | class OneVsAllSplitter(SingleSplitter): 90 | """One-Vs-All splitter, means build separate tree for each output""" 91 | 92 | def __call__(self): 93 | """ 94 | 95 | Returns: 96 | list of cp.ndarrays of indexers 97 | """ 98 | return cp.array_split(self.indexer, self.indexer.shape[0]) 99 | -------------------------------------------------------------------------------- /py_boost/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | """Tools for quantization""" 2 | -------------------------------------------------------------------------------- /py_boost/quantization/base.py: -------------------------------------------------------------------------------- 1 | """Basic quantizer implementations""" 2 | 3 | import numpy as np 4 | from .utils import apply_borders, quantize_features, numba_quantile_1d, numba_uniform_1d, numba_uniquant_1d 5 | 6 | 7 | class Quantizer: 8 | """ 9 | General class for all quantizers 10 | """ 11 | 12 | def __init__(self, sample=None, max_bin=256, min_data_in_bin=3, random_state=42): 13 | """ 14 | 15 | Args: 16 | sample: None or int, subsample size for quantizers 17 | max_bin: int, max bins 18 | min_data_in_bin: int, min bin size 19 | random_state: int 20 | """ 21 | self.sample = sample 22 | # actual nbins eq max_bin - 1, zero bin is always reserved for NaNs 23 | self.max_bin = max_bin 24 | self.min_data_in_bin = min_data_in_bin 25 | self.random_state = random_state 26 | 27 | self.borders = None 28 | 29 | def _sample(self, X): 30 | """Sample train set 31 | 32 | Args: 33 | X: np.ndarray 34 | 35 | Returns: 36 | 37 | """ 38 | if self.sample is not None and self.sample < X.shape[0]: 39 | np.random.seed(self.random_state) 40 | 41 | idx = np.arange(X.shape[0]) 42 | np.random.shuffle(idx) 43 | idx = idx[:self.sample] 44 | return X[idx] 45 | 46 | return X 47 | 48 | def transform(self, X): 49 | """Apply borders is similar for all quantizers 50 | 51 | Args: 52 | X: np.ndarray 53 | 54 | Returns: 55 | 56 | """ 57 | return apply_borders(X, self.borders) 58 | 59 | def fit(self, X): 60 | """Fit quantizer 61 | 62 | Args: 63 | X: np.ndarray 64 | 65 | Returns: 66 | 67 | """ 68 | return self 69 | 70 | def fit_transform(self, X): 71 | """Fit quantizer and transform 72 | 73 | Args: 74 | X: 75 | 76 | Returns: 77 | 78 | """ 79 | self.fit(X) 80 | 81 | return self.transform(X) 82 | 83 | def get_borders(self): 84 | """Get fitted borders 85 | 86 | Returns: 87 | 88 | """ 89 | assert self.borders is not None, 'Should be fitted first' 90 | 91 | return self.borders 92 | 93 | def get_max_bin(self): 94 | """Get actual max bins 95 | 96 | Returns: 97 | 98 | """ 99 | return max(map(len, self.get_borders())) 100 | 101 | 102 | class QuantileQuantizer(Quantizer): 103 | """ 104 | 
Quantization by quantiles 105 | """ 106 | 107 | def fit(self, X): 108 | self.borders = quantize_features( 109 | 110 | numba_quantile_1d, 111 | self._sample(X), 112 | max_bins=self.max_bin - 1, 113 | min_data_in_bin=self.min_data_in_bin 114 | 115 | ) 116 | 117 | return self 118 | 119 | 120 | class UniformQuantizer(Quantizer): 121 | """ 122 | Uniform quantization 123 | """ 124 | 125 | def fit(self, X): 126 | self.borders = quantize_features( 127 | 128 | numba_uniform_1d, 129 | self._sample(X), 130 | max_bins=self.max_bin - 1, 131 | min_data_in_bin=self.min_data_in_bin 132 | 133 | ) 134 | 135 | return self 136 | 137 | 138 | class UniquantQuantizer(Quantizer): 139 | """ 140 | Mix of uniform and quantile bins 141 | """ 142 | 143 | def fit(self, X): 144 | self.borders = quantize_features( 145 | 146 | numba_uniquant_1d, 147 | self._sample(X), 148 | max_bins=self.max_bin - 1, 149 | min_data_in_bin=self.min_data_in_bin 150 | 151 | ) 152 | 153 | return self 154 | -------------------------------------------------------------------------------- /py_boost/quantization/utils.py: -------------------------------------------------------------------------------- 1 | """Quantization utilities""" 2 | 3 | import numba 4 | import numpy as np 5 | from numba import float32, float64, uint8, prange, njit, int64 6 | 7 | numba.config.THREADING_LAYER = 'threadsafe' 8 | 9 | 10 | def _apply_borders_1d(x_raw, x_enc, borders): 11 | # encode raw values 12 | sl = np.nonzero(~np.isnan(x_raw))[0] 13 | x_enc[sl] = np.searchsorted(borders, x_raw[sl]) 14 | 15 | return 16 | 17 | 18 | sign = [(float64[:], uint8[:], float64[:]), 19 | (float32[:], uint8[:], float32[:]), 20 | ] 21 | 22 | numba_apply_borders_1d = njit(sign, parallel=False)(_apply_borders_1d) 23 | 24 | 25 | def _apply_borders(X, X_enc, borders): 26 | for i in prange(X.shape[1]): 27 | i = int64(i) # to prevent unsafe cast numba warning 28 | numba_apply_borders_1d(X[:, i], X_enc[:, i], borders[i]) 29 | 30 | return 31 | 32 | 33 | numba_apply_borders = njit(parallel=True)(_apply_borders) 34 | 35 | 36 | def apply_borders(X, borders): 37 | X_enc = np.zeros_like(X, dtype=np.uint8, order='C') 38 | numba_apply_borders(X, X_enc, numba.typed.List(borders)) 39 | 40 | return X_enc 41 | 42 | 43 | def _preprocess_1d(x_sample): 44 | x_sample = x_sample[~np.isnan(x_sample)].copy() 45 | neg_inf_clip_value = np.finfo(x_sample).min 46 | x_sample[x_sample < neg_inf_clip_value] = neg_inf_clip_value 47 | x_sample = np.sort(x_sample) 48 | 49 | return x_sample 50 | 51 | 52 | sign = [(float64[:],), (float32[:],), ] 53 | numba_preprocess_1d = njit(sign, parallel=False)(_preprocess_1d) 54 | 55 | 56 | def _quantile_1d(x_sample, max_bins, min_data_in_bin): 57 | x_sample = numba_preprocess_1d(x_sample) 58 | bins = np.unique(x_sample)[:-1] 59 | 60 | if len(bins) > (max_bins - 1): 61 | # get quantiles 62 | grid = (np.linspace(0, 1, max_bins + 1) * x_sample.shape[0])[1:-1].astype(np.int64) 63 | bins = x_sample[grid] 64 | bins = np.unique(bins) 65 | 66 | return bins 67 | 68 | 69 | q1d_sign = [(float64[:], int64, int64), 70 | (float32[:], int64, int64), 71 | ] 72 | 73 | numba_quantile_1d = njit(q1d_sign, parallel=False)(_quantile_1d) 74 | 75 | 76 | def _quantize_features(fn, X, max_bins, min_data_in_bin, borders): 77 | """ 78 | Args: 79 | X: 80 | 81 | Returns: 82 | """ 83 | for i in prange(X.shape[1]): 84 | bins = fn(X[:, i], max_bins, min_data_in_bin) 85 | borders[i, 1: len(bins) + 1] = bins 86 | 87 | return borders 88 | 89 | 90 | numba_quantize_features = njit(parallel=True)(_quantize_features) 91 | 92 
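# Illustrative example (not executed here): fit quantile borders on a small float32
# matrix and encode it. Bin 0 is reserved for NaNs, so finite values are encoded
# starting from 1. Only names defined in this module plus NumPy are assumed.
#
#     import numpy as np
#     X = np.random.rand(1000, 3).astype(np.float32)
#     borders = quantize_features(numba_quantile_1d, X, max_bins=255, min_data_in_bin=3)
#     X_enc = apply_borders(X, borders)  # uint8 matrix of bin indices, same shape as X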
| 93 | def quantize_features(fn, X, max_bins=255, min_data_in_bin=3): 94 | """ 95 | Perform feature quantization 96 | Args: 97 | fn: JIT compiled function for 1d quantization 98 | X: np.ndarray, raw features 99 | max_bins: int, maximum number of bins, <= 255 100 | min_data_in_bin: int, sample size for bins construction 101 | Returns: 102 | """ 103 | assert 0 < max_bins <= 255, 'Max bins should be between 0 and 255' 104 | assert min_data_in_bin > 0, 'Min data in bin should be > 0' 105 | 106 | borders_ = np.empty((X.shape[1], max_bins + 1), dtype=X.dtype) 107 | borders_[:] = np.nan 108 | borders_[:, 0] = -np.inf 109 | 110 | numba_quantize_features(fn, X, max_bins, min_data_in_bin, borders_) 111 | borders = [] 112 | 113 | for i in range(X.shape[1]): 114 | j = 0 115 | for j in range(max_bins + 1): 116 | val = borders_[i, j] 117 | if np.isnan(val): 118 | break 119 | borders_[i, j] = np.inf 120 | borders.append(borders_[i, :j + 1]) 121 | 122 | return borders 123 | 124 | 125 | def _uniform_1d(x_sample, max_bins, min_data_in_bin): 126 | x_sample = numba_preprocess_1d(x_sample) 127 | bins = np.unique(x_sample)[:-1] 128 | 129 | if len(bins) > (max_bins - 1): 130 | # get uniform 131 | bins = np.linspace(x_sample[0], x_sample[-1], max_bins + 1)[1:-1].astype(x_sample.dtype) 132 | 133 | return bins 134 | 135 | 136 | numba_uniform_1d = njit(q1d_sign, parallel=False)(_uniform_1d) 137 | 138 | 139 | def _uniquant_1d(x_sample, max_bins, min_data_in_bin): 140 | x_sample = numba_preprocess_1d(x_sample) 141 | bins = np.unique(x_sample)[:-1] 142 | 143 | if len(bins) > (max_bins - 1): 144 | # get uniform 145 | max_bins_u = max_bins // 2 146 | bins_u = np.linspace(x_sample[0], x_sample[-1], max_bins_u + 1)[1:-1].astype(x_sample.dtype) 147 | # get quantile 148 | max_bins_q = max_bins - max_bins_u 149 | grid = (np.linspace(0, 1, max_bins_q + 1) * x_sample.shape[0])[1:-1].astype(np.int64) 150 | bins_q = x_sample[grid] 151 | # merge 152 | bins = np.unique(np.concatenate((bins_u, bins_q))) 153 | 154 | return bins 155 | 156 | 157 | numba_uniquant_1d = njit(q1d_sign, parallel=False)(_uniquant_1d) 158 | -------------------------------------------------------------------------------- /py_boost/sampling/__init__.py: -------------------------------------------------------------------------------- 1 | """Provides strategies to row/columns sampling""" 2 | -------------------------------------------------------------------------------- /py_boost/sampling/bagging.py: -------------------------------------------------------------------------------- 1 | """Basic sampling strategy""" 2 | 3 | import numpy as np 4 | try: 5 | import cupy as cp 6 | except Exception: 7 | pass 8 | from ..callbacks.callback import Callback 9 | 10 | 11 | class BaseSampler(Callback): 12 | """Random uniform rows/columns sampler""" 13 | 14 | def __init__(self, sample=1, axis=0): 15 | """ 16 | 17 | Args: 18 | sample: subsample to select at each iteration 19 | axis: int, 0 for rows, 1 for columns 20 | """ 21 | self.sample = sample 22 | self.axis = axis 23 | self.length = None 24 | self.valid_sl = None 25 | self.indexer = None 26 | 27 | def before_train(self, build_info): 28 | """Create indexers 29 | 30 | Args: 31 | build_info: dict 32 | 33 | Returns: 34 | 35 | """ 36 | self.length = build_info['data']['train']['features_gpu'].shape[self.axis] 37 | self.indexer = cp.arange(self.length, dtype=cp.uint64) 38 | if self.sample < 1: 39 | self.valid_sl = cp.zeros(self.length, dtype=cp.bool_) 40 | self.valid_sl[:max(1, int(self.length * self.sample))] = True 41 | 42 | 
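    # Illustrative usage note: samplers are not called directly by the user. Assuming
    # the GradientBoosting API shown in the tutorials, a column sampler instance (or a
    # plain float rate) can be passed via the `colsample` argument, e.g.:
    #
    #     from py_boost import GradientBoosting
    #     model = GradientBoosting('mse', colsample=BaseSampler(sample=0.8, axis=1))
    #
    # The tree builder then calls the sampler (see __call__ below) at each iteration
    # to obtain the column indexer for the new tree.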
def before_iteration(self, build_info): 43 | """Shuffle indexers 44 | 45 | Args: 46 | build_info: dict 47 | 48 | Returns: 49 | 50 | """ 51 | if self.sample < 1: 52 | cp.random.shuffle(self.valid_sl) 53 | 54 | def __call__(self): 55 | """Get the last actual indexer 56 | 57 | Returns: 58 | 59 | """ 60 | if self.sample == 1: 61 | return self.indexer 62 | 63 | return self.indexer[self.valid_sl] 64 | 65 | def after_train(self, build_info): 66 | """Clean the state 67 | 68 | Args: 69 | build_info: 70 | 71 | Returns: 72 | 73 | """ 74 | self.__init__(sample=self.sample, axis=self.axis) 75 | 76 | 77 | class MVSSampler(Callback): 78 | """ 79 | MVS rows sampler proposed in 80 | https://proceedings.neurips.cc/paper/2019/file/5c8cb735a1ce65dac514233cbd5576d6-Paper.pdf 81 | """ 82 | 83 | def __init__(self, sample=0.1, lmbd='auto', grid_search_steps=100, grid_multiplier=100): 84 | """ 85 | 86 | Args: 87 | sample: float, subsample 88 | lmbd: float or 'auto', lambda hyperparameter 89 | grid_search_steps: float, cut off search steps 90 | grid_multiplier: float, cut off search multiplier 91 | """ 92 | self.sample = sample 93 | self.lmbd = lmbd 94 | self.grid_search_steps = grid_search_steps 95 | self.grid_multiplier = grid_multiplier 96 | self.indexer = None 97 | 98 | def get_probs(self, reg_grad): 99 | 100 | min_ = reg_grad.min() 101 | 102 | grid = cp.linspace(min_, min_ * self.grid_multiplier, self.grid_search_steps, dtype=cp.float32)[cp.newaxis, :] 103 | 104 | probs = cp.clip(reg_grad[:, cp.newaxis] / grid, 0, 1) 105 | sample_rates = probs.mean(axis=0) 106 | best_idx = cp.abs(sample_rates - self.sample).argmin() 107 | 108 | return probs[:, best_idx] 109 | 110 | def before_train(self, build_info): 111 | 112 | return 113 | 114 | def before_iteration(self, build_info): 115 | 116 | train = build_info['data']['train'] 117 | grad, hess = train['grad'], train['hess'] 118 | 119 | if self.lmbd == 'auto': 120 | lmbd = ((grad.sum() / hess.sum()) ** 2).sum() 121 | else: 122 | lmbd = self.lmbd 123 | 124 | mult = grad.shape[1] / hess.shape[1] 125 | 126 | reg_grad = cp.sqrt((grad ** 2).sum(axis=1) + lmbd * (hess ** 2).sum(axis=1) * mult) 127 | 128 | probs = self.get_probs(reg_grad) 129 | 130 | build_info['data']['train']['grad'] = grad / probs[:, cp.newaxis] 131 | sl = probs >= cp.random.rand(grad.shape[0], dtype=cp.float32) 132 | self.indexer = cp.arange(grad.shape[0], dtype=cp.uint64)[sl] 133 | 134 | def __call__(self, *args, **kwargs): 135 | 136 | return self.indexer 137 | 138 | def after_train(self, build_info): 139 | 140 | self.indexer = None 141 | 142 | 143 | class ColumnImportanceSampler(Callback): 144 | """ 145 | This class implements a sampling strategy, 146 | that sample columns in proportion to thier importance at each step 147 | """ 148 | 149 | def __init__(self, rate=0.5, smooth=0.1, 150 | update_freq=10, inverse=False, n_force=None, imp_type='split'): 151 | """ 152 | 153 | Args: 154 | rate: float, sampling rate 155 | smooth: float, smoothing parameter 156 | update_freq: int importance update frequency 157 | inverse: inverse the probability of sampling 158 | n_force: int or None, number of feats to ignore by sample (always select), counts from the end of data 159 | imp_type: str, importance type 160 | 161 | Returns: 162 | 163 | """ 164 | self.rate = rate 165 | self.smooth = smooth 166 | self.update_freq = update_freq 167 | self.inverse = inverse 168 | self.n_force = n_force 169 | self.imp_type = imp_type 170 | self.p = None 171 | self.imp = None 172 | 173 | def update_importance(self, model): 174 | 175 | if 
self.imp is None: 176 | self.imp = model.get_feature_importance(self.imp_type) 177 | return self.imp 178 | 179 | for tree in model.models[-self.update_freq:]: 180 | if self.imp_type == 'split': 181 | self.imp += tree.feature_importance_split 182 | else: 183 | self.imp += tree.feature_importance_gain 184 | 185 | return self.imp 186 | 187 | def before_iteration(self, build_info): 188 | """ 189 | Define what should be doe before each iteration 190 | """ 191 | # Update feature importance 192 | num_iter = build_info['num_iter'] 193 | 194 | if (num_iter % self.update_freq) == 0: 195 | # update probabilities with actual importance 196 | p = self.update_importance(build_info['model']) + 1e-3 197 | 198 | if self.n_force is not None: 199 | p = p[:-self.n_force] 200 | 201 | p = cp.asarray(p) / (p.sum()) 202 | # inverse if needed 203 | if self.inverse: 204 | p = 1 - p 205 | p = p / p.sum() 206 | # apply smoothing 207 | self.p = p * (1 - self.smooth) + cp.ones_like(p) * self.smooth / p.shape[0] 208 | 209 | def __call__(self): 210 | """ 211 | Method should return the array of indices, that will be used 212 | to grow the tree at the current step 213 | """ 214 | # Sample rows 215 | n = self.p.shape[0] 216 | index = cp.random.choice(cp.arange(n, dtype=cp.uint64), 217 | size=int(self.rate * n), p=self.p) 218 | 219 | if self.n_force is not None: 220 | index = cp.concatenate([index, cp.arange(n, n + self.n_force, dtype=cp.uint64)]) 221 | 222 | return index 223 | 224 | def after_train(self, build_info): 225 | 226 | self.p = None 227 | self.imp = None 228 | -------------------------------------------------------------------------------- /py_boost/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """Provides auxiliary utilities""" 2 | -------------------------------------------------------------------------------- /py_boost/utils/logging.py: -------------------------------------------------------------------------------- 1 | """Utils for logging.""" 2 | 3 | import io 4 | import os 5 | import sys 6 | 7 | import logging 8 | from .. import _logger 9 | 10 | formatter_debug = logging.Formatter("%(asctime)s\t[%(levelname)s]\t%(pathname)s.%(funcName)s:%(lineno)d\t%(message)s") 11 | formatter_default = logging.Formatter("[%(asctime)s] %(message)s", "%H:%M:%S") 12 | 13 | INFO2 = 17 14 | INFO3 = 13 15 | 16 | 17 | def add_logging_level(levelName, levelNum, methodName=None): 18 | """ 19 | Comprehensively adds a new logging level to the `logging` module and the 20 | currently configured logging class. 21 | `levelName` becomes an attribute of the `logging` module with the value 22 | `levelNum`. `methodName` becomes a convenience method for both `logging` 23 | itself and the class returned by `logging.getLoggerClass()` (usually just 24 | `logging.Logger`). If `methodName` is not specified, `levelName.lower()` is 25 | used. 
26 | To avoid accidental clobberings of existing attributes, this method will 27 | raise an `AttributeError` if the level name is already an attribute of the 28 | `logging` module or if the method name is already present 29 | Example 30 | ------- 31 | >>> addLoggingLevel('TRACE', logging.DEBUG - 5) 32 | >>> logging.getLogger(__name__).setLevel("TRACE") 33 | >>> logging.getLogger(__name__).trace('that worked') 34 | >>> logging.trace('so did this') 35 | >>> logging.TRACE 36 | 5 37 | """ 38 | assert (levelNum > 0) and (levelNum < 50) 39 | if not methodName: 40 | methodName = levelName.lower() 41 | 42 | if hasattr(logging, levelName): 43 | if levelNum == logging.__dict__[levelName]: 44 | print("Level \"{}: {}\" already defined, skipping...".format(levelName, levelNum)) 45 | return 46 | else: 47 | raise AttributeError("{} already defined in logging module".format(levelName)) 48 | if hasattr(logging, methodName): 49 | raise AttributeError("{} already defined in logging module".format(methodName)) 50 | if hasattr(logging.getLoggerClass(), methodName): 51 | raise AttributeError("{} already defined in logger class".format(methodName)) 52 | 53 | def logForLevel(self, message, *args, **kwargs): 54 | if self.isEnabledFor(levelNum): 55 | self._log(levelNum, message, args, **kwargs) 56 | 57 | def logToRoot(message, *args, **kwargs): 58 | logging.log(levelNum, message, *args, **kwargs) 59 | 60 | logging.addLevelName(levelNum, levelName) 61 | setattr(logging, levelName, levelNum) 62 | setattr(logging.getLoggerClass(), methodName, logForLevel) 63 | setattr(logging, methodName, logToRoot) 64 | 65 | 66 | add_logging_level("INFO2", INFO2) 67 | add_logging_level("INFO3", INFO3) 68 | 69 | 70 | class LoggerStream(io.IOBase): 71 | def __init__(self, logger, verbose_eval=100) -> None: 72 | super().__init__() 73 | self.logger = logger 74 | self.verbose_eval = verbose_eval 75 | self.counter = 1 76 | 77 | def write(self, message): 78 | if message == "\n": 79 | return 80 | iter_num = message.split("\t")[0] 81 | if (iter_num == "[1]") or (iter_num == "0:") or ((iter_num[-1] != "]") and (iter_num[-1] != ":")): 82 | self.logger.info3(message.rstrip()) 83 | return 84 | 85 | if self.counter < self.verbose_eval - 1: 86 | self.logger.debug(message.rstrip()) 87 | self.counter += 1 88 | else: 89 | self.logger.info3(message.rstrip()) 90 | self.counter = 0 91 | 92 | 93 | def verbosity_to_loglevel(verbosity: int, extended=True): 94 | if extended: 95 | if verbosity <= 0: 96 | log_level = logging.ERROR 97 | elif verbosity == 1: 98 | log_level = logging.INFO 99 | elif verbosity == 2: 100 | log_level = logging.INFO2 101 | elif verbosity == 3: 102 | log_level = logging.INFO3 103 | else: 104 | log_level = logging.DEBUG 105 | else: 106 | if verbosity <= 0: 107 | log_level = logging.ERROR 108 | elif verbosity == 1: 109 | log_level = logging.INFO 110 | else: 111 | log_level = logging.DEBUG 112 | 113 | return log_level 114 | 115 | 116 | def get_stdout_level(): 117 | for handler in _logger.handlers: 118 | if type(handler) == logging.StreamHandler: 119 | return handler.level 120 | return _logger.getEffectiveLevel() 121 | 122 | 123 | def set_stdout_level(level): 124 | _logger.setLevel(logging.DEBUG) 125 | 126 | has_console_handler = False 127 | 128 | for handler in _logger.handlers: 129 | if type(handler) == logging.StreamHandler: 130 | if handler.level == level: 131 | has_console_handler = True 132 | else: 133 | _logger.handlers.remove(handler) 134 | 135 | if not has_console_handler: 136 | handler = logging.StreamHandler(sys.stdout) 137 | 
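        # no console handler at the requested level exists yet: create one writing to
        # stdout, give it the default "[HH:MM:SS] message" formatter and register it
        # on the package logger below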
handler.setFormatter(formatter_default) 138 | handler.setLevel(level) 139 | 140 | _logger.addHandler(handler) 141 | 142 | 143 | def add_filehandler(filename: str, level=logging.DEBUG): 144 | if filename: 145 | has_file_handler = False 146 | 147 | for handler in _logger.handlers: 148 | if type(handler) == logging.FileHandler: 149 | if handler.baseFilename == filename or handler.baseFilename == os.path.join(os.getcwd(), filename): 150 | has_file_handler = True 151 | else: 152 | _logger.handlers.remove(handler) 153 | 154 | if not has_file_handler: 155 | file_handler = logging.FileHandler(filename, mode="w") 156 | 157 | if level == logging.DEBUG: 158 | file_handler.setFormatter(formatter_debug) 159 | else: 160 | file_handler.setFormatter(formatter_default) 161 | 162 | file_handler.setLevel(level) 163 | 164 | # if handler_filter: 165 | # file_handler.addFilter(handler_filter) 166 | 167 | _logger.addHandler(file_handler) 168 | else: 169 | for handler in _logger.handlers: 170 | if type(handler) == logging.FileHandler: 171 | _logger.handlers.remove(handler) 172 | 173 | 174 | class DuplicateFilter(object): 175 | def __init__(self): 176 | self.msgs = set() 177 | 178 | def filter(self, record): 179 | rv = record.msg not in self.msgs 180 | self.msgs.add(record.msg) 181 | return rv 182 | -------------------------------------------------------------------------------- /py_boost/utils/onnx_wrapper.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import numpy as np 4 | import onnx 5 | import onnxruntime 6 | import tqdm 7 | from onnx import helper, TensorProto 8 | from onnx.checker import check_model 9 | 10 | 11 | def pb_to_onnx(model, output, fltr=None, post_transform='NONE'): 12 | """Parse the model to ONNX format 13 | 14 | Args: 15 | model: Py-Boost Ensemble 16 | output: str, file path 17 | fltr: str or None, if subset of outputs need to be stored 18 | post_transform: str, one of 'NONE', 'SOFTMAX', 'LOGISTIC', 'SOFTMAX_ZERO', or 'PROBIT' 19 | 20 | Returns: 21 | 22 | """ 23 | model.to_cpu() 24 | 25 | nout = model.base_score.shape[0] 26 | if fltr is None: 27 | fltr = np.arange(nout) 28 | else: 29 | fltr_ = np.asarray(fltr) 30 | fltr = np.sort(fltr_) 31 | if (fltr != fltr_).any(): 32 | warnings.warn( 33 | 'Selected outputs order changed. 
Predictions will keep the original model order (fltr array is sorted)' 34 | ) 35 | 36 | nout = len(fltr) 37 | 38 | parsed_ensemble = { 39 | 40 | # const for ensemble 41 | "base_values": model.base_score[fltr].tolist(), 42 | "n_targets": nout, 43 | "aggregate_function": "SUM", 44 | "post_transform": post_transform 45 | 46 | } 47 | 48 | nodes_attr = [ 49 | "nodes_treeids", "nodes_nodeids", "nodes_modes", "nodes_falsenodeids", 50 | "nodes_truenodeids", "nodes_featureids", "nodes_values", "nodes_missing_value_tracks_true" 51 | ] 52 | 53 | leaves_attr = [ 54 | "target_ids", "target_nodeids", "target_treeids", "target_weights" 55 | ] 56 | 57 | for key in nodes_attr + leaves_attr: 58 | parsed_ensemble[key] = [] 59 | 60 | k = 0 61 | for tree in tqdm.tqdm(model.models): 62 | 63 | g = 0 64 | for offset in tree.test_format_offsets: 65 | 66 | offset = offset * 4 67 | outputs = np.setdiff1d(fltr, np.nonzero(tree.group_index != g)[0]) 68 | # old id and new id 69 | nodes, n = [(0, 0)], 0 70 | 71 | while len(nodes) > 0: 72 | 73 | # placeholder for new nodes 74 | new_nodes = [] 75 | 76 | # first - adding the node 77 | for old, new in nodes: 78 | 79 | parsed_ensemble["nodes_treeids"].append(k) 80 | parsed_ensemble["nodes_nodeids"].append(new) 81 | 82 | if old >= 0: 83 | # case - split node 84 | i = old * 4 85 | f, s, l, r = tree.test_format[offset + i: offset + i + 4] 86 | f, l, r = int(f), int(l), int(r) 87 | 88 | parsed_ensemble["nodes_modes"].append("BRANCH_LEQ") 89 | # check NaN condition 90 | nan_left = f < 0 91 | f = abs(f) - 1 92 | 93 | parsed_ensemble["nodes_truenodeids"].append(n + 1) 94 | parsed_ensemble["nodes_falsenodeids"].append(n + 2) 95 | parsed_ensemble["nodes_missing_value_tracks_true"].append(nan_left) 96 | parsed_ensemble["nodes_featureids"].append(f) 97 | parsed_ensemble["nodes_values"].append(float(s)) 98 | new_nodes.extend([(l, n + 1), (r, n + 2)]) 99 | n = n + 2 100 | 101 | else: 102 | # case leaf node 103 | leaf = abs(old) - 1 104 | parsed_ensemble["nodes_modes"].append("LEAF") 105 | # add dummy children info 106 | parsed_ensemble["nodes_truenodeids"].append(-1) 107 | parsed_ensemble["nodes_falsenodeids"].append(-1) 108 | parsed_ensemble["nodes_missing_value_tracks_true"].append(False) 109 | parsed_ensemble["nodes_featureids"].append(-1) 110 | parsed_ensemble["nodes_values"].append(0.0) 111 | # add leaf info 112 | for j, o in zip(outputs, np.searchsorted(fltr, outputs)): 113 | parsed_ensemble["target_ids"].append(o) 114 | parsed_ensemble["target_nodeids"].append(new) 115 | parsed_ensemble["target_treeids"].append(k) 116 | parsed_ensemble["target_weights"].append(float(tree.values[leaf, j])) 117 | 118 | nodes = new_nodes 119 | 120 | k += 1 121 | g += 1 122 | 123 | # create a model 124 | node_proto = helper.make_node( 125 | op_type="TreeEnsembleRegressor", 126 | inputs=["X"], outputs=["Y"], 127 | domain='ai.onnx.ml', 128 | ) 129 | node_proto.attribute.extend([helper.make_attribute(x, parsed_ensemble[x]) for x in parsed_ensemble]) 130 | 131 | X_ft = helper.make_tensor_value_info('X', TensorProto.FLOAT, [None, None]) 132 | Y_out = helper.make_tensor_value_info('Y', TensorProto.FLOAT, [None, nout]) 133 | 134 | graph_def = helper.make_graph( 135 | [node_proto], # nodes 136 | 'py-boost-ensemble', # name 137 | [X_ft], # inputs 138 | [Y_out] # outputs 139 | ) 140 | 141 | model_def = helper.make_model( 142 | graph_def, producer_name="Py-Boost", 143 | opset_imports=[ 144 | onnx.helper.make_opsetid('ai.onnx.ml', 3), 145 | onnx.helper.make_opsetid('', 16), 146 | ] 147 | ) 148 | 149 | 
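    # validate the assembled graph before writing it to disk; check_model raises if the
    # TreeEnsembleRegressor attributes are inconsistent. The saved file is a regular ONNX
    # model, so it could later be loaded without py_boost, e.g. (illustrative):
    #
    #     sess = onnxruntime.InferenceSession(output, providers=["CPUExecutionProvider"])
    #     preds = sess.run(['Y'], {'X': X.astype(np.float32)})[0]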
check_model(model_def) 150 | 151 | with open(output, "wb") as f: 152 | f.write(model_def.SerializeToString()) 153 | 154 | return 155 | 156 | 157 | class ONNXPredictor: 158 | """ 159 | ONNX parser and CPU predictor. Could be used for inference of Py-Boost model on CPU 160 | """ 161 | 162 | def __init__(self, model, filepath, postprocess_fn=None, fltr=None, post_transform='NONE'): 163 | """ 164 | 165 | Args: 166 | model: Py-Boost model 167 | filepath: str, filepath to save 168 | postprocess_fn: Callable or None, python postprocess_fn. If passed, model postprocessing will be ignored 169 | and replaced 170 | fltr: Sequence, indices to use for inference if needed to filter 171 | post_transform: str, one of 'NONE', ‘SOFTMAX,’ ‘LOGISTIC,’ ‘SOFTMAX_ZERO,’ or ‘PROBIT’. 172 | Built-in ONNX post_transform function. If passed, both model postprocessing and python postprocess_fn 173 | will be ignored 174 | """ 175 | if model is not None: 176 | pb_to_onnx(model, output=filepath, fltr=fltr, post_transform=post_transform) 177 | 178 | self.filepath = filepath 179 | 180 | # store post transform fn 181 | if post_transform != 'NONE': 182 | self.postprocess_fn = None 183 | else: 184 | self.postprocess_fn = postprocess_fn 185 | if postprocess_fn is None and model is not None: 186 | self.postprocess_fn = model.postprocess_fn 187 | 188 | self.sess = None 189 | self._start_session() 190 | 191 | @classmethod 192 | def from_onnx(cls, filepath, postprocess_fn=None): 193 | """Create ONNX predictor from parsed model 194 | 195 | Args: 196 | filepath: str, file path 197 | postprocess_fn: Callable or None 198 | 199 | Returns: 200 | 201 | """ 202 | return cls(None, filepath, postprocess_fn=postprocess_fn, fltr=None, post_transform='NONE') 203 | 204 | def _start_session(self): 205 | """Start inference session 206 | 207 | Returns: 208 | 209 | """ 210 | self.sess = onnxruntime.InferenceSession( 211 | self.filepath, 212 | providers=["CPUExecutionProvider"] 213 | ) 214 | 215 | return 216 | 217 | def predict(self, X): 218 | """Predict with ONNX runtime 219 | 220 | Args: 221 | X: np.ndarray, feature matrix 222 | 223 | Returns: 224 | np.ndarray 225 | """ 226 | 227 | X = X.astype(np.float32, copy=False) 228 | preds = self.sess.run(['Y'], {'X': X})[0] 229 | 230 | if self.postprocess_fn is not None: 231 | preds = self.postprocess_fn(preds) 232 | 233 | return preds 234 | -------------------------------------------------------------------------------- /py_boost/utils/tl_wrapper.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import joblib 4 | from tqdm import tqdm 5 | import treelite 6 | import treelite_runtime as tl_run 7 | 8 | 9 | def _create_node_deprecated(tree, node_id): 10 | """(DEPRECATED) Create a node of treelite tree 11 | Args: 12 | tree: Py-Boost Tree, tree to parse 13 | node_id: int, node index 14 | Returns: 15 | dict, args of treelite.ModelBuilder.Tree .set_numerical_test_node or .set_leaf_node 16 | """ 17 | feature_id = tree.feats[0][node_id] 18 | 19 | if feature_id >= 0: 20 | left, right = tuple(tree.split[0][node_id]) 21 | node = { 22 | 23 | 'feature_id': feature_id, 24 | 'opname': '<=', 25 | 'threshold': tree.val_splits[0][node_id], 26 | 'default_left': tree.nans[0][node_id], 27 | 'left_child_key': left, 28 | 'right_child_key': right, 29 | } 30 | 31 | return node, left, right 32 | 33 | return {'value': tree.values[tree.leaves[node_id][0]]}, None, None 34 | 35 | 36 | def create_node(tree, node_id, id_gen): 37 | """Create a node of treelite tree 38 | 39 | Args: 
40 | tree: Py-Boost Tree, tree to parse 41 | node_id: int, node index in original format tree 42 | id_gen: generator, new id generator 43 | 44 | Returns: 45 | dict, args of treelite.ModelBuilder.Tree .set_numerical_test_node 46 | """ 47 | 48 | assert node_id >= 0 49 | 50 | feature_id = int(tree.test_format[node_id * 4]) 51 | nan_left = feature_id < 0 52 | feature_id = abs(feature_id) - 1 53 | 54 | left = int(tree.test_format[node_id * 4 + 2]) 55 | right = int(tree.test_format[node_id * 4 + 3]) 56 | new_id_left = next(id_gen) 57 | new_id_right = next(id_gen) 58 | node = { 59 | 'feature_id': feature_id, 60 | 'opname': '<=', 61 | 'threshold': tree.test_format[node_id * 4 + 1], 62 | 'default_left': nan_left, 63 | 'left_child_key': new_id_left, 64 | 'right_child_key': new_id_right, 65 | } 66 | 67 | return node, (left, new_id_left), (right, new_id_right) 68 | 69 | 70 | def parse_pb_tree(tree): 71 | """Parse s single Py-Boost Tree to treelite.ModelBuilder.Tree format 72 | 73 | Args: 74 | tree: Py-Boost tree 75 | 76 | Returns: 77 | treelite.ModelBuilder.Tree 78 | """ 79 | assert tree.ngroups == 1, 'Models with more than 1 group are not currently supported' 80 | 81 | def id_generator(): 82 | id_num = 1 83 | while True: 84 | yield id_num 85 | id_num += 1 86 | id_gen = id_generator() 87 | 88 | tl_tree = treelite.ModelBuilder.Tree() 89 | curr_nodes = [(0, 0)] # (old_id, new_id) 90 | 91 | while len(curr_nodes) > 0: 92 | old_id, new_id = curr_nodes.pop(0) 93 | curr_node, left, right = create_node(tree, old_id, id_gen) 94 | tl_tree[new_id].set_numerical_test_node(**curr_node) 95 | 96 | if left[0] >= 0: 97 | curr_nodes.append(left) 98 | else: 99 | tl_tree[left[1]].set_leaf_node(tree.values[abs(left[0]) - 1]) 100 | if right[0] >= 0: 101 | curr_nodes.append(right) 102 | else: 103 | tl_tree[right[1]].set_leaf_node(tree.values[abs(right[0]) - 1]) 104 | 105 | tl_tree[0].set_root() 106 | return tl_tree 107 | 108 | 109 | def _parse_pb_tree_deprecated(tree): 110 | """Parse s single Py-Boost Tree to treelite.ModelBuilder.Tree format 111 | 112 | Args: 113 | tree: Py-Boost tree 114 | 115 | Returns: 116 | treelite.ModelBuilder.Tree 117 | """ 118 | assert tree.ngroups == 1, 'Models with more than 1 group are not currently supported' 119 | 120 | tl_tree = treelite.ModelBuilder.Tree() 121 | curr_nodes = [0] 122 | 123 | while len(curr_nodes) > 0: 124 | 125 | next_nodes = [] 126 | 127 | for node_id in curr_nodes: 128 | 129 | curr_node, left, right = _create_node_deprecated(tree, node_id) 130 | # add node 131 | tl_tree[node_id] 132 | if left is not None: 133 | tl_tree[node_id].set_numerical_test_node(**curr_node) 134 | next_nodes.extend([left, right]) 135 | else: 136 | tl_tree[node_id].set_leaf_node(curr_node['value']) 137 | 138 | curr_nodes = next_nodes 139 | 140 | tl_tree[0].set_root() 141 | 142 | return tl_tree 143 | 144 | 145 | def convert_pb_to_treelite(model): 146 | """Convert Py-Boost Ensemble instance to the treelite.ModelBuilder.Tree 147 | 148 | Args: 149 | model: Py-Boost Tree 150 | 151 | Returns: 152 | treelite.ModelBuilder.Tree 153 | """ 154 | nfeats = model.nfeats 155 | ngroups = model.models[0].values.shape[1] 156 | 157 | builder = treelite.ModelBuilder( 158 | num_feature=nfeats, 159 | num_class=ngroups, 160 | pred_transform='identity_multiclass' if ngroups > 1 else 'identity' 161 | ) 162 | 163 | for tree in tqdm(model.models): 164 | builder.append(parse_pb_tree(tree)) 165 | 166 | # add bias tree 167 | bias_tree = treelite.ModelBuilder.Tree() 168 | bias_tree[0].set_numerical_test_node(**{ 169 | 170 | 
'feature_id': 0, 171 | 'opname': '<', 172 | 'threshold': 0, 173 | 'default_left': True, 174 | 'left_child_key': 1, 175 | 'right_child_key': 2 176 | }) 177 | 178 | for i in range(1, 3): 179 | bias_tree[i].set_leaf_node(model.base_score) 180 | 181 | bias_tree[0].set_root() 182 | builder.append(bias_tree) 183 | 184 | return builder 185 | 186 | 187 | class TLCompiledPredictor: 188 | """ 189 | Compiled treelite model saved to predict 190 | """ 191 | 192 | @staticmethod 193 | def _default_postprocess_fn(x): 194 | return x 195 | 196 | def __init__(self, libpath, nthread=None, verbose=False, postprocess_fn=None): 197 | """ 198 | 199 | Args: 200 | libpath: str, path to compiled model 201 | nthread: int or None, number of threads to use 202 | verbose: bool, verbosity mode 203 | postprocess_fn: Callable or None, prediction postprocessing function 204 | """ 205 | self.verbose = verbose 206 | self.nthread = nthread 207 | self.libpath = None 208 | self.set_libpath(libpath) 209 | 210 | self.postprocess_fn = self._default_postprocess_fn 211 | if postprocess_fn is not None: 212 | self.postprocess_fn = postprocess_fn 213 | 214 | def predict(self, X): 215 | """Make prediction 216 | 217 | Args: 218 | X: np.ndarray 219 | 220 | Returns: 221 | np.ndarray 222 | """ 223 | pred = self.predictor.predict(tl_run.DMatrix(X)) 224 | return self.postprocess_fn(pred) 225 | 226 | def set_libpath(self, libpath=None, nthread=None): 227 | """Update library path 228 | 229 | Args: 230 | libpath: 231 | nthread: int, num threads 232 | 233 | Returns: 234 | 235 | """ 236 | if libpath is None: 237 | libpath = self.libpath 238 | if nthread is None: 239 | nthread = self.nthread 240 | self.libpath = os.path.abspath(libpath) 241 | self.predictor = tl_run.Predictor(self.libpath, nthread=nthread, verbose=self.verbose) 242 | 243 | def dump(self, filename): 244 | """Dump instance 245 | 246 | Args: 247 | filename: str, path to save 248 | 249 | Returns: 250 | 251 | """ 252 | self.predictor = None 253 | joblib.dump(self, filename) 254 | 255 | @staticmethod 256 | def load(filename): 257 | """Load instance 258 | 259 | Args: 260 | filename: str, filename 261 | 262 | Returns: 263 | TLCompiledPredictor 264 | """ 265 | predictor = joblib.load(filename) 266 | predictor.set_libpath() 267 | 268 | return predictor 269 | 270 | 271 | class TLPredictor: 272 | """ 273 | Treelite predictor. Could be use for inference via built-in treelite utils 274 | or to compilation to get TLCompiledPredictor 275 | """ 276 | 277 | def __init__(self, model, postprocess_fn=None): 278 | """ 279 | 280 | Args: 281 | model: Py-Boost Ensemble 282 | postprocess_fn: Callable or None, postprocessing function 283 | """ 284 | model.to_cpu() 285 | self.tl_model = convert_pb_to_treelite(model).commit() 286 | 287 | self.postprocess_fn = postprocess_fn 288 | if postprocess_fn is None: 289 | self.postprocess_fn = model.postprocess_fn 290 | 291 | def set_tl_model(self, tl_model): 292 | """Update underlying treelite model 293 | 294 | Args: 295 | tl_model: 296 | 297 | Returns: 298 | 299 | """ 300 | self.tl_model = tl_model 301 | 302 | def compile( 303 | self, 304 | toolchain, 305 | libpath, 306 | params=None, 307 | compiler='ast_native', 308 | verbose=False, 309 | nthread=None, 310 | options=None, 311 | predictor_params=None 312 | ): 313 | """Compile model for faster inference. 
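        A typical call (assuming a local gcc toolchain is available) looks like
        `TLPredictor(model).compile('gcc', './model.so')`, which exports a compiled
        shared library and returns a TLCompiledPredictor bound to it.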
For the details please see 314 | https://treelite.readthedocs.io/en/latest/tutorials/first.html 315 | 316 | Args: 317 | toolchain: 318 | libpath: 319 | params: 320 | compiler: 321 | verbose: 322 | nthread: 323 | options: 324 | predictor_params: 325 | 326 | Returns: 327 | 328 | """ 329 | 330 | params = {} if params is None else params 331 | params = {**{'parallel_comp': os.cpu_count(), }, **params} 332 | 333 | self.tl_model.export_lib(toolchain, libpath, 334 | params, compiler, verbose, nthread, options) 335 | 336 | if predictor_params is None: 337 | predictor_params = {} 338 | predictor_params = {**{'nthread': nthread}, **predictor_params} 339 | 340 | predictor = TLCompiledPredictor(libpath, postprocess_fn=self.postprocess_fn, **predictor_params) 341 | return predictor 342 | 343 | def predict(self, X, nthread=None): 344 | """Make prediction 345 | 346 | Args: 347 | X: np.ndarray 348 | nthread: int/None, used for prediction 349 | 350 | Returns: 351 | np.ndarray 352 | """ 353 | if nthread is None: 354 | nthread = os.cpu_count() 355 | pred = treelite.gtil.predict(self.tl_model, X, nthread=nthread) 356 | return self.postprocess_fn(pred) 357 | 358 | def dump(self, dirname, rewrite=False): 359 | """Dump treelite Model and predictor instance 360 | 361 | Args: 362 | dirname: str, path to save 363 | rewrite: bool, possible to overwrite 364 | 365 | Returns: 366 | 367 | """ 368 | os.makedirs(dirname, exist_ok=rewrite) 369 | temp = self.tl_model 370 | self.tl_model = None 371 | temp.serialize(os.path.join(dirname, 'model.mod')) 372 | joblib.dump(self, os.path.join(dirname, 'predictor.pkl')) 373 | self.tl_model = temp 374 | 375 | @staticmethod 376 | def load(dirname): 377 | """Load predictor from folder 378 | 379 | Args: 380 | dirname: str, path 381 | 382 | Returns: 383 | TLPredictor 384 | """ 385 | predictor = joblib.load(os.path.join(dirname, 'predictor.pkl')) 386 | predictor.set_tl_model(treelite.Model.deserialize(os.path.join(dirname, 'model.mod'))) 387 | 388 | return predictor 389 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "Py-Boost" 3 | version = "0.5.1" 4 | description = "Python based GBDT" 5 | 6 | authors = [ 7 | "Vakhrushev Anton ", 8 | "Iosipoi Leonid", 9 | "Sergey Kupriyanov" 10 | ] 11 | 12 | readme = "README.md" 13 | 14 | repository = "https://github.com/sb-ai-lab/Py-Boost" 15 | classifiers = [ 16 | "Programming Language :: Python :: 3.8", 17 | "Programming Language :: Python :: 3.9", 18 | "Programming Language :: Python :: 3.10", 19 | "Programming Language :: Python :: 3.11", 20 | "Operating System :: OS Independent", 21 | "Intended Audience :: Science/Research", 22 | "Development Status :: 3 - Alpha", 23 | "Environment :: Console", 24 | "Natural Language :: English", 25 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 26 | ] 27 | 28 | [tool.poetry.dependencies] 29 | 30 | python = ">=3.8, <3.12" 31 | 32 | scikit-learn = ">=1" 33 | numpy = "*" 34 | joblib = "*" 35 | numba = "*" 36 | ujson = '*' 37 | 38 | pandas = ">=1" 39 | onnx = ">=1.16, <2" 40 | onnxruntime = ">=1.16, <2" 41 | treelite = "^3" 42 | treelite_runtime = "^3" 43 | 44 | tqdm = ">=4.64.1" 45 | 46 | [build-system] 47 | requires = ["poetry-core>=1.0.0"] 48 | build-backend = "poetry.core.masonry.api" -------------------------------------------------------------------------------- /tutorials/Tutorial_3_Custom_features.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## This tutorial shows how to build custom features in py-boost" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Imports" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import os\n", 24 | "# Optional: set the device to run\n", 25 | "os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\"\n", 26 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n", 27 | "\n", 28 | "os.makedirs('../data', exist_ok=True)\n", 29 | "\n", 30 | "import joblib\n", 31 | "from sklearn.datasets import make_regression\n", 32 | "\n", 33 | "# simple case - just one class is used\n", 34 | "from py_boost import GradientBoosting " 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "### Generate dummy regression data" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "CPU times: user 2.34 s, sys: 1.7 s, total: 4.05 s\n", 54 | "Wall time: 849 ms\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "%%time\n", 60 | "X, y = make_regression(150000, 100, n_targets=10, random_state=42)\n", 61 | "\n", 62 | "# we need non negative targets for this example\n", 63 | "y = y - y.min(axis=0)\n", 64 | "\n", 65 | "X_test, y_test = X[:50000], y[:50000]\n", 66 | "X, y = X[-50000:], y[-50000:]" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "### Custom Loss\n", 74 | "\n", 75 | "As it was mentioned in Tutorial_1, not only string alias is valid value for the loss function, but also the instance of Loss class, which is parent class for all loss function\n", 76 | "\n", 77 | "#### Now let's build our own MSLE (https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_log_error.html) loss function\n", 78 | "\n", 79 | "**Note**: Actually we have the built-in MSLE, so you still could use strinng alias for it" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 3, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "import cupy as cp\n", 89 | "from py_boost.gpu.losses import Loss, Metric\n", 90 | "\n", 91 | "class CustomRMSLEMetric(Metric):\n", 92 | " \"\"\"First, let's define eval metric to estimate model quality while training\"\"\"\n", 93 | " \n", 94 | " def error(self, y_true, y_pred):\n", 95 | " \"\"\"\n", 96 | " The simpliest way do define a custom metric is to define .error method\n", 97 | " Just tell py_boost how to calculate error at the each point, for out case it is possible\n", 98 | " If it is not possible (for ex. 
ROC-AUC), you should define __call__ method\n", 99 | " See the Metric class for the details\n", 100 | " \n", 101 | " At that stage y_true is already in GPU memory, so we use CuPy to handle it.\n", 102 | " Usage is the same as NumPy, just replace np with cp\n", 103 | " \n", 104 | " Note: the metric is calculated against processed input (see CustomMSLELoss below)\n", 105 | " \"\"\"\n", 106 | " return (cp.log1p(y_true) - cp.log1p(y_pred)) ** 2\n", 107 | " \n", 108 | " def compare(self, v0 ,v1):\n", 109 | " \"\"\"\n", 110 | " The last required method is .compare\n", 111 | " It should return True if v0 metric value is better than v1, False othewise\n", 112 | " \"\"\"\n", 113 | " return v0 < v1\n", 114 | " \n", 115 | " def __call__(self, y_true, y_pred, sample_weight=None):\n", 116 | " \"\"\"\n", 117 | " We also update __call__ method to redefine default reduction with square\n", 118 | " \"\"\"\n", 119 | " return super().__call__(y_true, y_pred, sample_weight) ** .5\n", 120 | "\n", 121 | "\n", 122 | "class CustomMSLELoss(Loss):\n", 123 | " \"\"\"Custom MSLE Implementation\"\"\"\n", 124 | " \n", 125 | " def preprocess_input(self, y_true):\n", 126 | " \"\"\"\n", 127 | " This method defines, how raw target should be processed before the train starts\n", 128 | " We expect y_true has shape (n_samples, n_outputs)\n", 129 | " \n", 130 | " Here we will not do the actual preprocess, but just check if targets are non negative\n", 131 | " \n", 132 | " At that stage y_true is already in GPU memory, so we use CuPy to handle it.\n", 133 | " Usage is the same as NumPy, just replace np with cp\n", 134 | " \n", 135 | " Note: All metrics and losses will be computed with this preprocess target\n", 136 | " \"\"\"\n", 137 | " assert (y_true >= 0).all()\n", 138 | " return y_true\n", 139 | " \n", 140 | " def postprocess_output(self, y_pred):\n", 141 | " \"\"\"\n", 142 | " Since we modify the target variable, we also need method, that defines \n", 143 | " how to process model prediction\n", 144 | " \"\"\"\n", 145 | " \n", 146 | " return cp.expm1(y_pred)\n", 147 | " \n", 148 | " def get_grad_hess(self, y_true, y_pred):\n", 149 | " \"\"\"\n", 150 | " This method defines how to calculate gradients and hessians for given loss\n", 151 | " Note that training also supports sample_weight, but its applied outside the loss fn,\n", 152 | " so we don't need to handle it here\n", 153 | " \"\"\" \n", 154 | " # grad should have the same shape as y_pred\n", 155 | " grad = y_pred - cp.log1p(y_true)\n", 156 | " # NOTE: Input could be a matrix in multioutput case!\n", 157 | " # But anyway - hessians are ones for all of them\n", 158 | " # So, we just create (n_samples, 1) array of ones \n", 159 | " # and after that is will be broadcasted over all outputs\n", 160 | " # grad should have the same shape as y_pred or (n_samples, 1)\n", 161 | " hess = cp.ones((y_true.shape[0], 1), dtype=cp.float32)\n", 162 | " \n", 163 | " return grad, hess\n", 164 | "\n", 165 | " def base_score(self, y_true):\n", 166 | " \"\"\"\n", 167 | " One last thing we require to define is base score\n", 168 | " This method defines how to initialize an empty ensemble\n", 169 | " In simplies case it could be just an array of zeros\n", 170 | " But usualy it is better to boost from mean values\n", 171 | " Output shape should be (n_outputs, ) \n", 172 | " \n", 173 | " Note: y_true is already processed array here\n", 174 | " \n", 175 | " \"\"\"\n", 176 | " return cp.log1p(y_true).mean(axis=0)\n", 177 | " \n", 178 | " \n" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 
| "execution_count": 4, 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "name": "stdout", 188 | "output_type": "stream", 189 | "text": [ 190 | "[14:33:29] Stdout logging level is INFO.\n", 191 | "[14:33:29] GDBT train starts. Max iter 1000, early stopping rounds 100\n", 192 | "[14:33:30] Iter 0; Sample 0, score = 0.24603539557907483; \n", 193 | "[14:33:32] Iter 100; Sample 0, score = 0.1742483282481912; \n", 194 | "[14:33:35] Iter 200; Sample 0, score = 0.1342659688820449; \n", 195 | "[14:33:38] Iter 300; Sample 0, score = 0.10731344416487074; \n", 196 | "[14:33:40] Iter 400; Sample 0, score = 0.08782596307881492; \n", 197 | "[14:33:43] Iter 500; Sample 0, score = 0.07353079220891415; \n", 198 | "[14:33:46] Iter 600; Sample 0, score = 0.06300246387723472; \n", 199 | "[14:33:48] Iter 700; Sample 0, score = 0.05525294291245993; \n", 200 | "[14:33:51] Iter 800; Sample 0, score = 0.049518312947738174; \n", 201 | "[14:33:53] Iter 900; Sample 0, score = 0.045306569498698365; \n", 202 | "[14:33:56] Iter 999; Sample 0, score = 0.04221849120683567; \n", 203 | "CPU times: user 29.4 s, sys: 4.47 s, total: 33.9 s\n", 204 | "Wall time: 32.2 s\n" 205 | ] 206 | }, 207 | { 208 | "data": { 209 | "text/plain": [ 210 | "" 211 | ] 212 | }, 213 | "execution_count": 4, 214 | "metadata": {}, 215 | "output_type": "execute_result" 216 | } 217 | ], 218 | "source": [ 219 | "%%time\n", 220 | "model = GradientBoosting(CustomMSLELoss(), CustomRMSLEMetric(), lr=0.01, verbose=100, ntrees=1000)\n", 221 | "\n", 222 | "model.fit(X, y, eval_sets=[{'X': X_test, 'y': y_test},])" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 5, 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "data": { 232 | "text/plain": [ 233 | "(50000, 10)" 234 | ] 235 | }, 236 | "execution_count": 5, 237 | "metadata": {}, 238 | "output_type": "execute_result" 239 | } 240 | ], 241 | "source": [ 242 | "model.predict(X_test).shape" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "### Custom colsample strategy\n", 250 | "\n", 251 | "We could also redefine some other things. Let's see the example of creating our bagging strategy. Most of custom things should be done via Callbak. \n", 252 | "\n", 253 | "To create callback we should inherit Callbak class. There are 4 methods, that could be redefined:\n", 254 | " - before_train - outputs None\n", 255 | " - before_iteration - outputs None\n", 256 | " - after_train - outputs None\n", 257 | " - after_iteration - outputs bool - if training should be stopped after iteration\n", 258 | "\n", 259 | " Methods receive build_info - the state dict, that could be accessed and modifier\n", 260 | "\n", 261 | " Basic build info structure:\n", 262 | "\n", 263 | " build_info = {\n", 264 | " 'data': {\n", 265 | " 'train': {\n", 266 | " 'features_cpu': np.ndarray - raw feature matrix,\n", 267 | " 'features_gpu': cp.ndarray - uint8 quantized feature matrix on GPU,\n", 268 | " 'target': y - cp.ndarray - processed target variable on GPU,\n", 269 | " 'sample_weight': cp.ndarray - processed sample_weight on GPU or None,\n", 270 | " 'ensemble': cp.ndarray - current model prediction (with no postprocessing,\n", 271 | " ex. 
before sigmoid for logloss) on GPU,\n", 272 | "                'grad': cp.ndarray of gradients on GPU, before the first iteration - None,\n", 273 | "                'hess': cp.ndarray of hessians on GPU, before the first iteration - None,\n", 274 | "\n", 275 | "                'last_tree': {\n", 276 | "                    'leaves': cp.ndarray - node indices of the last trained tree,\n", 277 | "                    'preds': cp.ndarray - predictions of the last trained tree,\n", 278 | "                }\n", 279 | "\n", 280 | "            },\n", 281 | "            'valid': {\n", 282 | "                'features_cpu': the same as train, but a list, each element corresponds to a validation sample,\n", 283 | "                'features_gpu': ...,\n", 284 | "                'target': ...,\n", 285 | "                'sample_weight': ...,\n", 286 | "                'ensemble': ...,\n", 287 | "\n", 288 | "                'last_tree': {\n", 289 | "                    'leaves': ...,\n", 290 | "                    'preds': ...,\n", 291 | "                }\n", 292 | "\n", 293 | "            }\n", 294 | "        },\n", 295 | "        'borders': list of np.ndarray - list of quantization borders,\n", 296 | "        'model': GradientBoosting - the model that is being trained,\n", 297 | "        'mempool': cp.cuda.MemoryPool - memory pool used for training, can be used to free memory to prevent OOM,\n", 298 | "        'builder': DepthwiseTreeBuilder - the instance of the tree builder, contains training params,\n", 299 | "\n", 300 | "        'num_iter': int, current iteration number,\n", 301 | "        'iter_scores': list of float - metric values for all validation sets at the last iteration,\n", 302 | "    }\n" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 6, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "import cupy as cp\n", 312 | "from py_boost.callbacks.callback import Callback\n", 313 | "\n", 314 | "class ColumnImportanceSampler(Callback):\n", 315 | "    \"\"\"\n", 316 | "    This class implements a sampling strategy \n", 317 | "    that samples columns in proportion to their importance at each step\n", 318 | "    \n", 319 | "    We should implement the __call__ method to use it as a sampler\n", 320 | "    \"\"\"\n", 321 | "    def __init__(self, rate=0.5, smooth=0.1, \n", 322 | "                 update_freq=10, inverse=False):\n", 323 | "        \"\"\"\n", 324 | "        \n", 325 | "        Args:\n", 326 | "            rate: float, sampling rate\n", 327 | "            smooth: float, smoothing parameter\n", 328 | "            update_freq: int, importance update frequency\n", 329 | "            inverse: bool, invert the sampling probabilities\n", 330 | "\n", 331 | "        Returns:\n", 332 | "\n", 333 | "        \"\"\"\n", 334 | "        # Custom column sampler based on feature importance\n", 335 | "        self.rate = rate\n", 336 | "        self.smooth = smooth\n", 337 | "        self.update_freq = update_freq\n", 338 | "        self.inverse = inverse\n", 339 | "    \n", 340 | "    def before_iteration(self, build_info):\n", 341 | "        \"\"\"\n", 342 | "        Define what should be done before each iteration\n", 343 | "        \"\"\"\n", 344 | "        # Update feature importance\n", 345 | "        num_iter = build_info['num_iter']\n", 346 | "        \n", 347 | "        if (num_iter % self.update_freq) == 0:\n", 348 | "            # update probabilities with the actual importance\n", 349 | "            p = build_info['model'].get_feature_importance() + 1e-3\n", 350 | "            p = cp.asarray(p) / (p.sum())\n", 351 | "            # inverse if needed\n", 352 | "            if self.inverse:\n", 353 | "                p = 1 - p\n", 354 | "                p = p / p.sum()\n", 355 | "            # apply smoothing\n", 356 | "            self.p = p * (1 - self.smooth) + cp.ones_like(p) * self.smooth / p.shape[0]\n", 357 | "    \n", 358 | "    def __call__(self):\n", 359 | "        \"\"\"\n", 360 | "        The method should return the array of indices that will be used\n", 361 | "        to grow the tree at the current step\n", 362 | "        \"\"\"\n", 363 | "        # Sample columns\n", 364 | "        n = self.p.shape[0]\n", 365 | "        index = 
cp.random.choice(cp.arange(n, dtype=cp.uint64), \n", 366 | " size=int(self.rate * n), p=self.p)\n", 367 | " \n", 368 | " return index" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 7, 374 | "metadata": {}, 375 | "outputs": [ 376 | { 377 | "name": "stdout", 378 | "output_type": "stream", 379 | "text": [ 380 | "[14:33:57] Stdout logging level is INFO.\n", 381 | "[14:33:57] GDBT train starts. Max iter 1000, early stopping rounds 100\n", 382 | "[14:33:57] Iter 0; Sample 0, score = 0.24644921389665553; \n", 383 | "[14:33:59] Iter 100; Sample 0, score = 0.17590711477798346; \n", 384 | "[14:34:00] Iter 200; Sample 0, score = 0.13484779001390923; \n", 385 | "[14:34:02] Iter 300; Sample 0, score = 0.10826939489014992; \n", 386 | "[14:34:03] Iter 400; Sample 0, score = 0.08943576705705947; \n", 387 | "[14:34:05] Iter 500; Sample 0, score = 0.0753772653073726; \n", 388 | "[14:34:07] Iter 600; Sample 0, score = 0.06446689810058637; \n", 389 | "[14:34:08] Iter 700; Sample 0, score = 0.05593631183289121; \n", 390 | "[14:34:10] Iter 800; Sample 0, score = 0.04973719737069171; \n", 391 | "[14:34:12] Iter 900; Sample 0, score = 0.045194617065396514; \n", 392 | "[14:34:13] Iter 999; Sample 0, score = 0.041950662857688406; \n" 393 | ] 394 | }, 395 | { 396 | "data": { 397 | "text/plain": [ 398 | "" 399 | ] 400 | }, 401 | "execution_count": 7, 402 | "metadata": {}, 403 | "output_type": "execute_result" 404 | } 405 | ], 406 | "source": [ 407 | "# create model with new sampler \n", 408 | "# if we pass new sampler to the colsample argument it will used instead of default\n", 409 | "# it will also be added to the callback pipeline automatically\n", 410 | "# you should not pass samplers to the callbacks argument\n", 411 | "\n", 412 | "model = GradientBoosting(CustomMSLELoss(), CustomRMSLEMetric(), \n", 413 | " colsample=ColumnImportanceSampler(0.5), \n", 414 | " lr=0.01, verbose=100, ntrees=1000 )\n", 415 | "\n", 416 | "model.fit(X, y, eval_sets=[{'X': X_test, 'y': y_test},])" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": 8, 422 | "metadata": {}, 423 | "outputs": [ 424 | { 425 | "data": { 426 | "text/plain": [ 427 | "(50000, 10)" 428 | ] 429 | }, 430 | "execution_count": 8, 431 | "metadata": {}, 432 | "output_type": "execute_result" 433 | } 434 | ], 435 | "source": [ 436 | "model.predict(X_test).shape" 437 | ] 438 | } 439 | ], 440 | "metadata": { 441 | "kernelspec": { 442 | "display_name": "rapids-22.06", 443 | "language": "python", 444 | "name": "rapids-22.06" 445 | }, 446 | "language_info": { 447 | "codemirror_mode": { 448 | "name": "ipython", 449 | "version": 3 450 | }, 451 | "file_extension": ".py", 452 | "mimetype": "text/x-python", 453 | "name": "python", 454 | "nbconvert_exporter": "python", 455 | "pygments_lexer": "ipython3", 456 | "version": "3.9.13" 457 | } 458 | }, 459 | "nbformat": 4, 460 | "nbformat_minor": 2 461 | } 462 | -------------------------------------------------------------------------------- /tutorials/Tutorial_4_Handle_null_targets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## This tutorial shows how to handle NaN targets in multioutput tasks" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Imports" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | 
"import os\n", 24 | "# Optional: set the device to run\n", 25 | "os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\"\n", 26 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"1\"\n", 27 | "\n", 28 | "os.makedirs('../data', exist_ok=True)\n", 29 | "import numpy as np\n", 30 | "import joblib\n", 31 | "from sklearn.datasets import make_regression\n", 32 | "\n", 33 | "# simple case - just one class is used\n", 34 | "from py_boost import GradientBoosting\n", 35 | "from py_boost.multioutput.sketching import *" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "### Generate dummy multilabel task with NaN values in target\n", 43 | "\n", 44 | "Some times it happends that some target values in multioutput task are missing. For example, you are solving multilabel task and some labels are unknown for some of the rows, so acually your target could be one of 0/1/NaN. Normaly you can not using ML algorithms directly in that case, so you can do one of the following:\n", 45 | "\n", 46 | "- Drop NaN rows, but that case you are going to miss some part of the data\n", 47 | "- Train binary models separately, but your model will be more complex and probably overfitted\n", 48 | "- Fill NaNs with 0 or 1, so your labeling will become wrong\n", 49 | "- Use Neural Networks with masked loss function\n", 50 | "\n", 51 | "In Py-Boost you can write the loss wrapper to handle such scenario and train your model directly on known labels ignoring NaNs, and here is shown how.\n", 52 | "\n", 53 | "We will create it as the regression task and then thresholding the target. And then add some random NaNs" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 2, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "name": "stdout", 63 | "output_type": "stream", 64 | "text": [ 65 | "CPU times: user 2.33 s, sys: 1.66 s, total: 3.99 s\n", 66 | "Wall time: 876 ms\n" 67 | ] 68 | } 69 | ], 70 | "source": [ 71 | "%%time\n", 72 | "X, y = make_regression(150000, 100, n_targets=10, random_state=42)\n", 73 | "# binarize\n", 74 | "y = (y > y.mean(axis=0)).astype(np.float32)\n", 75 | "# add some NaNs\n", 76 | "y[np.random.rand(150000, 10) > 0.5] = np.nan\n", 77 | "\n", 78 | "\n", 79 | "X_test, y_test = X[:50000], y[:50000]\n", 80 | "X, y = X[-50000:], y[-50000:]" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "### NaN loss and metric wrappers\n", 88 | "\n", 89 | "Here it is shown how to write loss wrapper ignoring NaNs" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 3, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "import cupy as cp\n", 99 | "from py_boost.gpu.losses import BCELoss\n", 100 | "\n", 101 | "class BCEWithNaNLoss(BCELoss):\n", 102 | " \n", 103 | " def base_score(self, y_true):\n", 104 | " # Replace .mean with nanmean function to calc base score\n", 105 | " means = cp.clip(cp.nanmean(y_true, axis=0), self.clip_value, 1 - self.clip_value)\n", 106 | " return cp.log(means / (1 - means))\n", 107 | " \n", 108 | " def get_grad_hess(self, y_true, y_pred):\n", 109 | " # first, get nan mask for y_true\n", 110 | " mask = cp.isnan(y_true)\n", 111 | " # then, compute loss with any values at nan places just to prevent the exception\n", 112 | " grad, hess = super().get_grad_hess(cp.where(mask, 0, y_true), y_pred)\n", 113 | " # invert mask\n", 114 | " mask = (~mask).astype(cp.float32)\n", 115 | " # multiply grad and hess on inverted mask\n", 116 | " # now grad and hess eq. 
0 on NaN points\n", 117 | " # that actually means that prediction on that place should not be updated\n", 118 | " grad = grad * mask\n", 119 | " hess = hess * mask\n", 120 | " \n", 121 | " return grad, hess\n" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "And here is column-wise roc-auc metric ignoring NaNs" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 4, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "from py_boost.gpu.losses.metrics import Metric, auc\n", 138 | "\n", 139 | "class NaNAucMetric(Metric):\n", 140 | " \n", 141 | " def __call__(self, y_true, y_pred, sample_weight=None):\n", 142 | " \n", 143 | " aucs = []\n", 144 | " mask = ~cp.isnan(y_true)\n", 145 | " \n", 146 | " for i in range(y_true.shape[1]):\n", 147 | " m = mask[:, i]\n", 148 | " w = None if sample_weight is None else sample_weight[:, 0][m]\n", 149 | " aucs.append(\n", 150 | " auc(y_true[:, i][m], y_pred[:, i][m], w)\n", 151 | " )\n", 152 | " \n", 153 | " return np.mean(aucs)\n", 154 | " \n", 155 | " def compare(self, v0 ,v1):\n", 156 | "\n", 157 | " return v0 > v1 \n", 158 | " \n", 159 | " \n", 160 | " " 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 5, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "[08:31:38] Stdout logging level is INFO.\n", 173 | "[08:31:38] GDBT train starts. Max iter 1000, early stopping rounds 200\n", 174 | "[08:31:39] Iter 0; Sample 0, score = 0.7906884535541213; \n", 175 | "[08:31:41] Iter 100; Sample 0, score = 0.9687261163054176; \n", 176 | "[08:31:44] Iter 200; Sample 0, score = 0.9785187659166686; \n", 177 | "[08:31:46] Iter 300; Sample 0, score = 0.9844858052685057; \n", 178 | "[08:31:49] Iter 400; Sample 0, score = 0.9883780152591723; \n", 179 | "[08:31:51] Iter 500; Sample 0, score = 0.9908004122540589; \n", 180 | "[08:31:54] Iter 600; Sample 0, score = 0.9923353340683694; \n", 181 | "[08:31:57] Iter 700; Sample 0, score = 0.9935137491384962; \n", 182 | "[08:31:59] Iter 800; Sample 0, score = 0.9943018456130359; \n", 183 | "[08:32:02] Iter 900; Sample 0, score = 0.9949417958344802; \n", 184 | "[08:32:04] Iter 999; Sample 0, score = 0.9954331107999328; \n", 185 | "CPU times: user 32.1 s, sys: 1.59 s, total: 33.7 s\n", 186 | "Wall time: 31.9 s\n" 187 | ] 188 | }, 189 | { 190 | "data": { 191 | "text/plain": [ 192 | "" 193 | ] 194 | }, 195 | "execution_count": 5, 196 | "metadata": {}, 197 | "output_type": "execute_result" 198 | } 199 | ], 200 | "source": [ 201 | "%%time\n", 202 | "model = GradientBoosting(BCEWithNaNLoss(), NaNAucMetric(), lr=0.01,\n", 203 | " verbose=100, ntrees=1000, es=200, multioutput_sketch=RandomProjectionSketch(1))\n", 204 | "\n", 205 | "model.fit(X, y, eval_sets=[{'X': X_test, 'y': y_test},])" 206 | ] 207 | } 208 | ], 209 | "metadata": { 210 | "kernelspec": { 211 | "display_name": "rapids-22.06", 212 | "language": "python", 213 | "name": "rapids-22.06" 214 | }, 215 | "language_info": { 216 | "codemirror_mode": { 217 | "name": "ipython", 218 | "version": 3 219 | }, 220 | "file_extension": ".py", 221 | "mimetype": "text/x-python", 222 | "name": "python", 223 | "nbconvert_exporter": "python", 224 | "pygments_lexer": "ipython3", 225 | "version": "3.9.13" 226 | } 227 | }, 228 | "nbformat": 4, 229 | "nbformat_minor": 2 230 | } 231 | -------------------------------------------------------------------------------- /tutorials/Tutorial_5_ONNX_inference.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "e75f5c6d", 6 | "metadata": {}, 7 | "source": [ 8 | "## This tutorial shows how to convert your model to ONNX and use for CPU inference" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "8f07b0d0", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import os\n", 19 | "# Optional: set the device to run\n", 20 | "os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\"\n", 21 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n", 22 | "\n", 23 | "os.makedirs('../data', exist_ok=True)\n", 24 | "\n", 25 | "import numpy as np\n", 26 | "import joblib\n", 27 | "import onnxruntime\n", 28 | "\n", 29 | "from sklearn.datasets import make_regression\n", 30 | "\n", 31 | "from py_boost import GradientBoosting\n", 32 | "from py_boost import pb_to_onnx, ONNXPredictor" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "id": "3aa67d64", 38 | "metadata": {}, 39 | "source": [ 40 | "### Generate dummy multilabel task and train the model" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 2, 46 | "id": "12c1d5ec", 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "[15:37:33] Stdout logging level is INFO.\n", 54 | "[15:37:33] GDBT train starts. Max iter 100, early stopping rounds 200\n", 55 | "[15:37:34] Iter 0; \n", 56 | "[15:37:37] Iter 99; \n", 57 | "CPU times: user 15.1 s, sys: 1.85 s, total: 16.9 s\n", 58 | "Wall time: 10.1 s\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "%%time\n", 64 | "X, y = make_regression(150000, 100, n_targets=5, random_state=42)\n", 65 | "# binarize\n", 66 | "y = (y > y.mean(axis=0)).astype(np.float32)\n", 67 | "\n", 68 | "model = GradientBoosting(\n", 69 | " 'bce', lr=0.01, verbose=100, \n", 70 | " ntrees=100, es=200, \n", 71 | ")\n", 72 | "model.fit(X, y)\n", 73 | "pp = model.predict(X)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "id": "49202ae7", 79 | "metadata": {}, 80 | "source": [ 81 | "### Convert the model to ONNX\n", 82 | "\n", 83 | "The simpliest way to convert is using `pb_to_onnx` function. 
Just pass the `py-boost` model and path to store parsed model" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 3, 89 | "id": "21e10069", 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "name": "stderr", 94 | "output_type": "stream", 95 | "text": [ 96 | "100%|██████████| 100/100 [00:00<00:00, 1723.04it/s]\n" 97 | ] 98 | } 99 | ], 100 | "source": [ 101 | "pb_to_onnx(model, '../data/pb_model.onnx')" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "id": "8d385bbc", 107 | "metadata": {}, 108 | "source": [ 109 | "Once the parsing is completed, you can run `onnxruntime` session for inference" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 4, 115 | "id": "7d7d4a0c", 116 | "metadata": {}, 117 | "outputs": [ 118 | { 119 | "name": "stdout", 120 | "output_type": "stream", 121 | "text": [ 122 | "CPU times: user 5.59 s, sys: 131 ms, total: 5.72 s\n", 123 | "Wall time: 395 ms\n" 124 | ] 125 | }, 126 | { 127 | "data": { 128 | "text/plain": [ 129 | "array([[0.6264308 , 0.41568166, 0.5388822 , 0.4261355 , 0.57804173],\n", 130 | " [0.59586126, 0.42369062, 0.56585 , 0.57584757, 0.5392887 ],\n", 131 | " [0.72726965, 0.67056704, 0.49255225, 0.6711969 , 0.635281 ],\n", 132 | " ...,\n", 133 | " [0.5112887 , 0.38028964, 0.4761739 , 0.52265 , 0.4513791 ],\n", 134 | " [0.67362005, 0.54282206, 0.62851644, 0.6090929 , 0.7003519 ],\n", 135 | " [0.56341565, 0.52830017, 0.41594115, 0.43341845, 0.42639387]],\n", 136 | " dtype=float32)" 137 | ] 138 | }, 139 | "execution_count": 4, 140 | "metadata": {}, 141 | "output_type": "execute_result" 142 | } 143 | ], 144 | "source": [ 145 | "%%time\n", 146 | "\n", 147 | "# start session\n", 148 | "sess = onnxruntime.InferenceSession(\n", 149 | " '../data/pb_model.onnx', \n", 150 | " providers=[\"CPUExecutionProvider\"]\n", 151 | ")\n", 152 | "\n", 153 | "# run inference\n", 154 | "preds = sess.run(['Y'], {'X': X.astype(np.float32, copy=False)})[0]\n", 155 | "preds = 1 / (1 + np.exp(-preds))\n", 156 | "preds" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 5, 162 | "id": "50d94691", 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "data": { 167 | "text/plain": [ 168 | "2.3841858e-07" 169 | ] 170 | }, 171 | "execution_count": 5, 172 | "metadata": {}, 173 | "output_type": "execute_result" 174 | } 175 | ], 176 | "source": [ 177 | "np.abs(preds - pp).max()" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "id": "37a4c052", 183 | "metadata": {}, 184 | "source": [ 185 | "***Note*** : by default, parser only collect the trees and base score info. So, it knows nothing about the postprocessing function, for example `sigmoid` in this case. That's why we apply sigmoid after inference part. But we can pass one of built-in `ONNX` post transforms: 'NONE', 'SOFTMAX', 'LOGISTIC', 'SOFTMAX_ZERO', or 'PROBIT' to avoid this step. 
Probably it is going to be more efficient:" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 6, 191 | "id": "9d32635e", 192 | "metadata": {}, 193 | "outputs": [ 194 | { 195 | "name": "stderr", 196 | "output_type": "stream", 197 | "text": [ 198 | "100%|██████████| 100/100 [00:00<00:00, 1670.84it/s]\n" 199 | ] 200 | }, 201 | { 202 | "name": "stdout", 203 | "output_type": "stream", 204 | "text": [ 205 | "CPU times: user 5.58 s, sys: 178 ms, total: 5.76 s\n", 206 | "Wall time: 583 ms\n" 207 | ] 208 | }, 209 | { 210 | "data": { 211 | "text/plain": [ 212 | "array([[0.62643087, 0.41568172, 0.5388822 , 0.42613554, 0.57804173],\n", 213 | " [0.5958613 , 0.42369062, 0.56584996, 0.57584757, 0.5392887 ],\n", 214 | " [0.72726965, 0.67056704, 0.49255228, 0.6711969 , 0.6352811 ],\n", 215 | " ...,\n", 216 | " [0.5112887 , 0.3802896 , 0.47617394, 0.52265 , 0.45137918],\n", 217 | " [0.67362005, 0.54282206, 0.6285165 , 0.6090929 , 0.7003519 ],\n", 218 | " [0.56341565, 0.5283001 , 0.41594112, 0.43341845, 0.42639393]],\n", 219 | " dtype=float32)" 220 | ] 221 | }, 222 | "execution_count": 6, 223 | "metadata": {}, 224 | "output_type": "execute_result" 225 | } 226 | ], 227 | "source": [ 228 | "%%time\n", 229 | "pb_to_onnx(model, '../data/pb_model.onnx', post_transform='LOGISTIC') # pass built-in post transform\n", 230 | "\n", 231 | "# start session\n", 232 | "sess = onnxruntime.InferenceSession(\n", 233 | " '../data/pb_model.onnx', \n", 234 | " providers=[\"CPUExecutionProvider\"]\n", 235 | ")\n", 236 | "\n", 237 | "# run inference\n", 238 | "preds = sess.run(['Y'], {'X': X.astype(np.float32, copy=False)})[0]\n", 239 | "preds" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 7, 245 | "id": "6efcaaa3", 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "data": { 250 | "text/plain": [ 251 | "2.3841858e-07" 252 | ] 253 | }, 254 | "execution_count": 7, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | "source": [ 260 | "np.abs(preds - pp).max()" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "id": "bdcc1486", 266 | "metadata": {}, 267 | "source": [ 268 | "***Filter outputs*** . Another option is to convert just a part of outputs to `ONNX`, for the case when we need only particular outputs for inference. 
For example, we want to keep only 0 and 2 outputs for inference and we don't want to compute the parts of model that doesn't affect the result:" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 8, 274 | "id": "080f1139", 275 | "metadata": {}, 276 | "outputs": [ 277 | { 278 | "name": "stderr", 279 | "output_type": "stream", 280 | "text": [ 281 | "100%|██████████| 100/100 [00:00<00:00, 2039.98it/s]\n" 282 | ] 283 | }, 284 | { 285 | "name": "stdout", 286 | "output_type": "stream", 287 | "text": [ 288 | "CPU times: user 5.31 s, sys: 178 ms, total: 5.48 s\n", 289 | "Wall time: 528 ms\n" 290 | ] 291 | }, 292 | { 293 | "data": { 294 | "text/plain": [ 295 | "array([[0.62643087, 0.5388822 ],\n", 296 | " [0.5958613 , 0.56584996],\n", 297 | " [0.72726965, 0.49255228],\n", 298 | " ...,\n", 299 | " [0.5112887 , 0.47617394],\n", 300 | " [0.67362005, 0.6285165 ],\n", 301 | " [0.56341565, 0.41594112]], dtype=float32)" 302 | ] 303 | }, 304 | "execution_count": 8, 305 | "metadata": {}, 306 | "output_type": "execute_result" 307 | } 308 | ], 309 | "source": [ 310 | "%%time\n", 311 | "pb_to_onnx(model, '../data/pb_model.onnx', fltr=[0, 2], post_transform='LOGISTIC') # pass array to filter outputs\n", 312 | "\n", 313 | "# start session\n", 314 | "sess = onnxruntime.InferenceSession(\n", 315 | " '../data/pb_model.onnx', \n", 316 | " providers=[\"CPUExecutionProvider\"]\n", 317 | ")\n", 318 | "\n", 319 | "# run inference\n", 320 | "preds = sess.run(['Y'], {'X': X.astype(np.float32, copy=False)})[0]\n", 321 | "preds" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 9, 327 | "id": "cbf704d7", 328 | "metadata": {}, 329 | "outputs": [ 330 | { 331 | "data": { 332 | "text/plain": [ 333 | "1.937151e-07" 334 | ] 335 | }, 336 | "execution_count": 9, 337 | "metadata": {}, 338 | "output_type": "execute_result" 339 | } 340 | ], 341 | "source": [ 342 | "np.abs(preds - pp[:, [0, 2]]).max()" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "id": "735bb0d8", 348 | "metadata": {}, 349 | "source": [ 350 | "### Built-in wrapper\n", 351 | "\n", 352 | "As an alternative you can use wrapper that hide all the manipulations with `ONNX` and let you just call fit and predict. 
You can build wrapper from model:" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 10, 358 | "id": "5e5b46e7", 359 | "metadata": {}, 360 | "outputs": [ 361 | { 362 | "name": "stderr", 363 | "output_type": "stream", 364 | "text": [ 365 | "100%|██████████| 100/100 [00:00<00:00, 1909.37it/s]\n" 366 | ] 367 | } 368 | ], 369 | "source": [ 370 | "onnx_predictor = ONNXPredictor(\n", 371 | " model, '../data/pb_model.onnx', \n", 372 | " fltr=[0, 2], \n", 373 | ")" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": 11, 379 | "id": "c506b652", 380 | "metadata": {}, 381 | "outputs": [ 382 | { 383 | "name": "stdout", 384 | "output_type": "stream", 385 | "text": [ 386 | "CPU times: user 4.71 s, sys: 156 ms, total: 4.86 s\n", 387 | "Wall time: 328 ms\n" 388 | ] 389 | }, 390 | { 391 | "data": { 392 | "text/plain": [ 393 | "array([[0.6264308 , 0.5388822 ],\n", 394 | " [0.59586126, 0.56585 ],\n", 395 | " [0.72726965, 0.49255225],\n", 396 | " ...,\n", 397 | " [0.5112887 , 0.4761739 ],\n", 398 | " [0.67362005, 0.62851644],\n", 399 | " [0.56341565, 0.41594115]], dtype=float32)" 400 | ] 401 | }, 402 | "execution_count": 11, 403 | "metadata": {}, 404 | "output_type": "execute_result" 405 | } 406 | ], 407 | "source": [ 408 | "%%time\n", 409 | "preds = onnx_predictor.predict(X)\n", 410 | "preds" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": 12, 416 | "id": "13f372d7", 417 | "metadata": {}, 418 | "outputs": [ 419 | { 420 | "data": { 421 | "text/plain": [ 422 | "1.7881393e-07" 423 | ] 424 | }, 425 | "execution_count": 12, 426 | "metadata": {}, 427 | "output_type": "execute_result" 428 | } 429 | ], 430 | "source": [ 431 | "np.abs(preds - pp[:, [0, 2]]).max()" 432 | ] 433 | }, 434 | { 435 | "cell_type": "markdown", 436 | "id": "2e6ddf9f", 437 | "metadata": {}, 438 | "source": [ 439 | "***Note*** : You can not save `ONNXPredictor` object, since `onnxruntime.InferenceSession` is not pickable. Instead, to use it in the other session, you can restore it from `ONNX` model file. 
But note that in this case you will lose the information about the postprocessing function if it was not provided as `post_transform` to `ONNXPredictor`.\n", 440 | "\n", 441 | "The first option is to provide the post_transform to `ONNXPredictor`:" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 13, 447 | "id": "3bb457f2", 448 | "metadata": {}, 449 | "outputs": [ 450 | { 451 | "name": "stderr", 452 | "output_type": "stream", 453 | "text": [ 454 | "100%|██████████| 100/100 [00:00<00:00, 2116.98it/s]\n" 455 | ] 456 | }, 457 | { 458 | "data": { 459 | "text/plain": [ 460 | "1.937151e-07" 461 | ] 462 | }, 463 | "execution_count": 13, 464 | "metadata": {}, 465 | "output_type": "execute_result" 466 | } 467 | ], 468 | "source": [ 469 | "# build the predictor and save the parsed model as ../data/pb_model.onnx\n", 470 | "onnx_predictor = ONNXPredictor(\n", 471 | "    model, '../data/pb_model.onnx', \n", 472 | "    fltr=[0, 2], \n", 473 | "    post_transform='LOGISTIC' # provide the ONNX post_transform manually\n", 474 | ")\n", 475 | "\n", 476 | "# create a new instance from ../data/pb_model.onnx\n", 477 | "onnx_predictor = ONNXPredictor.from_onnx('../data/pb_model.onnx')\n", 478 | "preds = onnx_predictor.predict(X)\n", 479 | "np.abs(preds - pp[:, [0, 2]]).max()" 480 | ] 481 | }, 482 | { 483 | "cell_type": "markdown", 484 | "id": "f1aef69c", 485 | "metadata": {}, 486 | "source": [ 487 | "The second is to provide the Python postprocessing function in the new session:" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": 14, 493 | "id": "88d0e7eb", 494 | "metadata": {}, 495 | "outputs": [ 496 | { 497 | "name": "stderr", 498 | "output_type": "stream", 499 | "text": [ 500 | "100%|██████████| 100/100 [00:00<00:00, 2232.89it/s]\n" 501 | ] 502 | }, 503 | { 504 | "data": { 505 | "text/plain": [ 506 | "1.7881393e-07" 507 | ] 508 | }, 509 | "execution_count": 14, 510 | "metadata": {}, 511 | "output_type": "execute_result" 512 | } 513 | ], 514 | "source": [ 515 | "# build the predictor and save the parsed model as ../data/pb_model.onnx\n", 516 | "onnx_predictor = ONNXPredictor(\n", 517 | "    model, '../data/pb_model.onnx', \n", 518 | "    fltr=[0, 2], \n", 519 | ")\n", 520 | "\n", 521 | "# create a new instance from ../data/pb_model.onnx\n", 522 | "onnx_predictor = ONNXPredictor.from_onnx(\n", 523 | "    '../data/pb_model.onnx', \n", 524 | "    postprocess_fn=lambda x: 1 / (1 + np.exp(-x)) # provide the py-boost postprocess_fn manually\n", 525 | ")\n", 526 | "preds = onnx_predictor.predict(X)\n", 527 | "np.abs(preds - pp[:, [0, 2]]).max()" 528 | ] 529 | } 530 | ], 531 | "metadata": { 532 | "kernelspec": { 533 | "display_name": "rapids-env", 534 | "language": "python", 535 | "name": "rapids-env" 536 | }, 537 | "language_info": { 538 | "codemirror_mode": { 539 | "name": "ipython", 540 | "version": 3 541 | }, 542 | "file_extension": ".py", 543 | "mimetype": "text/x-python", 544 | "name": "python", 545 | "nbconvert_exporter": "python", 546 | "pygments_lexer": "ipython3", 547 | "version": "3.10.14" 548 | } 549 | }, 550 | "nbformat": 4, 551 | "nbformat_minor": 5 552 | } 553 | --------------------------------------------------------------------------------