├── .flake8 ├── .github └── workflows │ ├── gpu.yaml │ └── test.yaml ├── .gitignore ├── LICENSE ├── README.md ├── examples ├── format.sh ├── lightgbm_ray ├── __init__.py ├── callback.py ├── examples │ ├── __init__.py │ ├── create_test_data.py │ ├── higgs.py │ ├── higgs_parquet.py │ ├── readme.py │ ├── readme_sklearn_api.py │ ├── simple.py │ ├── simple_dask.py │ ├── simple_modin.py │ ├── simple_predict.py │ ├── simple_ray_dataset.py │ ├── simple_tune.py │ ├── train_on_test_data.py │ └── train_with_ml_dataset.py ├── main.py ├── sklearn.py ├── tests │ ├── __init__.py │ ├── env_info.sh │ ├── release │ │ ├── benchmark_cpu_gpu.py │ │ ├── cluster_cpu.yaml │ │ ├── cluster_gpu.yaml │ │ ├── create_learnable_data.py │ │ ├── create_test_data.py │ │ ├── custom_objective_metric.py │ │ ├── run_e2e_gpu.sh │ │ ├── setup_lightgbm.sh │ │ ├── start_cpu_cluster.sh │ │ ├── start_gpu_cluster.sh │ │ └── submit_cpu_gpu_benchmark.sh │ ├── test_client.py │ ├── test_end_to_end.py │ ├── test_fault_tolerance.py │ ├── test_lightgbm.py │ ├── test_lightgbm_api.py │ └── test_tune.py ├── tune.py └── util.py ├── requirements ├── lint-requirements.txt └── test-requirements.txt ├── run_ci_examples.sh ├── run_ci_tests.sh └── setup.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | inline-quotes = " 4 | ignore = 5 | C408 6 | C417 7 | E121 8 | E123 9 | E126 10 | E203 11 | E226 12 | E24 13 | E704 14 | W503 15 | W504 16 | W605 17 | I 18 | N 19 | B001 20 | B002 21 | B003 22 | B004 23 | B005 24 | B007 25 | B008 26 | B009 27 | B010 28 | B011 29 | B012 30 | B013 31 | B014 32 | B015 33 | B016 34 | B017 35 | avoid-escape = no 36 | # Error E731 is ignored because of the migration from YAPF to Black. 37 | # See https://github.com/ray-project/ray/issues/21315 for more information. 
38 | per-file-ignores = 39 | rllib/evaluation/worker_set.py:E731 40 | rllib/evaluation/sampler.py:E731 41 | -------------------------------------------------------------------------------- /.github/workflows/gpu.yaml: -------------------------------------------------------------------------------- 1 | name: GPU on manual trigger 2 | 3 | on: 4 | workflow_dispatch 5 | 6 | jobs: 7 | test_gpu: 8 | runs-on: ubuntu-latest 9 | timeout-minutes: 20 10 | steps: 11 | - uses: actions/checkout@v3 12 | - name: Set up Python 3.8 13 | uses: actions/setup-python@v3 14 | with: 15 | python-version: 3.8 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | python -m pip install -U anyscale pyyaml 20 | - name: Print environment info 21 | run: | 22 | ./lightgbm_ray/tests/env_info.sh 23 | - name: Set anyscale project 24 | env: 25 | ANYSCALE_PROJECT: ${{ secrets.ANYSCALE_PROJECT }} 26 | run: | 27 | echo "project_id: ${ANYSCALE_PROJECT}" > ./lightgbm_ray/tests/release/.anyscale.yaml 28 | - name: Run end to end GPU test 29 | env: 30 | ANYSCALE_CLI_TOKEN: ${{ secrets.ANYSCALE_CLI_TOKEN }} 31 | run: | 32 | pushd ./lightgbm_ray/tests/release 33 | ./run_e2e_gpu.sh 34 | popd || true 35 | -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: pytest on push 2 | 3 | on: 4 | push: 5 | pull_request: 6 | schedule: 7 | - cron: "0 5 * * *" 8 | 9 | jobs: 10 | test_lint: 11 | runs-on: ubuntu-latest 12 | timeout-minutes: 3 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Set up Python 3.8 16 | uses: actions/setup-python@v3 17 | with: 18 | python-version: 3.8 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | python -m pip install codecov 23 | if [ -f requirements/lint-requirements.txt ]; then python -m pip install -r requirements/lint-requirements.txt; fi 24 | - name: Print environment info 25 | run: | 26 | ./lightgbm_ray/tests/env_info.sh 27 | - name: Run format script 28 | run: | 29 | ls -alp 30 | ./format.sh --all 31 | 32 | test_linux_ray_master: 33 | runs-on: ubuntu-latest 34 | timeout-minutes: 160 35 | strategy: 36 | matrix: 37 | python-version: ["3.8", "3.9", "3.10"] 38 | include: 39 | - python-version: "3.8" 40 | ray-wheel: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl 41 | - python-version: "3.9" 42 | ray-wheel: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp39-cp39-manylinux2014_x86_64.whl 43 | - python-version: "3.10" 44 | ray-wheel: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl 45 | steps: 46 | - uses: actions/checkout@v3 47 | - name: Set up Python ${{ matrix.python-version }} 48 | uses: actions/setup-python@v3 49 | with: 50 | python-version: ${{ matrix.python-version }} 51 | - name: Install dependencies 52 | run: | 53 | python -m pip install --upgrade pip 54 | python -m pip install -U ${{ matrix.ray-wheel }} 55 | if [ -f requirements/test-requirements.txt ]; then python -m pip install -r requirements/test-requirements.txt; fi 56 | - name: Install package 57 | run: | 58 | python -m pip install -e . 
59 | - name: Print environment info 60 | run: | 61 | ./lightgbm_ray/tests/env_info.sh 62 | - name: Run tests 63 | uses: nick-invision/retry@v2 64 | with: 65 | timeout_minutes: 60 66 | max_attempts: 3 67 | command: bash ./run_ci_tests.sh 68 | - name: Run examples 69 | uses: nick-invision/retry@v2 70 | with: 71 | timeout_minutes: 40 72 | max_attempts: 3 73 | command: bash ./run_ci_examples.sh 74 | 75 | test_linux_ray_release: 76 | runs-on: ubuntu-latest 77 | timeout-minutes: 160 78 | strategy: 79 | matrix: 80 | python-version: ["3.8", "3.9", "3.10"] 81 | steps: 82 | - uses: actions/checkout@v3 83 | - name: Set up Python ${{ matrix.python-version }} 84 | uses: actions/setup-python@v3 85 | with: 86 | python-version: ${{ matrix.python-version }} 87 | - name: Install dependencies 88 | run: | 89 | python -m pip install --upgrade pip 90 | python -m pip install -U ray 91 | if [ -f requirements/test-requirements.txt ]; then python -m pip install -r requirements/test-requirements.txt; fi 92 | - name: Install package 93 | run: | 94 | python -m pip install -e . 95 | - name: Print environment info 96 | run: | 97 | ./lightgbm_ray/tests/env_info.sh 98 | - name: Run tests 99 | uses: nick-invision/retry@v2 100 | with: 101 | timeout_minutes: 60 102 | max_attempts: 3 103 | command: bash ./run_ci_tests.sh 104 | - name: Run examples 105 | uses: nick-invision/retry@v2 106 | with: 107 | timeout_minutes: 40 108 | max_attempts: 3 109 | command: bash ./run_ci_examples.sh 110 | 111 | test_linux_compat: 112 | # Test compatibility when some optional libraries are missing 113 | # Test runs on latest ray release 114 | runs-on: ubuntu-latest 115 | timeout-minutes: 160 116 | strategy: 117 | matrix: 118 | python-version: ["3.8", "3.9", "3.10"] 119 | steps: 120 | - uses: actions/checkout@v3 121 | - name: Set up Python ${{ matrix.python-version }} 122 | uses: actions/setup-python@v3 123 | with: 124 | python-version: ${{ matrix.python-version }} 125 | - name: Install dependencies 126 | run: | 127 | python -m pip install --upgrade pip 128 | python -m pip install -U ray 129 | if [ -f requirements/test-requirements.txt ]; then python -m pip install -r requirements/test-requirements.txt; fi 130 | - name: Uninstall unavailable dependencies 131 | # Disables modin and Ray Tune (via tabulate) 132 | run: | 133 | python -m pip uninstall -y modin 134 | python -m pip uninstall -y tabulate 135 | - name: Install package 136 | run: | 137 | python -m pip install -e . 138 | - name: Print environment info 139 | run: | 140 | ./lightgbm_ray/tests/env_info.sh 141 | - name: Run tests 142 | uses: nick-invision/retry@v2 143 | with: 144 | timeout_minutes: 60 145 | max_attempts: 3 146 | command: bash ./run_ci_tests.sh --no-tune 147 | - name: Run examples 148 | uses: nick-invision/retry@v2 149 | with: 150 | timeout_minutes: 40 151 | max_attempts: 3 152 | command: bash ./run_ci_examples.sh --no-tune 153 | 154 | test_linux_cutting_edge: 155 | # Tests on cutting edge, i.e. 
latest Ray master, latest LightGBM master 156 | runs-on: ubuntu-latest 157 | timeout-minutes: 160 158 | strategy: 159 | matrix: 160 | python-version: ["3.8", "3.9", "3.10"] 161 | include: 162 | - python-version: "3.8" 163 | ray-wheel: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl 164 | - python-version: "3.9" 165 | ray-wheel: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp39-cp39-manylinux2014_x86_64.whl 166 | - python-version: "3.10" 167 | ray-wheel: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl 168 | steps: 169 | - uses: actions/checkout@v3 170 | - name: Set up Python ${{ matrix.python-version }} 171 | uses: actions/setup-python@v3 172 | with: 173 | python-version: ${{ matrix.python-version }} 174 | - name: Install dependencies 175 | run: | 176 | python -m pip install --upgrade pip 177 | python -m pip install -U ${{ matrix.ray-wheel }} 178 | if [ -f requirements/test-requirements.txt ]; then python -m pip install -r requirements/test-requirements.txt; fi 179 | - name: Install Ubuntu system dependencies 180 | run: | 181 | sudo apt-get install -y --no-install-recommends ninja-build 182 | - name: Install package 183 | run: | 184 | python -m pip install -e . 185 | - name: Clone LightGBM repo 186 | uses: actions/checkout@v3 187 | with: 188 | repository: microsoft/LightGBM 189 | path: lightgbm 190 | submodules: true 191 | - name: Install LightGBM from source 192 | shell: bash -l {0} 193 | run: | 194 | pushd ${GITHUB_WORKSPACE}/lightgbm/python-package 195 | python --version 196 | python setup.py sdist 197 | pip install -v ./dist/lightgbm-*.tar.gz 198 | popd 199 | - name: Print environment info 200 | run: | 201 | ./lightgbm_ray/tests/env_info.sh 202 | - name: Run tests 203 | uses: nick-invision/retry@v2 204 | with: 205 | timeout_minutes: 60 206 | max_attempts: 3 207 | command: bash ./run_ci_tests.sh 208 | - name: Run examples 209 | uses: nick-invision/retry@v2 210 | with: 211 | timeout_minutes: 40 212 | max_attempts: 3 213 | command: bash ./run_ci_examples.sh 214 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | .vscode 132 | 133 | *.lgbm -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 203 | -------------------------------------------------------------------------------- 204 | 205 | Code in python/ray/rllib/{evolution_strategies, dqn} adapted from 206 | https://github.com/openai (MIT License) 207 | 208 | Copyright (c) 2016 OpenAI (http://openai.com) 209 | 210 | Permission is hereby granted, free of charge, to any person obtaining a copy 211 | of this software and associated documentation files (the "Software"), to deal 212 | in the Software without restriction, including without limitation the rights 213 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 214 | copies of the Software, and to permit persons to whom the Software is 215 | furnished to do so, subject to the following conditions: 216 | 217 | The above copyright notice and this permission notice shall be included in 218 | all copies or substantial portions of the Software. 219 | 220 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 221 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 222 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 223 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 224 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 225 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 226 | THE SOFTWARE. 227 | 228 | -------------------------------------------------------------------------------- 229 | 230 | Code in python/ray/rllib/impala/vtrace.py from 231 | https://github.com/deepmind/scalable_agent 232 | 233 | Copyright 2018 Google LLC 234 | 235 | Licensed under the Apache License, Version 2.0 (the "License"); 236 | you may not use this file except in compliance with the License. 237 | You may obtain a copy of the License at 238 | 239 | https://www.apache.org/licenses/LICENSE-2.0 240 | 241 | Unless required by applicable law or agreed to in writing, software 242 | distributed under the License is distributed on an "AS IS" BASIS, 243 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 244 | See the License for the specific language governing permissions and 245 | limitations under the License. 246 | 247 | -------------------------------------------------------------------------------- 248 | Code in python/ray/rllib/ars is adapted from https://github.com/modestyachts/ARS 249 | 250 | Copyright (c) 2018, ARS contributors (Horia Mania, Aurelia Guy, Benjamin Recht) 251 | All rights reserved. 252 | 253 | Redistribution and use of ARS in source and binary forms, with or without 254 | modification, are permitted provided that the following conditions are met: 255 | 256 | 1. Redistributions of source code must retain the above copyright notice, this 257 | list of conditions and the following disclaimer. 258 | 259 | 2. Redistributions in binary form must reproduce the above copyright notice, 260 | this list of conditions and the following disclaimer in the documentation and/or 261 | other materials provided with the distribution. 262 | 263 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 264 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 265 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 266 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 267 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 268 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 269 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 270 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 271 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 272 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 273 | 274 | ------------------ 275 | Code in python/ray/prometheus_exporter.py is adapted from https://github.com/census-instrumentation/opencensus-python/blob/master/contrib/opencensus-ext-prometheus/opencensus/ext/prometheus/stats_exporter/__init__.py 276 | 277 | # Copyright 2018, OpenCensus Authors 278 | # 279 | # Licensed under the Apache License, Version 2.0 (the "License"); 280 | # you may not use this file except in compliance with the License. 281 | # You may obtain a copy of the License at 282 | # 283 | # http://www.apache.org/licenses/LICENSE-2.0 284 | # 285 | # Unless required by applicable law or agreed to in writing, software 286 | # distributed under the License is distributed on an "AS IS" BASIS, 287 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 288 | # See the License for the specific language governing permissions and 289 | # limitations under the License. 290 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Distributed LightGBM on Ray 4 | 5 | ![Build Status](https://github.com/ray-project/lightgbm_ray/workflows/pytest%20on%20push/badge.svg) 6 | [![docs.ray.io](https://img.shields.io/badge/docs-ray.io-blue)](https://docs.ray.io/en/master/lightgbm-ray.html) 7 | 8 | LightGBM-Ray is a distributed backend for 9 | [LightGBM](https://lightgbm.readthedocs.io/), built 10 | on top of the 11 | distributed computing framework [Ray](https://ray.io). 12 | 13 | LightGBM-Ray 14 | 15 | - enables [multi-node](#usage) and [multi-GPU](#multi-gpu-training) training 16 | - integrates seamlessly with the distributed [hyperparameter optimization](#hyperparameter-tuning) library [Ray Tune](http://tune.io) 17 | - comes with [fault tolerance handling](#fault-tolerance) mechanisms, and 18 | - supports [distributed dataframes and distributed data loading](#distributed-data-loading) 19 | 20 | All releases are tested on large clusters and workloads. 21 | 22 | This package is based on [XGBoost-Ray](https://github.com/ray-project/xgboost_ray). As of now, XGBoost-Ray is a dependency for LightGBM-Ray. 23 | 24 | ## Installation 25 | 26 | You can install the latest LightGBM-Ray release from PyPI: 27 | 28 | ```bash 29 | pip install "lightgbm_ray" 30 | ``` 31 | 32 | If you'd like to install the latest master, use this command instead: 33 | 34 | ```bash 35 | pip install "git+https://github.com/ray-project/lightgbm_ray.git#egg=lightgbm_ray" 36 | ``` 37 | 38 | ## Usage 39 | 40 | LightGBM-Ray provides a drop-in replacement for LightGBM's `train` 41 | function. To pass data, a `RayDMatrix` object is required, the same 42 | data structure used by XGBoost-Ray. You can also use a scikit-learn 43 | interface - see the next section. 44 | 45 | Just as with the original `lgbm.train()` function, the 46 | [training parameters](https://lightgbm.readthedocs.io/en/latest/Parameters.html) 47 | are passed as the `params` dictionary.
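For example (a minimal sketch; the parameter values are illustrative, not tuned recommendations):

```python
# LightGBM parameters are passed through to the distributed workers
# exactly as they would be passed to `lgbm.train()`.
params = {
    "objective": "binary",
    "metric": ["binary_logloss", "binary_error"],
    "num_leaves": 31,
    "learning_rate": 0.05,
}
```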
48 | 49 | Ray-specific distributed training parameters are configured with a 50 | `lightgbm_ray.RayParams` object. For instance, you can set 51 | the `num_actors` property to specify how many distributed actors 52 | you would like to use. 53 | 54 | Here is a simplified example (which requires `sklearn`): 55 | 56 | **Training:** 57 | 58 | ```python 59 | from lightgbm_ray import RayDMatrix, RayParams, train 60 | from sklearn.datasets import load_breast_cancer 61 | 62 | train_x, train_y = load_breast_cancer(return_X_y=True) 63 | train_set = RayDMatrix(train_x, train_y) 64 | 65 | evals_result = {} 66 | bst = train( 67 | { 68 | "objective": "binary", 69 | "metric": ["binary_logloss", "binary_error"], 70 | }, 71 | train_set, 72 | evals_result=evals_result, 73 | valid_sets=[train_set], 74 | valid_names=["train"], 75 | verbose_eval=False, 76 | ray_params=RayParams(num_actors=2, cpus_per_actor=2)) 77 | 78 | bst.booster_.save_model("model.lgbm") 79 | print("Final training error: {:.4f}".format( 80 | evals_result["train"]["binary_error"][-1])) 81 | ``` 82 | 83 | **Prediction:** 84 | 85 | ```python 86 | from lightgbm_ray import RayDMatrix, RayParams, predict 87 | from sklearn.datasets import load_breast_cancer 88 | import lightgbm as lgbm 89 | 90 | data, labels = load_breast_cancer(return_X_y=True) 91 | 92 | dpred = RayDMatrix(data, labels) 93 | 94 | bst = lgbm.Booster(model_file="model.lgbm") 95 | pred_ray = predict(bst, dpred, ray_params=RayParams(num_actors=2)) 96 | 97 | print(pred_ray) 98 | ``` 99 | 100 | ### scikit-learn API 101 | 102 | LightGBM-Ray also features a scikit-learn API fully mirroring the pure 103 | LightGBM scikit-learn API, providing a complete drop-in 104 | replacement. The following estimators are available: 105 | 106 | - `RayLGBMClassifier` 107 | - `RayLGBMRegressor` 108 | 109 | Example usage of `RayLGBMClassifier`: 110 | 111 | ```python 112 | from lightgbm_ray import RayLGBMClassifier, RayParams 113 | from sklearn.datasets import load_breast_cancer 114 | from sklearn.model_selection import train_test_split 115 | 116 | seed = 42 117 | 118 | X, y = load_breast_cancer(return_X_y=True) 119 | X_train, X_test, y_train, y_test = train_test_split( 120 | X, y, train_size=0.25, random_state=42) 121 | 122 | clf = RayLGBMClassifier( 123 | n_jobs=2, # In LightGBM-Ray, n_jobs sets the number of actors 124 | random_state=seed) 125 | 126 | # scikit-learn API will automatically convert the data 127 | # to RayDMatrix format as needed. 128 | # You can also pass X as a RayDMatrix, in which case 129 | # y will be ignored. 130 | 131 | clf.fit(X_train, y_train) 132 | 133 | pred_ray = clf.predict(X_test) 134 | print(pred_ray) 135 | 136 | pred_proba_ray = clf.predict_proba(X_test) 137 | print(pred_proba_ray) 138 | 139 | # It is also possible to pass a RayParams object 140 | # to fit/predict/predict_proba methods - will override 141 | # n_jobs set during initialization 142 | 143 | clf.fit(X_train, y_train, ray_params=RayParams(num_actors=2)) 144 | 145 | pred_ray = clf.predict(X_test, ray_params=RayParams(num_actors=2)) 146 | print(pred_ray) 147 | ``` 148 |
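`RayLGBMRegressor` works the same way. A minimal sketch (the synthetic dataset and hyperparameter values here are illustrative only):

```python
from lightgbm_ray import RayLGBMRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=1_000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# As with the classifier, n_jobs sets the number of distributed actors
reg = RayLGBMRegressor(n_jobs=2, random_state=42)
reg.fit(X_train, y_train)

pred_ray = reg.predict(X_test)
print(pred_ray)
```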
149 | Things to keep in mind: 150 | 151 | - The `n_jobs` parameter controls the number of actors spawned. 152 | You can pass a `RayParams` object to the 153 | `fit`/`predict`/`predict_proba` methods as the `ray_params` argument 154 | for greater control over resource allocation. Doing 155 | so will override the value of `n_jobs` with the value of the 156 | `ray_params.num_actors` attribute. For more information, refer 157 | to the [Resources](#resources) section below. 158 | - By default `n_jobs` is set to `1`, which means the training 159 | will **not** be distributed. Make sure to either set `n_jobs` 160 | to a higher value or pass a `RayParams` object as outlined above 161 | in order to take advantage of LightGBM-Ray's functionality. 162 | - After calling `fit`, additional evaluation results (e.g. training time, 163 | number of rows, callback results) will be available under 164 | the `additional_results_` attribute. 165 | - `eval_` arguments are supported, but early stopping is not. 166 | - LightGBM-Ray's scikit-learn API is based on LightGBM 3.2.1. 167 | While we try to support older LightGBM versions, please note that 168 | this library is only fully tested and supported for LightGBM >= 3.2.1. 169 | 170 | For more information on the scikit-learn API, refer to the [LightGBM documentation](https://lightgbm.readthedocs.io/en/latest/Python-API.html#scikit-learn-api). 171 | 172 | ## Data loading 173 | 174 | Data is passed to LightGBM-Ray via a `RayDMatrix` object. 175 | 176 | The `RayDMatrix` lazily loads data and stores it sharded in the 177 | Ray object store. The Ray LightGBM actors then access these 178 | shards to run their training. 179 | 180 | A `RayDMatrix` supports various data and file types, like 181 | Pandas DataFrames, Numpy arrays, CSV files and Parquet files. 182 | 183 | Example loading multiple Parquet files: 184 | 185 | ```python 186 | import glob 187 | from lightgbm_ray import RayDMatrix, RayFileType 188 | 189 | # We can also pass a list of files 190 | path = list(sorted(glob.glob("/data/nyc-taxi/*/*/*.parquet"))) 191 | 192 | # This argument will be passed to `pd.read_parquet()` 193 | columns = [ 194 | "passenger_count", 195 | "trip_distance", "pickup_longitude", "pickup_latitude", 196 | "dropoff_longitude", "dropoff_latitude", 197 | "fare_amount", "extra", "mta_tax", "tip_amount", 198 | "tolls_amount", "total_amount" 199 | ] 200 | 201 | dtrain = RayDMatrix( 202 | path, 203 | label="passenger_count", # Will select this column as the label 204 | columns=columns, 205 | # ignore=["total_amount"], # Optional list of columns to ignore 206 | filetype=RayFileType.PARQUET) 207 | ``` 208 | 209 | 210 | 211 | ## Hyperparameter Tuning 212 | 213 | LightGBM-Ray integrates with [Ray Tune](https://tune.io) to provide distributed hyperparameter tuning for your 214 | distributed LightGBM models. You can run multiple LightGBM-Ray training runs in parallel, each with a different 215 | hyperparameter configuration, and each training run parallelized by itself. All you have to do is move your training 216 | code to a function, and pass the function to `tune.run`. Internally, `train` will detect if `tune` is being used and will 217 | automatically report results to Tune.
218 | 219 | Example using LightGBM-Ray with Ray Tune: 220 | 221 | ```python 222 | from lightgbm_ray import RayDMatrix, RayParams, train 223 | from sklearn.datasets import load_breast_cancer 224 | 225 | num_actors = 2 226 | num_cpus_per_actor = 2 227 | 228 | ray_params = RayParams( 229 | num_actors=num_actors, cpus_per_actor=num_cpus_per_actor) 230 | 231 | def train_model(config): 232 | train_x, train_y = load_breast_cancer(return_X_y=True) 233 | train_set = RayDMatrix(train_x, train_y) 234 | 235 | evals_result = {} 236 | bst = train( 237 | params=config, 238 | dtrain=train_set, 239 | evals_result=evals_result, 240 | valid_sets=[train_set], 241 | valid_names=["train"], 242 | verbose_eval=False, 243 | ray_params=ray_params) 244 | bst.booster_.save_model("model.lgbm") 245 | 246 | from ray import tune 247 | 248 | # Specify the hyperparameter search space. 249 | config = { 250 | "objective": "binary", 251 | "metric": ["binary_logloss", "binary_error"], 252 | "eta": tune.loguniform(1e-4, 1e-1), 253 | "subsample": tune.uniform(0.5, 1.0), 254 | "max_depth": tune.randint(1, 9) 255 | } 256 | 257 | # Make sure to use the `get_tune_resources` method to set the `resources_per_trial` 258 | analysis = tune.run( 259 | train_model, 260 | config=config, 261 | metric="train-binary_error", 262 | mode="min", 263 | num_samples=4, 264 | resources_per_trial=ray_params.get_tune_resources()) 265 | print("Best hyperparameters", analysis.best_config) 266 | ``` 267 | 268 | Also see `examples/simple_tune.py` for another example. 269 | 270 | ## Fault tolerance 271 | 272 | LightGBM-Ray leverages the stateful Ray actor model to 273 | enable fault-tolerant training. Currently, only non-elastic 274 | training is supported. 275 | 276 | ### Non-elastic training (warm restart) 277 | 278 | When an actor or node dies, LightGBM-Ray will retain the 279 | state of the remaining actors. In non-elastic training, 280 | the failed actors will be replaced as soon as resources 281 | are available again. Only these actors will reload their 282 | parts of the data. Training will resume once all actors 283 | are ready for training again. 284 | 285 | You can configure this mode in the `RayParams`: 286 | 287 | ```python 288 | from lightgbm_ray import RayParams 289 | 290 | ray_params = RayParams( 291 | max_actor_restarts=2, # How often actors are allowed to fail. Default = 0 292 | ) 293 | ``` 294 | 295 | ## Resources 296 | 297 | By default, LightGBM-Ray tries to determine the number of CPUs 298 | available and distributes them evenly across actors. 299 | 312 | 313 | It is important to note that distributed LightGBM needs at least 314 | two CPUs per actor to function efficiently (without blocking). 315 | Therefore, by default, at least two CPUs will be assigned to each actor, 316 | and an exception will be raised if an actor has less than two CPUs.
317 | It is possible to override this check by setting the 318 | `allow_less_than_two_cpus` argument to `True`, though it is not 319 | recommended, as it will negatively impact training performance. 320 | 321 | In the case of very large clusters or clusters with many different 322 | machine sizes, it makes sense to limit the number of CPUs per actor 323 | by setting the `cpus_per_actor` argument. Consider always 324 | setting this explicitly. 325 | 326 | The number of LightGBM actors always has to be set manually with 327 | the `num_actors` argument. 328 | 329 | ### Multi GPU training 330 | LightGBM-Ray enables multi-GPU training. The LightGBM core backend 331 | will automatically handle communication. 332 | All you have to do is start one actor per GPU and set LightGBM's 333 | `device_type` to a GPU-compatible option, e.g. `gpu` (see the LightGBM 334 | documentation for more details). 335 | 336 | For instance, if you have 2 machines with 4 GPUs each, you will want 337 | to start 8 remote actors, and set `gpus_per_actor=1`. There is usually 338 | no benefit in allocating less (e.g. 0.5) or more than one GPU per actor. 339 | 340 | You should divide the CPUs evenly across actors per machine, so if your 341 | machines have 16 CPUs in addition to the 4 GPUs, each actor should have 342 | 4 CPUs to use. 343 | 344 | ```python 345 | from lightgbm_ray import RayParams 346 | 347 | ray_params = RayParams( 348 | num_actors=8, 349 | gpus_per_actor=1, 350 | cpus_per_actor=4, # Divide evenly across actors per machine 351 | ) 352 | ``` 353 | 354 | ### How many remote actors should I use? 355 | 356 | This depends on your workload and your cluster setup. 357 | Generally there is no inherent benefit in running more than 358 | one remote actor per node for CPU-only training. This is because 359 | LightGBM core can already leverage multiple CPUs via threading. 360 | 361 | However, there are some cases when you should consider starting 362 | more than one actor per node: 363 | 364 | - For [**multi GPU training**](#multi-gpu-training), each GPU should have a separate 365 | remote actor. Thus, if your machine has 24 CPUs and 4 GPUs, 366 | you will want to start 4 remote actors with 6 CPUs and 1 GPU 367 | each. 368 | - In a **heterogeneous cluster**, you might want to find the 369 | [greatest common divisor](https://en.wikipedia.org/wiki/Greatest_common_divisor) 370 | for the number of CPUs. 371 | E.g. for a cluster with three nodes of 4, 8, and 12 CPUs, respectively, 372 | you should set the number of actors to 6 and the CPUs per 373 | actor to 4. 374 | 375 | ## Distributed data loading 376 | 377 | LightGBM-Ray can leverage both centralized and distributed data loading. 378 | 379 | In **centralized data loading**, the data is partitioned by the head node 380 | and stored in the object store. Each remote actor then retrieves its 381 | partitions by querying the Ray object store. Centralized loading is used 382 | when you pass centralized in-memory dataframes, such as Pandas dataframes 383 | or Numpy arrays, or when you pass a single source file, such as a single CSV 384 | or Parquet file. 385 | 386 | 387 | ```python 388 | from lightgbm_ray import RayDMatrix 389 | 390 | # This will use centralized data loading, as only one source file is specified 391 | # `label_col` is a column in the CSV, used as the target label 392 | dtrain = RayDMatrix("./source_file.csv", label="label_col") 393 | ``` 394 | 395 | In **distributed data loading**, each remote actor loads its data directly from 396 | the source (e.g.
local hard disk, NFS, HDFS, S3), 397 | without a central bottleneck. The data is still stored in the 398 | object store, but locally to each actor. This mode is used automatically 399 | when loading data from multiple CSV or Parquet files. Please note that 400 | we do not check or enforce partition sizes in this case - it is your job 401 | to make sure the data is evenly distributed across the source files. 402 | 403 | ```python 404 | from lightgbm_ray import RayDMatrix 405 | 406 | # This will use distributed data loading, as four source files are specified 407 | # Please note that you cannot schedule more than four actors in this case. 408 | # `label_col` is a column in the Parquet files, used as the target label 409 | dtrain = RayDMatrix([ 410 | "hdfs:///tmp/part1.parquet", 411 | "hdfs:///tmp/part2.parquet", 412 | "hdfs:///tmp/part3.parquet", 413 | "hdfs:///tmp/part4.parquet", 414 | ], label="label_col") 415 | ``` 416 | 417 | Lastly, LightGBM-Ray supports **distributed dataframe** representations, such 418 | as [Ray Datasets](https://docs.ray.io/en/latest/data/dataset.html), 419 | [Modin](https://modin.readthedocs.io/en/latest/) and 420 | [Dask dataframes](https://docs.dask.org/en/latest/dataframe.html) 421 | (used with [Dask on Ray](https://docs.ray.io/en/master/dask-on-ray.html)). 422 | Here, LightGBM-Ray will check on which nodes the distributed partitions 423 | are currently located, and will assign partitions to actors in order to 424 | minimize cross-node data transfer. Please note that we also assume here 425 | that partition sizes are uniform. 426 | 427 | ```python 428 | from lightgbm_ray import RayDMatrix 429 | 430 | # This will try to allocate the existing Modin partitions 431 | # to co-located Ray actors. If this is not possible, data will 432 | # be transferred across nodes 433 | dtrain = RayDMatrix(existing_modin_df) 434 | ``` 435 | 436 | ### Data sources 437 | 438 | The following data sources can be used with a `RayDMatrix` object. 439 | 440 | | Type | Centralized loading | Distributed loading | 441 | |------|---------------------|---------------------| 442 | | Numpy array | Yes | No | 443 | | Pandas dataframe | Yes | No | 444 | | Single CSV | Yes | No | 445 | | Multi CSV | Yes | Yes | 446 | | Single Parquet | Yes | No | 447 | | Multi Parquet | Yes | Yes | 448 | | [Ray Dataset](https://docs.ray.io/en/latest/data/dataset.html) | Yes | Yes | 449 | | [Petastorm](https://github.com/uber/petastorm) | Yes | Yes | 450 | | [Dask dataframe](https://docs.dask.org/en/latest/dataframe.html) | Yes | Yes | 451 | | [Modin dataframe](https://modin.readthedocs.io/en/latest/) | Yes | Yes | 452 | 453 | ## Memory usage 454 | 455 | Details coming soon. 456 | 457 | 494 | 495 | **Best practices** 496 | 497 | In order to reduce peak memory usage, consider the following 498 | suggestions: 499 | 500 | - Store data as `float32` or less. More precision is often 501 | not needed, and keeping data in a smaller format will 502 | help reduce peak memory usage for initial data loading. 503 | - Pass the `dtype` when loading data from CSV. Otherwise, 504 | floating point values will be loaded as `np.float64` 505 | by default, increasing peak memory usage by 33% (see the sketch below). 506 |
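A minimal sketch of both suggestions (the file and column names here are hypothetical, and we assume all columns in the CSV are numeric):

```python
import numpy as np
import pandas as pd
from lightgbm_ray import RayDMatrix

# Read the CSV with 32-bit floats instead of the default float64.
df = pd.read_csv("./train.csv", dtype=np.float32)

# The smaller dtype carries over to the distributed shards.
dtrain = RayDMatrix(df, label="label_col")
```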
507 | ## Placement Strategies 508 | 509 | LightGBM-Ray leverages Ray's Placement Group API (https://docs.ray.io/en/master/placement-group.html) 510 | to implement placement strategies for better fault tolerance. 511 | 512 | By default, a SPREAD strategy is used for training, which attempts to spread all of the training workers 513 | across the nodes in a cluster on a best-effort basis. This improves fault tolerance since it minimizes the 514 | number of worker failures when a node goes down, but comes at the cost of increased inter-node communication. 515 | To disable this strategy, set the `RXGB_USE_SPREAD_STRATEGY` environment variable to 0. If disabled, no 516 | particular placement strategy will be used. 517 | 518 | 520 | 521 | When LightGBM-Ray is used with Ray Tune for hyperparameter tuning, a PACK strategy is used. This strategy 522 | attempts to place all workers for each trial on the same node on a best-effort basis. This means that if a node 523 | goes down, it will be less likely to impact multiple trials. 524 | 525 | When placement strategies are used, LightGBM-Ray will wait for 100 seconds for the required resources 526 | to become available, and will fail if the required resources cannot be reserved and the cluster cannot autoscale 527 | to increase the number of resources. You can change the `RXGB_PLACEMENT_GROUP_TIMEOUT_S` environment variable to modify 528 | how long this timeout should be. 529 | 530 | ## More examples 531 | 532 | For complete end-to-end examples, please have a look at 533 | the [examples folder](https://github.com/ray-project/lightgbm_ray/tree/main/lightgbm_ray/examples/): 534 | 535 | * [Simple sklearn breast cancer dataset example](https://github.com/ray-project/lightgbm_ray/tree/main/lightgbm_ray/examples/simple.py) (requires `sklearn`) 536 | * [HIGGS classification example](https://github.com/ray-project/lightgbm_ray/tree/main/lightgbm_ray/examples/higgs.py) 537 | ([download dataset (2.6 GB)](https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz)) 538 | * [HIGGS classification example with Parquet](https://github.com/ray-project/lightgbm_ray/tree/main/lightgbm_ray/examples/higgs_parquet.py) (uses the same dataset) 539 | * [Test data classification](https://github.com/ray-project/lightgbm_ray/tree/main/lightgbm_ray/examples/train_on_test_data.py) (uses a self-generated dataset) 540 | 541 | ## Resources 542 | 543 | * [LightGBM-Ray documentation](https://docs.ray.io/en/master/lightgbm-ray.html) 544 | * [Ray community Slack](https://forms.gle/9TSdDYUgxYs8SA9e8) 545 | 546 | 581 | -------------------------------------------------------------------------------- /examples: -------------------------------------------------------------------------------- 1 | lightgbm_ray/examples/ -------------------------------------------------------------------------------- /format.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Black + Clang formatter (if installed). This script formats all changed files from the last mergebase. 3 | # You are encouraged to run this locally before pushing changes for review. 4 | 5 | # Cause the script to exit if a single command fails 6 | set -euo pipefail 7 | 8 | FLAKE8_VERSION_REQUIRED="3.9.1" 9 | BLACK_VERSION_REQUIRED="22.10.0" 10 | SHELLCHECK_VERSION_REQUIRED="0.7.1" 11 | ISORT_VERSION_REQUIRED="5.10.1" 12 | 13 | check_python_command_exist() { 14 | VERSION="" 15 | case "$1" in 16 | black) 17 | VERSION=$BLACK_VERSION_REQUIRED 18 | ;; 19 | flake8) 20 | VERSION=$FLAKE8_VERSION_REQUIRED 21 | ;; 22 | isort) 23 | VERSION=$ISORT_VERSION_REQUIRED 24 | ;; 25 | *) 26 | echo "$1 is not a required dependency" 27 | exit 1 28 | esac 29 | if !
[ -x "$(command -v "$1")" ]; then 30 | echo "$1 not installed. Install the python package with: pip install $1==$VERSION" 31 | exit 1 32 | fi 33 | } 34 | 35 | check_docstyle() { 36 | echo "Checking docstyle..." 37 | violations=$(git ls-files | grep '.py$' | xargs grep -E '^[ ]+[a-z_]+ ?\([a-zA-Z]+\): ' | grep -v 'str(' | grep -v noqa || true) 38 | if [[ -n "$violations" ]]; then 39 | echo 40 | echo "=== Found Ray docstyle violations ===" 41 | echo "$violations" 42 | echo 43 | echo "Per the Google pydoc style, omit types from pydoc args as they are redundant: https://docs.ray.io/en/latest/ray-contribute/getting-involved.html#code-style " 44 | echo "If this is a false positive, you can add a '# noqa' comment to the line to ignore." 45 | exit 1 46 | fi 47 | return 0 48 | } 49 | 50 | check_python_command_exist black 51 | check_python_command_exist flake8 52 | check_python_command_exist isort 53 | 54 | # this stops git rev-parse from failing if we run this from the .git directory 55 | builtin cd "$(dirname "${BASH_SOURCE:-$0}")" 56 | 57 | ROOT="$(git rev-parse --show-toplevel)" 58 | builtin cd "$ROOT" || exit 1 59 | 60 | # NOTE(edoakes): black version differs based on installation method: 61 | # Option 1) 'black, 21.12b0 (compiled: no)' 62 | # Option 2) 'black, version 21.12b0' 63 | # For newer versions (at least 22.10.0), a second line is printed which must be dropped: 64 | # 65 | # black, 22.10.0 (compiled: yes) 66 | # Python (CPython) 3.9.13 67 | BLACK_VERSION_STR=$(black --version) 68 | if [[ "$BLACK_VERSION_STR" == *"compiled"* ]] 69 | then 70 | BLACK_VERSION=$(echo "$BLACK_VERSION_STR" | head -n 1 | awk '{print $2}') 71 | else 72 | BLACK_VERSION=$(echo "$BLACK_VERSION_STR" | head -n 1 | awk '{print $3}') 73 | fi 74 | FLAKE8_VERSION=$(flake8 --version | head -n 1 | awk '{print $1}') 75 | ISORT_VERSION=$(isort --version | grep VERSION | awk '{print $2}') 76 | 77 | # params: tool name, tool version, required version 78 | tool_version_check() { 79 | if [ "$2" != "$3" ]; then 80 | echo "WARNING: Ray uses $1 $3, You currently are using $2. This might generate different results." 81 | fi 82 | } 83 | 84 | tool_version_check "flake8" "$FLAKE8_VERSION" "$FLAKE8_VERSION_REQUIRED" 85 | tool_version_check "black" "$BLACK_VERSION" "$BLACK_VERSION_REQUIRED" 86 | tool_version_check "isort" "$ISORT_VERSION" "$ISORT_VERSION_REQUIRED" 87 | 88 | if command -v shellcheck >/dev/null; then 89 | SHELLCHECK_VERSION=$(shellcheck --version | awk '/^version:/ {print $2}') 90 | tool_version_check "shellcheck" "$SHELLCHECK_VERSION" "$SHELLCHECK_VERSION_REQUIRED" 91 | else 92 | echo "INFO: Ray uses shellcheck for shell scripts, which is not installed. You may install shellcheck=$SHELLCHECK_VERSION_REQUIRED with your system package manager." 93 | fi 94 | 95 | if command -v clang-format >/dev/null; then 96 | CLANG_FORMAT_VERSION=$(clang-format --version | awk '{print $3}') 97 | tool_version_check "clang-format" "$CLANG_FORMAT_VERSION" "12.0.0" 98 | else 99 | echo "WARNING: clang-format is not installed!" 100 | fi 101 | 102 | if [[ $(flake8 --version) != *"flake8_quotes"* ]]; then 103 | echo "WARNING: Ray uses flake8 with flake8_quotes. Might error without it. Install with: pip install flake8-quotes" 104 | fi 105 | 106 | if [[ $(flake8 --version) != *"flake8-bugbear"* ]]; then 107 | echo "WARNING: Ray uses flake8 with flake8-bugbear. Might error without it. Install with: pip install flake8-bugbear" 108 | fi 109 | 110 | SHELLCHECK_FLAGS=( 111 | --exclude=1090 # "Can't follow non-constant source. 
Use a directive to specify location." 112 | --exclude=1091 # "Not following {file} due to some error" 113 | --exclude=2207 # "Prefer mapfile or read -a to split command output (or quote to avoid splitting)." -- these aren't compatible with macOS's old Bash 114 | ) 115 | 116 | 117 | BLACK_EXCLUDES=( 118 | '--force-exclude' 119 | 'python/ray/cloudpickle/*|'` 120 | `'python/build/*|'` 121 | `'python/ray/core/src/ray/gcs/*|'` 122 | `'python/ray/thirdparty_files/*|'` 123 | `'python/ray/_private/thirdparty/*|'` 124 | `'python/ray/serve/tests/test_config_files/syntax_error\.py' 125 | ) 126 | 127 | GIT_LS_EXCLUDES=( 128 | ':(exclude)python/ray/cloudpickle/' 129 | ':(exclude)python/ray/_private/runtime_env/_clonevirtualenv.py' 130 | ) 131 | 132 | # TODO(barakmich): This should be cleaned up. I've at least excised the copies 133 | # of these arguments to this location, but the long-term answer is to actually 134 | # make a flake8 config file 135 | FLAKE8_PYX_IGNORES="--ignore=C408,E121,E123,E126,E211,E225,E226,E227,E24,E704,E999,W503,W504,W605" 136 | 137 | shellcheck_scripts() { 138 | shellcheck "${SHELLCHECK_FLAGS[@]}" "$@" 139 | } 140 | 141 | # Format specified files 142 | format_files() { 143 | local shell_files=() python_files=() bazel_files=() 144 | 145 | local name 146 | for name in "$@"; do 147 | local base="${name%.*}" 148 | local suffix="${name#"${base}"}" 149 | 150 | local shebang="" 151 | read -r shebang < "${name}" || true 152 | case "${shebang}" in 153 | '#!'*) 154 | shebang="${shebang#/usr/bin/env }" 155 | shebang="${shebang%% *}" 156 | shebang="${shebang##*/}" 157 | ;; 158 | esac 159 | 160 | if [ "${base}" = "WORKSPACE" ] || [ "${base}" = "BUILD" ] || [ "${suffix}" = ".BUILD" ] || [ "${suffix}" = ".bazel" ] || [ "${suffix}" = ".bzl" ]; then 161 | bazel_files+=("${name}") 162 | elif [ -z "${suffix}" ] && [ "${shebang}" != "${shebang#python}" ] || [ "${suffix}" != "${suffix#.py}" ]; then 163 | python_files+=("${name}") 164 | elif [ -z "${suffix}" ] && [ "${shebang}" != "${shebang%sh}" ] || [ "${suffix}" != "${suffix#.sh}" ]; then 165 | shell_files+=("${name}") 166 | else 167 | echo "error: failed to determine file type: ${name}" 1>&2 168 | return 1 169 | fi 170 | done 171 | 172 | if [ 0 -lt "${#python_files[@]}" ]; then 173 | isort "${python_files[@]}" 174 | black "${python_files[@]}" 175 | fi 176 | 177 | if command -v shellcheck >/dev/null; then 178 | if shellcheck --shell=sh --format=diff - < /dev/null; then 179 | if [ 0 -lt "${#shell_files[@]}" ]; then 180 | local difference 181 | difference="$(shellcheck_scripts --format=diff "${shell_files[@]}" || true && printf "-")" 182 | difference="${difference%-}" 183 | printf "%s" "${difference}" | patch -p1 184 | fi 185 | else 186 | echo "error: this version of shellcheck does not support diffs" 187 | fi 188 | fi 189 | } 190 | 191 | format_all_scripts() { 192 | command -v flake8 &> /dev/null; 193 | HAS_FLAKE8=$? 194 | 195 | # Run isort before black to fix imports and let black deal with file format. 196 | echo "$(date)" "isort...." 197 | git ls-files -- '*.py' "${GIT_LS_EXCLUDES[@]}" | xargs -P 10 \ 198 | isort 199 | echo "$(date)" "Black...." 200 | git ls-files -- '*.py' "${GIT_LS_EXCLUDES[@]}" | xargs -P 10 \ 201 | black "${BLACK_EXCLUDES[@]}" 202 | if [ "$HAS_FLAKE8" -eq 0 ]; then 203 | echo "$(date)" "Flake8...."
204 | git ls-files -- '*.py' "${GIT_LS_EXCLUDES[@]}" | xargs -P 5 \ 205 | flake8 --config=.flake8 206 | fi 207 | 208 | if command -v shellcheck >/dev/null; then 209 | local shell_files non_shell_files 210 | non_shell_files=($(git ls-files -- ':(exclude)*.sh')) 211 | shell_files=($(git ls-files -- '*.sh')) 212 | if [ 0 -lt "${#non_shell_files[@]}" ]; then 213 | shell_files+=($(git --no-pager grep -l -- '^#!\(/usr\)\?/bin/\(env \+\)\?\(ba\)\?sh' "${non_shell_files[@]}" || true)) 214 | fi 215 | if [ 0 -lt "${#shell_files[@]}" ]; then 216 | echo "$(date)" "shellcheck scripts...." 217 | shellcheck_scripts "${shell_files[@]}" 218 | fi 219 | fi 220 | } 221 | 222 | # Format files that differ from main branch. Ignores dirs that are not slated 223 | # for autoformat yet. 224 | format_changed() { 225 | # The `if` guard ensures that the list of filenames is not empty; an empty 226 | # list would cause the formatter to receive 0 positional arguments and 227 | # make Black error. 228 | # 229 | # `diff-filter=ACRM` and $MERGEBASE ensure we only format files that 230 | # exist on both branches. 231 | MERGEBASE="$(git merge-base upstream/main HEAD)" 232 | 233 | if ! git diff --diff-filter=ACRM --quiet --exit-code "$MERGEBASE" -- '*.py' &>/dev/null; then 234 | git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.py' | xargs -P 5 \ 235 | isort 236 | fi 237 | 238 | if ! git diff --diff-filter=ACRM --quiet --exit-code "$MERGEBASE" -- '*.py' &>/dev/null; then 239 | git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.py' | xargs -P 5 \ 240 | black "${BLACK_EXCLUDES[@]}" 241 | if which flake8 >/dev/null; then 242 | git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.py' | xargs -P 5 \ 243 | flake8 --config=.flake8 244 | fi 245 | fi 246 | 247 | if ! git diff --diff-filter=ACRM --quiet --exit-code "$MERGEBASE" -- '*.pyx' '*.pxd' '*.pxi' &>/dev/null; then 248 | if which flake8 >/dev/null; then 249 | git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.pyx' '*.pxd' '*.pxi' | xargs -P 5 \ 250 | flake8 --config=.flake8 "$FLAKE8_PYX_IGNORES" 251 | fi 252 | fi 253 | 254 | if which clang-format >/dev/null; then 255 | if ! git diff --diff-filter=ACRM --quiet --exit-code "$MERGEBASE" -- '*.cc' '*.h' &>/dev/null; then 256 | git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.cc' '*.h' | xargs -P 5 \ 257 | clang-format -i 258 | fi 259 | fi 260 | 261 | if command -v shellcheck >/dev/null; then 262 | local shell_files non_shell_files 263 | non_shell_files=($(git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- ':(exclude)*.sh')) 264 | shell_files=($(git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.sh')) 265 | if [ 0 -lt "${#non_shell_files[@]}" ]; then 266 | shell_files+=($(git --no-pager grep -l -- '^#!\(/usr\)\?/bin/\(env \+\)\?\(ba\)\?sh' "${non_shell_files[@]}" || true)) 267 | fi 268 | if [ 0 -lt "${#shell_files[@]}" ]; then 269 | shellcheck_scripts "${shell_files[@]}" 270 | fi 271 | fi 272 | } 273 | 274 | # This flag formats individual files. --files *must* be the first command line 275 | # arg to use this option. 276 | if [ "${1-}" == '--files' ]; then 277 | format_files "${@:2}" 278 | # If `--all` or `--all-scripts` is passed, any further arguments are ignored. 279 | # Format the entire python directory and other scripts. 280 | elif [ "${1-}" == '--all-scripts' ]; then 281 | format_all_scripts "${@}" 282 | if [ -n "${FORMAT_SH_PRINT_DIFF-}" ]; then git --no-pager diff; fi 283 | # Format all Python, C++, Java and other script files.
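# `--all` is currently an alias for `--all-scripts`: both format every
# tracked file. With no flag at all, execution falls through to the
# default branch below, which only touches files that differ from
# upstream/main.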
284 | elif [ "${1-}" == '--all' ]; then 285 | format_all_scripts "${@}" 286 | if [ -n "${FORMAT_SH_PRINT_DIFF-}" ]; then git --no-pager diff; fi 287 | else 288 | # Add the upstream remote if it doesn't exist 289 | if ! git remote -v | grep -q upstream; then 290 | git remote add 'upstream' 'https://github.com/ray-project/lightgbm_ray.git' 291 | fi 292 | 293 | # Only fetch main since that's the branch we're diffing against. 294 | git fetch upstream main || true 295 | 296 | # Format only the files that changed in last commit. 297 | format_changed 298 | fi 299 | 300 | check_docstyle 301 | 302 | if ! git diff --quiet &>/dev/null; then 303 | echo 'Reformatted changed files. Please review and stage the changes.' 304 | echo 'Files updated:' 305 | echo 306 | 307 | git --no-pager diff --name-only 308 | 309 | exit 1 310 | fi 311 | -------------------------------------------------------------------------------- /lightgbm_ray/__init__.py: -------------------------------------------------------------------------------- 1 | from xgboost_ray.matrix import ( 2 | Data, 3 | RayDeviceQuantileDMatrix, 4 | RayDMatrix, 5 | RayFileType, 6 | RayShardingMode, 7 | combine_data, 8 | ) 9 | 10 | from lightgbm_ray.main import RayParams, predict, train 11 | from lightgbm_ray.sklearn import RayLGBMClassifier, RayLGBMRegressor 12 | 13 | __version__ = "0.1.10" 14 | 15 | __all__ = [ 16 | "__version__", 17 | "RayParams", 18 | "RayDMatrix", 19 | "RayDeviceQuantileDMatrix", 20 | "RayFileType", 21 | "RayShardingMode", 22 | "Data", 23 | "combine_data", 24 | "train", 25 | "predict", 26 | "RayLGBMClassifier", 27 | "RayLGBMRegressor", 28 | ] 29 | -------------------------------------------------------------------------------- /lightgbm_ray/callback.py: -------------------------------------------------------------------------------- 1 | from xgboost_ray.callback import ( 2 | DistributedCallback, 3 | DistributedCallbackContainer, 4 | EnvironmentCallback, 5 | ) 6 | 7 | __all__ = ["DistributedCallback", "DistributedCallbackContainer", "EnvironmentCallback"] 8 | -------------------------------------------------------------------------------- /lightgbm_ray/examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ray-project/lightgbm_ray/4c4d3413f86db769bddb6d08e2480a04bc75d712/lightgbm_ray/examples/__init__.py -------------------------------------------------------------------------------- /lightgbm_ray/examples/create_test_data.py: -------------------------------------------------------------------------------- 1 | from xgboost_ray.tests.utils import create_parquet 2 | 3 | 4 | def main(): 5 | create_parquet( 6 | "example.parquet", 7 | num_rows=1_000_000, 8 | num_partitions=100, 9 | num_features=8, 10 | num_classes=2, 11 | ) 12 | 13 | 14 | if __name__ == "__main__": 15 | main() 16 | -------------------------------------------------------------------------------- /lightgbm_ray/examples/higgs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | from lightgbm_ray import RayDMatrix, RayParams, train 5 | 6 | FILENAME_CSV = "HIGGS.csv.gz" 7 | 8 | 9 | def download_higgs(target_file): 10 | url = ( 11 | "https://archive.ics.uci.edu/ml/machine-learning-databases/" 12 | "00280/HIGGS.csv.gz" 13 | ) 14 | 15 | try: 16 | import urllib.request 17 | except ImportError as e: 18 | raise ValueError( 19 | f"Automatic downloading of the HIGGS dataset requires `urllib`." 
20 | f"\nFIX THIS by running `pip install urllib` or manually " 21 | f"downloading the dataset from {url}." 22 | ) from e 23 | 24 | print(f"Downloading HIGGS dataset to {target_file}") 25 | urllib.request.urlretrieve(url, target_file) 26 | return os.path.exists(target_file) 27 | 28 | 29 | def main(): 30 | # Example adapted from this blog post: 31 | # https://medium.com/rapids-ai/a-new-official-dask-api-for-xgboost-e8b10f3d1eb7 32 | # This uses the HIGGS dataset. Download here: 33 | # https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz 34 | 35 | if not os.path.exists(FILENAME_CSV): 36 | assert download_higgs(FILENAME_CSV), "Downloading of HIGGS dataset failed." 37 | print("HIGGS dataset downloaded.") 38 | else: 39 | print("HIGGS dataset found locally.") 40 | 41 | colnames = ["label"] + ["feature-%02d" % i for i in range(1, 29)] 42 | 43 | dtrain = RayDMatrix(os.path.abspath(FILENAME_CSV), label="label", names=colnames) 44 | 45 | config = { 46 | "objective": "binary", 47 | "metric": ["binary_logloss", "binary_error"], 48 | } 49 | 50 | evals_result = {} 51 | 52 | start = time.time() 53 | bst = train( 54 | config, 55 | dtrain, 56 | evals_result=evals_result, 57 | ray_params=RayParams(max_actor_restarts=1, num_actors=2), 58 | num_boost_round=100, 59 | evals=[(dtrain, "train")], 60 | ) 61 | taken = time.time() - start 62 | print(f"TRAIN TIME TAKEN: {taken:.2f} seconds") 63 | 64 | bst.booster_.save_model("higgs.lgbm") 65 | print( 66 | "Final training error: {:.4f}".format(evals_result["train"]["binary_error"][-1]) 67 | ) 68 | 69 | 70 | if __name__ == "__main__": 71 | import ray 72 | 73 | ray.init() 74 | 75 | start = time.time() 76 | main() 77 | taken = time.time() - start 78 | print(f"TOTAL TIME TAKEN: {taken:.2f} seconds") 79 | -------------------------------------------------------------------------------- /lightgbm_ray/examples/higgs_parquet.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | import pandas as pd 5 | import pyarrow as pa 6 | import pyarrow.parquet as pq 7 | 8 | from lightgbm_ray import RayDMatrix, RayParams, train 9 | 10 | from .higgs import download_higgs 11 | 12 | FILENAME_CSV = "HIGGS.csv.gz" 13 | FILENAME_PARQUET = "HIGGS.parquet" 14 | 15 | 16 | def csv_to_parquet(in_file, out_file, chunksize=100_000, **csv_kwargs): 17 | if os.path.exists(out_file): 18 | return False 19 | 20 | print(f"Converting CSV {in_file} to PARQUET {out_file}") 21 | csv_stream = pd.read_csv( 22 | in_file, sep=",", chunksize=chunksize, low_memory=False, **csv_kwargs 23 | ) 24 | 25 | parquet_schema = None 26 | parquet_writer = None 27 | for i, chunk in enumerate(csv_stream): 28 | print("Chunk", i) 29 | if not parquet_schema: 30 | # Guess the schema of the CSV file from the first chunk 31 | parquet_schema = pa.Table.from_pandas(df=chunk).schema 32 | # Open a Parquet file for writing 33 | parquet_writer = pq.ParquetWriter( 34 | out_file, parquet_schema, compression="snappy" 35 | ) 36 | # Write CSV chunk to the parquet file 37 | table = pa.Table.from_pandas(chunk, schema=parquet_schema) 38 | parquet_writer.write_table(table) 39 | 40 | parquet_writer.close() 41 | return True 42 | 43 | 44 | def main(): 45 | # Example adapted from this blog post: 46 | # https://medium.com/rapids-ai/a-new-official-dask-api-for-xgboost-e8b10f3d1eb7 47 | # This uses the HIGGS dataset. 
Download here: 48 | # https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz 49 | 50 | if not os.path.exists(FILENAME_PARQUET): 51 | if not os.path.exists(FILENAME_CSV): 52 | download_higgs(FILENAME_CSV) 53 | print("Downloaded HIGGS csv dataset") 54 | print("Converting HIGGS csv dataset to parquet") 55 | csv_to_parquet( 56 | FILENAME_CSV, 57 | FILENAME_PARQUET, 58 | names=[ 59 | "label", 60 | "feature-01", 61 | "feature-02", 62 | "feature-03", 63 | "feature-04", 64 | "feature-05", 65 | "feature-06", 66 | "feature-07", 67 | "feature-08", 68 | "feature-09", 69 | "feature-10", 70 | "feature-11", 71 | "feature-12", 72 | "feature-13", 73 | "feature-14", 74 | "feature-15", 75 | "feature-16", 76 | "feature-17", 77 | "feature-18", 78 | "feature-19", 79 | "feature-20", 80 | "feature-21", 81 | "feature-22", 82 | "feature-23", 83 | "feature-24", 84 | "feature-25", 85 | "feature-26", 86 | "feature-27", 87 | "feature-28", 88 | ], 89 | ) 90 | 91 | colnames = ["label"] + ["feature-%02d" % i for i in range(1, 29)] 92 | 93 | # Here we load the Parquet file 94 | dtrain = RayDMatrix( 95 | os.path.abspath(FILENAME_PARQUET), label="label", columns=colnames 96 | ) 97 | 98 | config = { 99 | "objective": "binary", 100 | "metric": ["binary_logloss", "binary_error"], 101 | } 102 | 103 | evals_result = {} 104 | 105 | start = time.time() 106 | bst = train( 107 | config, 108 | dtrain, 109 | evals_result=evals_result, 110 | ray_params=RayParams(max_actor_restarts=1, num_actors=2), 111 | num_boost_round=100, 112 | evals=[(dtrain, "train")], 113 | ) 114 | taken = time.time() - start 115 | print(f"TRAIN TIME TAKEN: {taken:.2f} seconds") 116 | 117 | bst.booster_.save_model("higgs.lgbm") 118 | print( 119 | "Final training error: {:.4f}".format(evals_result["train"]["binary_error"][-1]) 120 | ) 121 | 122 | 123 | if __name__ == "__main__": 124 | import ray 125 | 126 | ray.init() 127 | 128 | start = time.time() 129 | main() 130 | taken = time.time() - start 131 | print(f"TOTAL TIME TAKEN: {taken:.2f} seconds") 132 | -------------------------------------------------------------------------------- /lightgbm_ray/examples/readme.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa E501 2 | 3 | 4 | def readme_simple(): 5 | from sklearn.datasets import load_breast_cancer 6 | 7 | from lightgbm_ray import RayDMatrix, RayParams, train 8 | 9 | train_x, train_y = load_breast_cancer(return_X_y=True) 10 | train_set = RayDMatrix(train_x, train_y) 11 | 12 | evals_result = {} 13 | bst = train( 14 | { 15 | "objective": "binary", 16 | "metric": ["binary_logloss", "binary_error"], 17 | }, 18 | train_set, 19 | evals_result=evals_result, 20 | valid_sets=[train_set], 21 | valid_names=["train"], 22 | verbose_eval=False, 23 | ray_params=RayParams(num_actors=2, cpus_per_actor=2), 24 | ) 25 | 26 | bst.booster_.save_model("model.lgbm") 27 | print( 28 | "Final training error: {:.4f}".format(evals_result["train"]["binary_error"][-1]) 29 | ) 30 | 31 | 32 | def readme_predict(): 33 | import lightgbm as lgbm 34 | from sklearn.datasets import load_breast_cancer 35 | 36 | from lightgbm_ray import RayDMatrix, RayParams, predict 37 | 38 | data, labels = load_breast_cancer(return_X_y=True) 39 | 40 | dpred = RayDMatrix(data, labels) 41 | 42 | bst = lgbm.Booster(model_file="model.lgbm") 43 | pred_ray = predict(bst, dpred, ray_params=RayParams(num_actors=2)) 44 | 45 | print(pred_ray) 46 | 47 | 48 | def readme_tune(): 49 | from sklearn.datasets import load_breast_cancer 50 | 51 | from 
lightgbm_ray import RayDMatrix, RayParams, train 52 | 53 | num_actors = 2 54 | num_cpus_per_actor = 2 55 | 56 | ray_params = RayParams(num_actors=num_actors, cpus_per_actor=num_cpus_per_actor) 57 | 58 | def train_model(config): 59 | train_x, train_y = load_breast_cancer(return_X_y=True) 60 | train_set = RayDMatrix(train_x, train_y) 61 | 62 | evals_result = {} 63 | bst = train( 64 | params=config, 65 | dtrain=train_set, 66 | evals_result=evals_result, 67 | valid_sets=[train_set], 68 | valid_names=["train"], 69 | verbose_eval=False, 70 | ray_params=ray_params, 71 | ) 72 | bst.booster_.save_model("model.lgbm") 73 | 74 | from ray import tune 75 | 76 | # Specify the hyperparameter search space. 77 | config = { 78 | "objective": "binary", 79 | "metric": ["binary_logloss", "binary_error"], 80 | "eta": tune.loguniform(1e-4, 1e-1), 81 | "subsample": tune.uniform(0.5, 1.0), 82 | "max_depth": tune.randint(1, 9), 83 | } 84 | 85 | # Make sure to use the `get_tune_resources` method to set the `resources_per_trial` 86 | analysis = tune.run( 87 | train_model, 88 | config=config, 89 | metric="train-binary_error", 90 | mode="min", 91 | num_samples=4, 92 | resources_per_trial=ray_params.get_tune_resources(), 93 | ) 94 | print("Best hyperparameters", analysis.best_config) 95 | 96 | 97 | if __name__ == "__main__": 98 | import ray 99 | 100 | ray.init(num_cpus=5) 101 | 102 | print("Readme: Simple example") 103 | readme_simple() 104 | readme_predict() 105 | try: 106 | print("Readme: Ray Tune example") 107 | readme_tune() 108 | except ImportError: 109 | print("Ray Tune not installed.") 110 | -------------------------------------------------------------------------------- /lightgbm_ray/examples/readme_sklearn_api.py: -------------------------------------------------------------------------------- 1 | def readme_sklearn_api(): 2 | from sklearn.datasets import load_breast_cancer 3 | from sklearn.model_selection import train_test_split 4 | 5 | from lightgbm_ray import RayLGBMClassifier, RayParams 6 | 7 | seed = 42 8 | 9 | X, y = load_breast_cancer(return_X_y=True) 10 | X_train, X_test, y_train, y_test = train_test_split( 11 | X, y, train_size=0.25, random_state=42 12 | ) 13 | 14 | clf = RayLGBMClassifier( 15 | n_jobs=2, random_state=seed # In LightGBM-Ray, n_jobs sets the number of actors 16 | ) 17 | 18 | # scikit-learn API will automatically convert the data 19 | # to RayDMatrix format as needed. 20 | # You can also pass X as a RayDMatrix, in which case 21 | # y will be ignored. 
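    # For illustration (hypothetical call, not executed here), the
    # following would train on a pre-built matrix and skip the conversion:
    #   dtrain = RayDMatrix(X_train, y_train)
    #   clf.fit(dtrain, y=None)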
22 | 23 | clf.fit(X_train, y_train) 24 | 25 | pred_ray = clf.predict(X_test) 26 | print(pred_ray) 27 | 28 | pred_proba_ray = clf.predict_proba(X_test) 29 | print(pred_proba_ray) 30 | 31 | # It is also possible to pass a RayParams object 32 | # to fit/predict/predict_proba methods - will override 33 | # n_jobs set during initialization 34 | 35 | clf.fit(X_train, y_train, ray_params=RayParams(num_actors=2)) 36 | 37 | pred_ray = clf.predict(X_test, ray_params=RayParams(num_actors=2)) 38 | print(pred_ray) 39 | 40 | 41 | if __name__ == "__main__": 42 | import ray 43 | 44 | ray.init(num_cpus=5) 45 | 46 | print("Readme: scikit-learn API example") 47 | readme_sklearn_api() 48 | -------------------------------------------------------------------------------- /lightgbm_ray/examples/simple.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import ray 4 | from sklearn import datasets 5 | from sklearn.model_selection import train_test_split 6 | 7 | from lightgbm_ray import RayDMatrix, RayParams, train 8 | 9 | 10 | def main(cpus_per_actor, num_actors): 11 | # Load dataset 12 | data, labels = datasets.load_breast_cancer(return_X_y=True) 13 | # Split into train and test set 14 | train_x, test_x, train_y, test_y = train_test_split(data, labels, test_size=0.25) 15 | 16 | train_set = RayDMatrix(train_x, train_y) 17 | test_set = RayDMatrix(test_x, test_y) 18 | 19 | evals_result = {} 20 | 21 | # Set LGBM config. 22 | lightgbm_params = { 23 | "objective": "binary", 24 | "metric": ["binary_logloss", "binary_error"], 25 | } 26 | 27 | # Train the classifier 28 | bst = train( 29 | params=lightgbm_params, 30 | dtrain=train_set, 31 | valid_sets=[test_set], 32 | valid_names=["eval"], 33 | evals_result=evals_result, 34 | ray_params=RayParams( 35 | max_actor_restarts=0, 36 | gpus_per_actor=0, 37 | cpus_per_actor=cpus_per_actor, 38 | num_actors=num_actors, 39 | ), 40 | verbose_eval=False, 41 | num_boost_round=10, 42 | ) 43 | 44 | model_path = "simple.lgbm" 45 | bst.booster_.save_model(model_path) 46 | print( 47 | "Final validation error: {:.4f}".format( 48 | evals_result["eval"]["binary_error"][-1] 49 | ) 50 | ) 51 | 52 | 53 | if __name__ == "__main__": 54 | parser = argparse.ArgumentParser() 55 | parser.add_argument( 56 | "--address", required=False, type=str, help="the address to use for Ray" 57 | ) 58 | parser.add_argument( 59 | "--server-address", 60 | required=False, 61 | type=str, 62 | help="Address of the remote server if using Ray Client.", 63 | ) 64 | parser.add_argument( 65 | "--cpus-per-actor", 66 | type=int, 67 | default=2, 68 | help="Sets number of CPUs per lightgbm training worker.", 69 | ) 70 | parser.add_argument( 71 | "--num-actors", 72 | type=int, 73 | default=2, 74 | help="Sets number of lightgbm workers to use.", 75 | ) 76 | parser.add_argument("--smoke-test", action="store_true", default=False, help="gpu") 77 | 78 | args, _ = parser.parse_known_args() 79 | 80 | if args.smoke_test: 81 | ray.init(num_cpus=args.num_actors * args.cpus_per_actor) 82 | elif args.server_address: 83 | ray.util.connect(args.server_address) 84 | else: 85 | ray.init(address=args.address) 86 | 87 | main(args.cpus_per_actor, args.num_actors) 88 | -------------------------------------------------------------------------------- /lightgbm_ray/examples/simple_dask.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import ray 6 | from xgboost_ray.data_sources.dask 
import DASK_INSTALLED 7 | 8 | from lightgbm_ray import RayDMatrix, RayParams, train 9 | 10 | 11 | def main(cpus_per_actor, num_actors): 12 | if not DASK_INSTALLED: 13 | print("Dask is not installed. Install with `pip install dask`") 14 | return 15 | 16 | # Local import so the installation check comes first 17 | import dask 18 | import dask.dataframe as dd 19 | from ray.util.dask import ray_dask_get 20 | 21 | dask.config.set(scheduler=ray_dask_get) 22 | 23 | # Generate dataset 24 | x = np.repeat(range(8), 16).reshape((32, 4)) 25 | # Even numbers --> 0, odd numbers --> 1 26 | y = np.tile(np.repeat(range(2), 4), 4) 27 | 28 | # Flip some bits to reduce max accuracy 29 | bits_to_flip = np.random.choice(32, size=6, replace=False) 30 | y[bits_to_flip] = 1 - y[bits_to_flip] 31 | 32 | data = pd.DataFrame(x) 33 | data["label"] = y 34 | 35 | # Split into 4 partitions 36 | dask_df = dd.from_pandas(data, npartitions=4) 37 | 38 | train_set = RayDMatrix(dask_df, "label") 39 | 40 | evals_result = {} 41 | # Set XGBoost config. 42 | lightgbm_params = { 43 | "objective": "binary", 44 | "metric": ["binary_logloss", "binary_error"], 45 | } 46 | 47 | # Train the classifier 48 | bst = train( 49 | params=lightgbm_params, 50 | dtrain=train_set, 51 | valid_sets=[train_set], 52 | valid_names=["train"], 53 | evals_result=evals_result, 54 | ray_params=RayParams( 55 | max_actor_restarts=0, 56 | gpus_per_actor=0, 57 | cpus_per_actor=cpus_per_actor, 58 | num_actors=num_actors, 59 | ), 60 | verbose_eval=False, 61 | num_boost_round=10, 62 | ) 63 | 64 | model_path = "dask.lgbm" 65 | bst.booster_.save_model(model_path) 66 | print( 67 | "Final training error: {:.4f}".format(evals_result["train"]["binary_error"][-1]) 68 | ) 69 | 70 | 71 | if __name__ == "__main__": 72 | parser = argparse.ArgumentParser() 73 | parser.add_argument( 74 | "--address", required=False, type=str, help="the address to use for Ray" 75 | ) 76 | parser.add_argument( 77 | "--server-address", 78 | required=False, 79 | type=str, 80 | help="Address of the remote server if using Ray Client.", 81 | ) 82 | parser.add_argument( 83 | "--cpus-per-actor", 84 | type=int, 85 | default=2, 86 | help="Sets number of CPUs per lightgbm training worker.", 87 | ) 88 | parser.add_argument( 89 | "--num-actors", 90 | type=int, 91 | default=2, 92 | help="Sets number of lightgbm workers to use.", 93 | ) 94 | parser.add_argument("--smoke-test", action="store_true", default=False, help="gpu") 95 | 96 | args, _ = parser.parse_known_args() 97 | 98 | if args.smoke_test: 99 | ray.init(num_cpus=args.num_actors * args.cpus_per_actor) 100 | elif args.server_address: 101 | ray.util.connect(args.server_address) 102 | else: 103 | ray.init(address=args.address) 104 | 105 | main(args.cpus_per_actor, args.num_actors) 106 | -------------------------------------------------------------------------------- /lightgbm_ray/examples/simple_modin.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import ray 6 | from packaging.version import Version 7 | from sklearn.utils import shuffle 8 | from xgboost_ray.data_sources.modin import MODIN_INSTALLED 9 | 10 | from lightgbm_ray import RayDMatrix, RayParams, train 11 | 12 | 13 | def main(cpus_per_actor, num_actors): 14 | if not MODIN_INSTALLED: 15 | print( 16 | "Modin is not installed or installed in a version that is not " 17 | "compatible with lightgbm_ray (< 0.9.0)." 
18 | ) 19 | return 20 | 21 | import modin 22 | 23 | if Version(modin.__version__) < Version("0.16.0") and Version( 24 | ray.__version__ 25 | ) >= Version("2.6.0"): 26 | print("modin<=0.16.0 is not compatible with ray>=2.6.0.") 27 | return 28 | 29 | # Import modin after initializing Ray 30 | from modin.distributed.dataframe.pandas import from_partitions 31 | 32 | # Generate dataset 33 | x = np.repeat(range(8), 16).reshape((32, 4)) 34 | # Even numbers --> 0, odd numbers --> 1 35 | y = np.tile(np.repeat(range(2), 4), 4) 36 | 37 | # Flip some bits to reduce max accuracy 38 | bits_to_flip = np.random.choice(32, size=6, replace=False) 39 | y[bits_to_flip] = 1 - y[bits_to_flip] 40 | 41 | # LightGBM requires well-shuffled data 42 | x, y = shuffle(x, y, random_state=1) 43 | 44 | data = pd.DataFrame(x) 45 | data["label"] = y 46 | 47 | # Split into 4 partitions 48 | partitions = [ray.put(part) for part in np.split(data, 4)] 49 | 50 | # Create modin df here 51 | modin_df = from_partitions(partitions, axis=0) 52 | 53 | train_set = RayDMatrix(modin_df, "label") 54 | 55 | evals_result = {} 56 | # Set LGBM config. 57 | lightgbm_params = { 58 | "objective": "binary", 59 | "metric": ["binary_logloss", "binary_error"], 60 | } 61 | 62 | # Train the classifier 63 | bst = train( 64 | params=lightgbm_params, 65 | dtrain=train_set, 66 | valid_sets=[train_set], 67 | valid_names=["train"], 68 | evals_result=evals_result, 69 | ray_params=RayParams( 70 | max_actor_restarts=0, 71 | gpus_per_actor=0, 72 | cpus_per_actor=cpus_per_actor, 73 | num_actors=num_actors, 74 | ), 75 | verbose_eval=False, 76 | num_boost_round=10, 77 | ) 78 | 79 | model_path = "modin.lgbm" 80 | bst.booster_.save_model(model_path) 81 | print( 82 | "Final training error: {:.4f}".format(evals_result["train"]["binary_error"][-1]) 83 | ) 84 | 85 | 86 | if __name__ == "__main__": 87 | parser = argparse.ArgumentParser() 88 | parser.add_argument( 89 | "--address", required=False, type=str, help="the address to use for Ray" 90 | ) 91 | parser.add_argument( 92 | "--server-address", 93 | required=False, 94 | type=str, 95 | help="Address of the remote server if using Ray Client.", 96 | ) 97 | parser.add_argument( 98 | "--cpus-per-actor", 99 | type=int, 100 | default=2, 101 | help="Sets number of CPUs per lightgbm training worker.", 102 | ) 103 | parser.add_argument( 104 | "--num-actors", 105 | type=int, 106 | default=2, 107 | help="Sets number of lightgbm workers to use.", 108 | ) 109 | parser.add_argument("--smoke-test", action="store_true", default=False, help="gpu") 110 | 111 | args, _ = parser.parse_known_args() 112 | 113 | if args.smoke_test: 114 | ray.init(num_cpus=(args.num_actors * args.cpus_per_actor) + 1) 115 | elif args.server_address: 116 | ray.util.connect(args.server_address) 117 | else: 118 | ray.init(address=args.address) 119 | 120 | main(args.cpus_per_actor, args.num_actors) 121 | -------------------------------------------------------------------------------- /lightgbm_ray/examples/simple_predict.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import lightgbm as lgbm 4 | import numpy as np 5 | from sklearn import datasets 6 | 7 | from lightgbm_ray import RayDMatrix, RayParams, predict 8 | 9 | 10 | def main(): 11 | if not os.path.exists("simple.lgbm"): 12 | raise ValueError( 13 | "Model file not found: `simple.lgbm`" 14 | "\nFIX THIS by running `python `simple.py` first to " 15 | "train the model." 
16 | ) 17 | 18 | # Load dataset 19 | data, labels = datasets.load_breast_cancer(return_X_y=True) 20 | 21 | dmat_ray = RayDMatrix(data, labels) 22 | 23 | bst = lgbm.Booster(model_file="simple.lgbm") 24 | 25 | pred_lgbm = bst.predict(data) 26 | pred_ray = predict(bst, dmat_ray, ray_params=RayParams(num_actors=2)) 27 | 28 | np.testing.assert_array_equal(pred_lgbm, pred_ray) 29 | print(pred_ray) 30 | 31 | 32 | if __name__ == "__main__": 33 | main() 34 | -------------------------------------------------------------------------------- /lightgbm_ray/examples/simple_ray_dataset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import ray 6 | from sklearn.utils import shuffle 7 | 8 | from lightgbm_ray import RayDMatrix, RayParams, train 9 | 10 | 11 | def main(cpus_per_actor, num_actors): 12 | # Generate dataset 13 | x = np.repeat(range(8), 16).reshape((32, 4)) 14 | # Even numbers --> 0, odd numbers --> 1 15 | y = np.tile(np.repeat(range(2), 4), 4) 16 | 17 | # Flip some bits to reduce max accuracy 18 | bits_to_flip = np.random.choice(32, size=6, replace=False) 19 | y[bits_to_flip] = 1 - y[bits_to_flip] 20 | 21 | # LightGBM requires well-shuffled data 22 | x, y = shuffle(x, y, random_state=1) 23 | 24 | data = pd.DataFrame(x) 25 | # Ray Datasets require all column names to be strings 26 | data.columns = [str(c) for c in data.columns] 27 | data["label"] = y 28 | 29 | # There was a recent API change - the first clause covers the new 30 | # and current Ray master API. 31 | if hasattr(ray.data, "from_pandas_refs"): 32 | # Generate Ray dataset from 4 partitions 33 | ray_ds = ray.data.from_pandas(np.split(data, 4)) 34 | else: 35 | # Split into 4 partitions 36 | partitions = [ray.put(part) for part in np.split(data, 4)] 37 | ray_ds = ray.data.from_pandas(partitions) 38 | 39 | 40 | 41 | 42 | train_set = RayDMatrix(ray_ds, "label") 43 | 44 | evals_result = {} 45 | # Set LightGBM config.
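    # The "binary" objective trains a logistic-loss classifier;
    # binary_error is the misclassification rate at the default 0.5
    # threshold, and both metrics are recorded per boosting round.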
46 | lightgbm_params = { 47 | "objective": "binary", 48 | "metric": ["binary_logloss", "binary_error"], 49 | } 50 | 51 | # Train the classifier 52 | bst = train( 53 | params=lightgbm_params, 54 | dtrain=train_set, 55 | valid_sets=[train_set], 56 | valid_names=["train"], 57 | evals_result=evals_result, 58 | ray_params=RayParams( 59 | max_actor_restarts=0, 60 | gpus_per_actor=0, 61 | cpus_per_actor=cpus_per_actor, 62 | num_actors=num_actors, 63 | ), 64 | verbose_eval=False, 65 | num_boost_round=10, 66 | ) 67 | 68 | model_path = "ray_datasets.lgbm" 69 | bst.booster_.save_model(model_path) 70 | print( 71 | "Final training error: {:.4f}".format(evals_result["train"]["binary_error"][-1]) 72 | ) 73 | 74 | 75 | if __name__ == "__main__": 76 | parser = argparse.ArgumentParser() 77 | parser.add_argument( 78 | "--address", required=False, type=str, help="the address to use for Ray" 79 | ) 80 | parser.add_argument( 81 | "--server-address", 82 | required=False, 83 | type=str, 84 | help="Address of the remote server if using Ray Client.", 85 | ) 86 | parser.add_argument( 87 | "--cpus-per-actor", 88 | type=int, 89 | default=2, 90 | help="Sets number of CPUs per lightgbm training worker.", 91 | ) 92 | parser.add_argument( 93 | "--num-actors", 94 | type=int, 95 | default=2, 96 | help="Sets number of lightgbm workers to use.", 97 | ) 98 | parser.add_argument("--smoke-test", action="store_true", default=False, help="gpu") 99 | 100 | args, _ = parser.parse_known_args() 101 | 102 | if args.smoke_test: 103 | ray.init(num_cpus=(args.num_actors * args.cpus_per_actor) + 1) 104 | elif args.server_address: 105 | ray.util.connect(args.server_address) 106 | else: 107 | ray.init(address=args.address) 108 | 109 | main(args.cpus_per_actor, args.num_actors) 110 | -------------------------------------------------------------------------------- /lightgbm_ray/examples/simple_tune.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import ray 5 | from ray import tune 6 | from sklearn import datasets 7 | from sklearn.model_selection import train_test_split 8 | 9 | import lightgbm_ray 10 | from lightgbm_ray import RayDMatrix, RayParams, train 11 | 12 | 13 | def train_breast_cancer(config, ray_params): 14 | # Load dataset 15 | data, labels = datasets.load_breast_cancer(return_X_y=True) 16 | # Split into train and test set 17 | train_x, test_x, train_y, test_y = train_test_split(data, labels, test_size=0.25) 18 | 19 | train_set = RayDMatrix(train_x, train_y) 20 | test_set = RayDMatrix(test_x, test_y) 21 | 22 | evals_result = {} 23 | 24 | bst = train( 25 | params=config, 26 | dtrain=train_set, 27 | valid_sets=[test_set], 28 | valid_names=["eval"], 29 | evals_result=evals_result, 30 | ray_params=ray_params, 31 | verbose_eval=False, 32 | num_boost_round=10, 33 | ) 34 | 35 | model_path = "tuned.lgbm" 36 | bst.booster_.save_model(model_path) 37 | print( 38 | "Final validation error: {:.4f}".format( 39 | evals_result["eval"]["binary_error"][-1] 40 | ) 41 | ) 42 | 43 | 44 | def main(cpus_per_actor, num_actors, num_samples): 45 | # Set LightGBM config. 
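    # The dict mixes fixed LightGBM params with Tune search spaces: each
    # trial samples eta log-uniformly from [1e-4, 1e-1], subsample
    # uniformly from [0.5, 1.0], and an integer max_depth from 1 to 8.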
46 | config = { 47 | "objective": "binary", 48 | "metric": ["binary_logloss", "binary_error"], 49 | "eta": tune.loguniform(1e-4, 1e-1), 50 | "subsample": tune.uniform(0.5, 1.0), 51 | "max_depth": tune.randint(1, 9), 52 | } 53 | 54 | ray_params = RayParams( 55 | max_actor_restarts=1, 56 | gpus_per_actor=0, 57 | cpus_per_actor=cpus_per_actor, 58 | num_actors=num_actors, 59 | ) 60 | 61 | analysis = tune.run( 62 | tune.with_parameters(train_breast_cancer, ray_params=ray_params), 63 | # Use the `get_tune_resources` helper function to set the resources. 64 | resources_per_trial=ray_params.get_tune_resources(), 65 | config=config, 66 | num_samples=num_samples, 67 | metric="eval-binary_error", 68 | mode="min", 69 | ) 70 | 71 | # Load the best model checkpoint. 72 | best_bst = lightgbm_ray.tune.load_model( 73 | os.path.join(analysis.best_trial.local_path, "tuned.lgbm") 74 | ) 75 | 76 | best_bst.save_model("best_model.lgbm") 77 | 78 | accuracy = 1.0 - analysis.best_result["eval-binary_error"] 79 | print(f"Best model parameters: {analysis.best_config}") 80 | print(f"Best model total accuracy: {accuracy:.4f}") 81 | 82 | 83 | if __name__ == "__main__": 84 | parser = argparse.ArgumentParser() 85 | parser.add_argument( 86 | "--address", required=False, type=str, help="the address to use for Ray" 87 | ) 88 | parser.add_argument( 89 | "--server-address", 90 | required=False, 91 | type=str, 92 | help="Address of the remote server if using Ray Client.", 93 | ) 94 | parser.add_argument( 95 | "--cpus-per-actor", 96 | type=int, 97 | default=2, 98 | help="Sets number of CPUs per LightGBM training worker.", 99 | ) 100 | parser.add_argument( 101 | "--num-actors", 102 | type=int, 103 | default=2, 104 | help="Sets number of LightGBM workers to use.", 105 | ) 106 | parser.add_argument( 107 | "--num-samples", type=int, default=4, help="Number of samples to use for Tune." 108 | ) 109 | parser.add_argument("--smoke-test", action="store_true", default=False) 110 | 111 | args, _ = parser.parse_known_args() 112 | 113 | if args.smoke_test: 114 | ray.init(num_cpus=args.num_actors * max(args.num_samples, args.cpus_per_actor)) 115 | elif args.server_address: 116 | ray.util.connect(args.server_address) 117 | else: 118 | ray.init(address=args.address) 119 | 120 | main(args.cpus_per_actor, args.num_actors, args.num_samples) 121 | -------------------------------------------------------------------------------- /lightgbm_ray/examples/train_on_test_data.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import shutil 4 | import time 5 | 6 | from xgboost_ray.tests.utils import create_parquet_in_tempdir 7 | 8 | from lightgbm_ray import RayDMatrix, RayParams, train 9 | 10 | #### 11 | # Run `create_test_data.py` first to create a large fake data set. 12 | # Alternatively, run with `--smoke-test` to create an ephemeral small fake 13 | # data set. 
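# With `--smoke-test`, a throwaway 1,000-row parquet dataset with two
# partitions is created in a temporary directory and removed again on exit.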
14 | #### 15 | 16 | 17 | def main(fname, num_actors=2): 18 | dtrain = RayDMatrix(os.path.abspath(fname), label="labels", ignore=["partition"]) 19 | 20 | config = { 21 | "objective": "binary", 22 | "metric": ["binary_logloss", "binary_error"], 23 | } 24 | 25 | evals_result = {} 26 | 27 | start = time.time() 28 | bst = train( 29 | config, 30 | dtrain, 31 | evals_result=evals_result, 32 | ray_params=RayParams(max_actor_restarts=1, num_actors=num_actors), 33 | num_boost_round=10, 34 | evals=[(dtrain, "train")], 35 | ) 36 | taken = time.time() - start 37 | print(f"TRAIN TIME TAKEN: {taken:.2f} seconds") 38 | 39 | bst.booster_.save_model("test_data.lgbm") 40 | print( 41 | "Final training error: {:.4f}".format(evals_result["train"]["binary_error"][-1]) 42 | ) 43 | 44 | 45 | if __name__ == "__main__": 46 | parser = argparse.ArgumentParser() 47 | parser.add_argument( 48 | "--smoke-test", 49 | action="store_true", 50 | default=False, 51 | help="Finish quickly for testing", 52 | ) 53 | args = parser.parse_args() 54 | 55 | temp_dir, path = None, None 56 | if args.smoke_test: 57 | temp_dir, path = create_parquet_in_tempdir( 58 | "smoketest.parquet", 59 | num_rows=1_000, 60 | num_features=4, 61 | num_classes=2, 62 | num_partitions=2, 63 | ) 64 | else: 65 | path = os.path.join(os.path.dirname(__file__), "parted.parquet") 66 | 67 | import ray 68 | 69 | ray.init(num_cpus=5) 70 | 71 | start = time.time() 72 | main(path) 73 | taken = time.time() - start 74 | print(f"TOTAL TIME TAKEN: {taken:.2f} seconds") 75 | 76 | if args.smoke_test: 77 | shutil.rmtree(temp_dir) 78 | -------------------------------------------------------------------------------- /lightgbm_ray/examples/train_with_ml_dataset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import shutil 4 | import time 5 | 6 | from ray.util.data import read_parquet 7 | from xgboost_ray.tests.utils import create_parquet_in_tempdir 8 | 9 | from lightgbm_ray import RayDMatrix, RayParams, train 10 | 11 | #### 12 | # Run `create_test_data.py` first to create a large fake data set. 13 | # Alternatively, run with `--smoke-test` to create an ephemeral small fake 14 | # data set. 
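# Unlike train_on_test_data.py, this script wraps the parquet data in a
# Ray MLDataset via `read_parquet(fname, num_shards=num_actors)`, so the
# data arrives pre-sharded, one shard per training actor.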
15 | #### 16 | 17 | 18 | def main(fname, num_actors=2): 19 | ml_dataset = read_parquet(fname, num_shards=num_actors) 20 | 21 | dtrain = RayDMatrix(ml_dataset, label="labels", ignore=["partition"]) 22 | 23 | config = { 24 | "objective": "binary", 25 | "metric": ["binary_logloss", "binary_error"], 26 | } 27 | 28 | evals_result = {} 29 | 30 | start = time.time() 31 | bst = train( 32 | config, 33 | dtrain, 34 | evals_result=evals_result, 35 | ray_params=RayParams(max_actor_restarts=1, num_actors=num_actors), 36 | num_boost_round=10, 37 | evals=[(dtrain, "train")], 38 | ) 39 | taken = time.time() - start 40 | print(f"TRAIN TIME TAKEN: {taken:.2f} seconds") 41 | 42 | bst.save_model("test_data.lgbm") 43 | print( 44 | "Final training error: {:.4f}".format(evals_result["train"]["binary_error"][-1]) 45 | ) 46 | 47 | 48 | if __name__ == "__main__": 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument( 51 | "--smoke-test", 52 | action="store_true", 53 | default=False, 54 | help="Finish quickly for testing", 55 | ) 56 | args = parser.parse_args() 57 | 58 | temp_dir, path = None, None 59 | if args.smoke_test: 60 | temp_dir, path = create_parquet_in_tempdir( 61 | "smoketest.parquet", 62 | num_rows=1_000, 63 | num_features=4, 64 | num_classes=2, 65 | num_partitions=2, 66 | ) 67 | else: 68 | path = os.path.join(os.path.dirname(__file__), "parted.parquet") 69 | 70 | import ray 71 | 72 | ray.init() 73 | 74 | start = time.time() 75 | main(path) 76 | taken = time.time() - start 77 | print(f"TOTAL TIME TAKEN: {taken:.2f} seconds") 78 | 79 | if args.smoke_test: 80 | shutil.rmtree(temp_dir) 81 | -------------------------------------------------------------------------------- /lightgbm_ray/sklearn.py: -------------------------------------------------------------------------------- 1 | """scikit-learn wrapper for lightgbm-ray. Based on lightgbm.dask.""" 2 | 3 | # Portions of code used in this file and implementation logic are based 4 | # on lightgbm.dask. 5 | # https://github.com/microsoft/LightGBM/blob/b5502d19b2b462f665e3d1edbaa70c0d6472bca4/python-package/lightgbm/dask.py 6 | 7 | # The MIT License (MIT) 8 | 9 | # Copyright (c) Microsoft Corporation 10 | 11 | # Permission is hereby granted, free of charge, to any person obtaining a copy 12 | # of this software and associated documentation files (the "Software"), to deal 13 | # in the Software without restriction, including without limitation the rights 14 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 15 | # copies of the Software, and to permit persons to whom the Software is 16 | # furnished to do so, subject to the following conditions: 17 | 18 | # The above copyright notice and this permission notice shall be included in 19 | # all copies or substantial portions of the Software. 20 | 21 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 23 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 24 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 25 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 26 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 27 | # SOFTWARE. 
28 | 29 | # License: 30 | # https://github.com/microsoft/LightGBM/blob/c3b9363d02564625332583e166e3ab3135f436e3/LICENSE 31 | 32 | import logging 33 | import warnings 34 | from typing import Any, Callable, Dict, List, Optional, Type, Union 35 | 36 | from lightgbm import LGBMClassifier, LGBMModel, LGBMRegressor # LGBMRanker 37 | from lightgbm.basic import _choose_param_value, _ConfigAliases 38 | from ray.util.annotations import PublicAPI 39 | from xgboost_ray.sklearn import ( 40 | RayXGBMixin, 41 | _check_if_params_are_ray_dmatrix, 42 | _wrap_evaluation_matrices, 43 | ) 44 | 45 | from lightgbm_ray.main import RayDMatrix, RayParams, predict, train 46 | 47 | logger = logging.getLogger(__name__) 48 | 49 | _RAY_PARAMS_DOC = """ 50 | ray_params : RayParams or dict, optional (default=None) 51 | Parameters to configure Ray-specific behavior. 52 | See :class:`RayParams` for a list of valid configuration parameters. 53 | Will override the ``n_jobs`` attribute with its own ``num_actors`` parameter. 54 | _remote : bool, optional (default=False) 55 | Whether to run the driver process in a remote function. 56 | This is enabled by default in Ray client mode. 57 | ray_dmatrix_params : dict, optional (default=None) 58 | Dict of parameters (such as sharding mode) passed to the internal 59 | RayDMatrix initialization.""" 60 | 61 | _N_JOBS_DOC_REPLACE = ( 62 | """ n_jobs : int, optional (default=-1) 63 | Number of parallel threads.""", # noqa: E501, W291 64 | """ n_jobs : int, optional (default=1) 65 | Number of Ray actors used to run LightGBM in parallel. 66 | In order to set the number of threads per actor, pass a :class:`RayParams` 67 | object to the relevant method as a ``ray_params`` argument. Will be 68 | overridden by the ``num_actors`` parameter of the ``ray_params`` argument 69 | should it be passed to a method.""", # noqa: E501, W291 70 | ) 71 | 72 | 73 | def _treat_estimator_doc(doc: str) -> str: 74 | """Helper function to make necessary changes in estimator docstrings""" 75 | doc = doc.replace(*_N_JOBS_DOC_REPLACE).replace( 76 | "Construct a gradient boosting model.", 77 | "Construct a gradient boosting model distributed on Ray.", 78 | ) 79 | return doc 80 | 81 | 82 | def _treat_method_doc(doc: str, insert_before: str) -> str: 83 | """Helper function to make changes in estimator method docstrings""" 84 | doc = ( 85 | doc[: doc.find(insert_before)] 86 | + _RAY_PARAMS_DOC 87 | + doc[doc.find(insert_before) :] 88 | ) 89 | return doc 90 | 91 | 92 | class _RayLGBMModel(RayXGBMixin): 93 | def _ray_get_wrap_evaluation_matrices_compat_kwargs( 94 | self, label_transform=None 95 | ) -> dict: 96 | self.enable_categorical = False 97 | self.feature_types = None 98 | return super()._ray_get_wrap_evaluation_matrices_compat_kwargs( 99 | label_transform=label_transform 100 | ) 101 | 102 | def _ray_set_ray_params_n_jobs( 103 | self, ray_params: Optional[Union[RayParams, dict]], n_jobs: Optional[int] 104 | ) -> RayParams: 105 | """Helper function to set num_actors in ray_params if not 106 | set by the user""" 107 | if ray_params is None: 108 | if not n_jobs or n_jobs < 1: 109 | n_jobs = 1 110 | ray_params = RayParams(num_actors=n_jobs) 111 | elif n_jobs is not None: 112 | warnings.warn( 113 | "`ray_params` is not `None` and will override " 114 | "the `n_jobs` attribute."
115 | ) 116 | return ray_params 117 | 118 | def _ray_fit( 119 | self, 120 | model_factory: Type[LGBMModel], 121 | X, 122 | y, 123 | sample_weight=None, 124 | init_score=None, 125 | group=None, 126 | eval_set=None, 127 | eval_names: Optional[List[str]] = None, 128 | eval_sample_weight=None, 129 | eval_init_score=None, 130 | eval_group=None, 131 | eval_metric: Optional[Union[Callable, str, List[Union[Callable, str]]]] = None, 132 | ray_params: Union[None, RayParams, Dict] = None, 133 | _remote: Optional[bool] = None, 134 | ray_dmatrix_params: Optional[Dict] = None, 135 | **kwargs: Any, 136 | ) -> "_RayLGBMModel": 137 | 138 | params = self.get_params(True) 139 | 140 | ray_params = self._ray_set_ray_params_n_jobs(ray_params, params["n_jobs"]) 141 | 142 | params = _choose_param_value( 143 | main_param_name="n_estimators", params=params, default_value=100 144 | ) 145 | 146 | num_boosting_round = params.pop("n_estimators") 147 | ray_dmatrix_params = ray_dmatrix_params or {} 148 | 149 | train_dmatrix, evals = _check_if_params_are_ray_dmatrix( 150 | X, sample_weight, init_score, eval_set, eval_sample_weight, eval_init_score 151 | ) 152 | 153 | if train_dmatrix is None: 154 | train_dmatrix, evals = _wrap_evaluation_matrices( 155 | missing=None, 156 | X=X, 157 | y=y, 158 | group=group, 159 | qid=None, 160 | sample_weight=sample_weight, 161 | base_margin=init_score, 162 | feature_weights=None, 163 | eval_set=eval_set, 164 | sample_weight_eval_set=eval_sample_weight, 165 | base_margin_eval_set=eval_init_score, 166 | eval_group=eval_group, 167 | eval_qid=None, 168 | # changed in xgboost-ray: 169 | create_dmatrix=lambda **kwargs: RayDMatrix( 170 | **{ 171 | **kwargs, 172 | **ray_dmatrix_params, 173 | } 174 | ), 175 | **self._ray_get_wrap_evaluation_matrices_compat_kwargs(), 176 | ) 177 | 178 | eval_names = eval_names or [] 179 | 180 | for i, _ in enumerate(evals): 181 | if len(eval_names) > i: 182 | evals[i] = (evals[i][0], eval_names[i]) 183 | else: 184 | # _wrap_evaluation_matrices sets default names to 185 | # `validation_`, but lgbm uses `valid_`, so 186 | # we fix that here 187 | evals[i] = (evals[i][0], f"valid_{i}") 188 | 189 | for param in _ConfigAliases.get("n_jobs"): 190 | params.pop(param, None) 191 | 192 | model = train( 193 | dtrain=train_dmatrix, 194 | num_boost_round=num_boosting_round, 195 | params=params, 196 | model_factory=model_factory, 197 | evals=evals, 198 | eval_metric=eval_metric, 199 | ray_params=ray_params, 200 | _remote=_remote, 201 | **kwargs, 202 | ) 203 | 204 | self.set_params(**model.get_params()) 205 | self._lgb_ray_copy_extra_params(model, self) 206 | 207 | return self 208 | 209 | def _ray_predict( 210 | self, 211 | X, 212 | model_factory: Type[LGBMModel], 213 | *, 214 | method: str = "predict", 215 | ray_params: Union[None, RayParams, Dict] = None, 216 | _remote: Optional[bool] = None, 217 | ray_dmatrix_params: Optional[Dict], 218 | **kwargs, 219 | ): 220 | params = self.get_params(True) 221 | ray_params = self._ray_set_ray_params_n_jobs(ray_params, params["n_jobs"]) 222 | 223 | ray_dmatrix_params = ray_dmatrix_params or {} 224 | if not isinstance(X, RayDMatrix): 225 | test = RayDMatrix(X, **ray_dmatrix_params) 226 | else: 227 | test = X 228 | return predict( 229 | self._lgb_ray_to_local(model_factory), 230 | data=test, 231 | method=method, 232 | ray_params=ray_params, 233 | _remote=_remote, 234 | **kwargs, 235 | ) 236 | 237 | def _lgb_ray_to_local(self, model_factory: Type[LGBMModel]) -> LGBMModel: 238 | params = self.get_params() 239 | model = model_factory(**params) 
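        # get_params() only covers constructor arguments, so fitted state
        # (e.g. the underlying booster) is copied onto the local model
        # separately below.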
240 | self._lgb_ray_copy_extra_params(self, model) 241 | return model 242 | 243 | @staticmethod 244 | def _lgb_ray_copy_extra_params( 245 | source: Union["_RayLGBMModel", LGBMModel], 246 | dest: Union["_RayLGBMModel", LGBMModel], 247 | ) -> None: 248 | params = source.get_params() 249 | attributes = source.__dict__ 250 | extra_param_names = set(attributes.keys()).difference(params.keys()) 251 | for name in extra_param_names: 252 | setattr(dest, name, attributes[name]) 253 | 254 | 255 | @PublicAPI(stability="beta") 256 | class RayLGBMClassifier(LGBMClassifier, _RayLGBMModel): 257 | def fit( 258 | self, 259 | X, 260 | y, 261 | sample_weight=None, 262 | init_score=None, 263 | eval_set=None, 264 | eval_names: Optional[List[str]] = None, 265 | eval_sample_weight=None, 266 | eval_class_weight: Optional[List[Union[dict, str]]] = None, 267 | eval_init_score=None, 268 | eval_metric: Optional[Union[Callable, str, List[Union[Callable, str]]]] = None, 269 | ray_params: Union[None, RayParams, Dict] = None, 270 | _remote: Optional[bool] = None, 271 | ray_dmatrix_params: Optional[Dict] = None, 272 | **kwargs: Any, 273 | ) -> "RayLGBMClassifier": 274 | return self._ray_fit( 275 | model_factory=LGBMClassifier, 276 | X=X, 277 | y=y, 278 | sample_weight=sample_weight, 279 | init_score=init_score, 280 | eval_set=eval_set, 281 | eval_names=eval_names, 282 | eval_sample_weight=eval_sample_weight, 283 | eval_class_weight=eval_class_weight, 284 | eval_init_score=eval_init_score, 285 | eval_metric=eval_metric, 286 | ray_params=ray_params, 287 | _remote=_remote, 288 | ray_dmatrix_params=ray_dmatrix_params, 289 | **kwargs, 290 | ) 291 | 292 | fit.__doc__ = _treat_method_doc(LGBMClassifier.fit.__doc__, "\n\n Returns") 293 | 294 | def predict_proba( 295 | self, 296 | X, 297 | *, 298 | ray_params: Union[None, RayParams, Dict] = None, 299 | _remote: Optional[bool] = None, 300 | ray_dmatrix_params: Optional[Dict] = None, 301 | **kwargs, 302 | ): 303 | return self._ray_predict( 304 | X, 305 | model_factory=LGBMClassifier, 306 | method="predict_proba", 307 | ray_params=ray_params, 308 | _remote=_remote, 309 | ray_dmatrix_params=ray_dmatrix_params, 310 | **kwargs, 311 | ) 312 | 313 | predict_proba.__doc__ = _treat_method_doc( 314 | LGBMClassifier.predict_proba.__doc__, "\n **kwargs" 315 | ) 316 | 317 | def predict( 318 | self, 319 | X, 320 | *, 321 | ray_params: Union[None, RayParams, Dict] = None, 322 | _remote: Optional[bool] = None, 323 | ray_dmatrix_params: Optional[Dict] = None, 324 | **kwargs, 325 | ): 326 | return self._ray_predict( 327 | X, 328 | model_factory=LGBMClassifier, 329 | method="predict", 330 | ray_params=ray_params, 331 | _remote=_remote, 332 | ray_dmatrix_params=ray_dmatrix_params, 333 | **kwargs, 334 | ) 335 | 336 | predict.__doc__ = _treat_method_doc( 337 | LGBMClassifier.predict.__doc__, "\n **kwargs" 338 | ) 339 | 340 | def to_local(self) -> LGBMClassifier: 341 | """Create regular version of lightgbm.LGBMClassifier from the 342 | distributed version. 343 | 344 | Returns 345 | ------- 346 | model : lightgbm.LGBMClassifier 347 | Local underlying model. 
348 | """ 349 | return self._lgb_ray_to_local(LGBMClassifier) 350 | 351 | 352 | RayLGBMClassifier.__init__.__doc__ = _treat_estimator_doc( 353 | LGBMClassifier.__init__.__doc__ 354 | ) 355 | 356 | 357 | @PublicAPI(stability="beta") 358 | class RayLGBMRegressor(LGBMRegressor, _RayLGBMModel): 359 | def fit( 360 | self, 361 | X, 362 | y, 363 | sample_weight=None, 364 | init_score=None, 365 | eval_set=None, 366 | eval_names: Optional[List[str]] = None, 367 | eval_sample_weight=None, 368 | eval_init_score=None, 369 | eval_metric: Optional[Union[Callable, str, List[Union[Callable, str]]]] = None, 370 | ray_params: Union[None, RayParams, Dict] = None, 371 | _remote: Optional[bool] = None, 372 | ray_dmatrix_params: Optional[Dict] = None, 373 | **kwargs: Any, 374 | ) -> "RayLGBMRegressor": 375 | return self._ray_fit( 376 | model_factory=LGBMRegressor, 377 | X=X, 378 | y=y, 379 | sample_weight=sample_weight, 380 | init_score=init_score, 381 | eval_set=eval_set, 382 | eval_names=eval_names, 383 | eval_sample_weight=eval_sample_weight, 384 | eval_init_score=eval_init_score, 385 | eval_metric=eval_metric, 386 | ray_params=ray_params, 387 | _remote=_remote, 388 | ray_dmatrix_params=ray_dmatrix_params, 389 | **kwargs, 390 | ) 391 | 392 | fit.__doc__ = _treat_method_doc(LGBMRegressor.fit.__doc__, "\n\n Returns") 393 | 394 | def predict( 395 | self, 396 | X, 397 | *, 398 | ray_params: Union[None, RayParams, Dict] = None, 399 | _remote: Optional[bool] = None, 400 | ray_dmatrix_params: Optional[Dict] = None, 401 | **kwargs, 402 | ): 403 | return self._ray_predict( 404 | X, 405 | model_factory=LGBMRegressor, 406 | method="predict", 407 | ray_params=ray_params, 408 | _remote=_remote, 409 | ray_dmatrix_params=ray_dmatrix_params, 410 | **kwargs, 411 | ) 412 | 413 | predict.__doc__ = _treat_method_doc(LGBMRegressor.predict.__doc__, "\n **kwargs") 414 | 415 | def to_local(self) -> LGBMRegressor: 416 | """Create regular version of lightgbm.LGBMRegressor from the 417 | distributed version. 418 | 419 | Returns 420 | ------- 421 | model : lightgbm.LGBMRegressor 422 | Local underlying model. 
423 | """ 424 | return self._lgb_ray_to_local(LGBMRegressor) 425 | 426 | 427 | RayLGBMRegressor.__init__.__doc__ = _treat_estimator_doc( 428 | RayLGBMRegressor.__init__.__doc__ 429 | ) 430 | -------------------------------------------------------------------------------- /lightgbm_ray/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ray-project/lightgbm_ray/4c4d3413f86db769bddb6d08e2480a04bc75d712/lightgbm_ray/tests/__init__.py -------------------------------------------------------------------------------- /lightgbm_ray/tests/env_info.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # shellcheck disable=SC2005 3 | 4 | echo "Test environment information" 5 | echo "----------------------------" 6 | echo "Python version: $(python --version 2>/dev/null || echo 'Python not installed')" 7 | echo "Ray version: $(ray --version 2>/dev/null || echo 'Ray not installed')" 8 | echo "Installed pip packages:" 9 | echo "$(python -m pip freeze 2>/dev/null || echo 'Pip not installed')" 10 | echo "----------------------------" 11 | -------------------------------------------------------------------------------- /lightgbm_ray/tests/release/benchmark_cpu_gpu.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import os 4 | import shutil 5 | import time 6 | 7 | import ray 8 | from xgboost_ray.tests.utils import create_parquet_in_tempdir 9 | 10 | from lightgbm_ray import ( 11 | RayDeviceQuantileDMatrix, 12 | RayDMatrix, 13 | RayFileType, 14 | RayParams, 15 | train, 16 | ) 17 | 18 | if "OMP_NUM_THREADS" in os.environ: 19 | del os.environ["OMP_NUM_THREADS"] 20 | 21 | 22 | def train_ray( 23 | path, 24 | num_workers, 25 | num_boost_rounds, 26 | num_files=0, 27 | regression=False, 28 | use_gpu=False, 29 | smoke_test=False, 30 | ray_params=None, 31 | lightgbm_params=None, 32 | **kwargs, 33 | ): 34 | if num_files: 35 | files = sorted(glob.glob(f"{path}/**/*.parquet")) 36 | while num_files > len(files): 37 | files = files + files 38 | path = files[0:num_files] 39 | 40 | use_device_matrix = False 41 | 42 | if use_device_matrix: 43 | dtrain = RayDeviceQuantileDMatrix( 44 | path, 45 | num_actors=num_workers, 46 | label="labels", 47 | ignore=["partition"], 48 | filetype=RayFileType.PARQUET, 49 | ) 50 | else: 51 | dtrain = RayDMatrix( 52 | path, 53 | num_actors=num_workers, 54 | label="labels", 55 | ignore=["partition"], 56 | filetype=RayFileType.PARQUET, 57 | ) 58 | 59 | config = lightgbm_params or {"device": "cpu" if not use_gpu else "gpu"} 60 | if not regression: 61 | # Classification 62 | config.update( 63 | { 64 | "objective": "binary", 65 | "metric": ["binary_logloss", "binary_error"], 66 | } 67 | ) 68 | else: 69 | # Regression 70 | config.update( 71 | { 72 | "objective": "regression", 73 | "metric": ["l2", "rmse"], 74 | } 75 | ) 76 | 77 | start = time.time() 78 | evals_result = {} 79 | bst = train( 80 | config, 81 | dtrain, 82 | evals_result=evals_result, 83 | num_boost_round=num_boost_rounds, 84 | ray_params=ray_params 85 | or RayParams( 86 | max_actor_restarts=2, 87 | num_actors=num_workers, 88 | cpus_per_actor=4 if not smoke_test else 2, 89 | gpus_per_actor=0 if not use_gpu else 1, 90 | ), 91 | evals=[(dtrain, "train")], 92 | **kwargs, 93 | ) 94 | taken = time.time() - start 95 | print(f"TRAIN TIME TAKEN: {taken:.2f} seconds") 96 | 97 | bst.booster_.save_model("benchmark_{}.lgbm".format("cpu" if not 
use_gpu else "gpu")) 98 | print( 99 | "Final training error: {:.4f}".format( 100 | evals_result["train"]["binary_error" if not regression else "l2"][-1] 101 | ) 102 | ) 103 | return bst, taken 104 | 105 | 106 | if __name__ == "__main__": 107 | parser = argparse.ArgumentParser(description="Process some integers.") 108 | 109 | parser.add_argument("num_workers", type=int, help="num workers") 110 | parser.add_argument("num_rounds", type=int, help="num boost rounds") 111 | parser.add_argument("num_files", type=int, help="num files") 112 | 113 | parser.add_argument( 114 | "--file", default="/data/parted.parquet", type=str, help="data file" 115 | ) 116 | 117 | parser.add_argument( 118 | "--regression", action="store_true", default=False, help="regression" 119 | ) 120 | 121 | parser.add_argument("--gpu", action="store_true", default=False, help="gpu") 122 | 123 | parser.add_argument( 124 | "--smoke-test", action="store_true", default=False, help="smoke test" 125 | ) 126 | 127 | args = parser.parse_args() 128 | 129 | num_workers = args.num_workers 130 | num_boost_rounds = args.num_rounds 131 | num_files = args.num_files 132 | use_gpu = args.gpu 133 | 134 | temp_dir = None 135 | if args.smoke_test: 136 | temp_dir, path = create_parquet_in_tempdir( 137 | filename="smoketest.parquet", 138 | num_rows=args.num_workers * 500, 139 | num_features=4, 140 | num_classes=2, 141 | num_partitions=args.num_workers * 10, 142 | ) 143 | use_gpu = False 144 | else: 145 | path = args.file 146 | if not os.path.exists(path): 147 | raise ValueError( 148 | f"Benchmarking data not found: {path}." 149 | f"\nFIX THIS by running `python create_test_data.py` first." 150 | ) 151 | 152 | init_start = time.time() 153 | if args.smoke_test: 154 | ray.init(num_cpus=num_workers) 155 | else: 156 | ray.init(address="auto") 157 | init_taken = time.time() - init_start 158 | 159 | full_start = time.time() 160 | bst, train_taken = train_ray( 161 | path=path, 162 | num_workers=num_workers, 163 | num_boost_rounds=num_boost_rounds, 164 | num_files=num_files, 165 | regression=args.regression, 166 | use_gpu=use_gpu, 167 | smoke_test=args.smoke_test, 168 | ) 169 | full_taken = time.time() - full_start 170 | print(f"TOTAL TIME TAKEN: {full_taken:.2f} seconds " f"({init_taken:.2f} for init)") 171 | 172 | if args.smoke_test: 173 | shutil.rmtree(temp_dir, ignore_errors=True) 174 | else: 175 | with open("res.csv", "at") as fp: 176 | fp.writelines( 177 | [ 178 | ",".join( 179 | [ 180 | str(e) 181 | for e in [ 182 | num_workers, 183 | num_files, 184 | int(use_gpu), 185 | num_boost_rounds, 186 | init_taken, 187 | full_taken, 188 | train_taken, 189 | ] 190 | ] 191 | ) 192 | + "\n" 193 | ] 194 | ) 195 | -------------------------------------------------------------------------------- /lightgbm_ray/tests/release/cluster_cpu.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: lightgbm_ray_release_tests_cpu_{{env["NUM_WORKERS"] | default(0)}} 2 | 3 | max_workers: {{env["NUM_WORKERS"] | default(0)}} 4 | upscaling_speed: 9999 5 | 6 | idle_timeout_minutes: 15 7 | 8 | docker: 9 | image: anyscale/ray:nightly 10 | container_name: ray_container 11 | pull_before_run: true 12 | run_options: 13 | - --privileged 14 | 15 | provider: 16 | type: aws 17 | region: us-west-2 18 | availability_zone: us-west-2a 19 | cache_stopped_nodes: false 20 | 21 | available_node_types: 22 | cpu_4_ondemand: 23 | node_config: 24 | InstanceType: m5.xlarge 25 | resources: {"CPU": 4} 26 | min_workers: {{env["NUM_WORKERS"] | default(0)}} 27 | 
max_workers: {{env["NUM_WORKERS"] | default(0)}} 28 | 29 | auth: 30 | ssh_user: ubuntu 31 | 32 | head_node_type: cpu_4_ondemand 33 | worker_default_node_type: cpu_4_ondemand 34 | 35 | file_mounts_sync_continuously: false 36 | 37 | setup_commands: 38 | - pip install -U {{env["RAY_WHEEL"] | default("ray")}} 39 | - pip install dask pytest 40 | - pip install -U {{env["LIGHTGBM_RAY_PACKAGE"] | default("lightgbm_ray")}} 41 | -------------------------------------------------------------------------------- /lightgbm_ray/tests/release/cluster_gpu.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: lightgbm_ray_release_tests_gpu_{{env["NUM_WORKERS"] | default(0)}} 2 | 3 | max_workers: {{env["NUM_WORKERS"] | default(0)}} 4 | upscaling_speed: 9999 5 | 6 | idle_timeout_minutes: 15 7 | 8 | docker: 9 | image: anyscale/ray:nightly-gpu 10 | container_name: ray_container 11 | pull_before_run: true 12 | run_options: 13 | - --privileged 14 | 15 | provider: 16 | type: aws 17 | region: us-west-2 18 | availability_zone: us-west-2a 19 | cache_stopped_nodes: false 20 | 21 | available_node_types: 22 | gpu_4_ondemand: 23 | node_config: 24 | InstanceType: p2.xlarge 25 | resources: {"CPU": 4, "GPU": 1} 26 | min_workers: {{env["NUM_WORKERS"] | default(0)}} 27 | max_workers: {{env["NUM_WORKERS"] | default(0)}} 28 | 29 | auth: 30 | ssh_user: ubuntu 31 | 32 | head_node_type: gpu_4_ondemand 33 | worker_default_node_type: gpu_4_ondemand 34 | 35 | file_mounts: { 36 | "~/lightgbm_tests": "." 37 | } 38 | 39 | file_mounts_sync_continuously: false 40 | 41 | setup_commands: 42 | - pip uninstall -y lightgbm && pip install -U "lightgbm>=3.2.1" --install-option=--gpu 43 | - pip install -U pyarrow cupy-cuda101 44 | - pip install -U {{env["RAY_WHEEL"] | default("ray")}} 45 | - export LIGHTGBM_RAY_PACKAGE="{{env["LIGHTGBM_RAY_PACKAGE"] | default("lightgbm_ray")}}" && /bin/bash ~/lightgbm_tests/setup_lightgbm.sh 46 | -------------------------------------------------------------------------------- /lightgbm_ray/tests/release/create_learnable_data.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.datasets import make_classification, make_regression 7 | 8 | if __name__ == "__main__": 9 | if "OMP_NUM_THREADS" in os.environ: 10 | del os.environ["OMP_NUM_THREADS"] 11 | 12 | parser = argparse.ArgumentParser(description="Create fake data.") 13 | parser.add_argument("filename", type=str, default="/data/parted.parquet/") 14 | parser.add_argument( 15 | "-r", "--num-rows", required=False, type=int, default=1e8, help="num rows" 16 | ) 17 | parser.add_argument( 18 | "-p", 19 | "--num-partitions", 20 | required=False, 21 | type=int, 22 | default=100, 23 | help="num partitions", 24 | ) 25 | parser.add_argument( 26 | "-c", 27 | "--num-cols", 28 | required=False, 29 | type=int, 30 | default=4, 31 | help="num columns (features)", 32 | ) 33 | parser.add_argument( 34 | "-C", "--num-classes", required=False, type=int, default=2, help="num classes" 35 | ) 36 | parser.add_argument( 37 | "-s", "--seed", required=False, type=int, default=1234, help="random seed" 38 | ) 39 | parser.add_argument( 40 | "-T", 41 | "--target", 42 | required=False, 43 | type=float, 44 | default=0.8, 45 | help="target accuracy", 46 | ) 47 | 48 | args = parser.parse_args() 49 | 50 | seed = int(args.seed) 51 | np.random.seed(seed) 52 | 53 | num_rows = int(args.num_rows) 54 | num_cols = 
int(args.num_cols) 55 | num_classes = int(args.num_classes) 56 | target = float(args.target) 57 | 58 | if num_classes > 0: 59 | x, y = make_classification( 60 | n_samples=num_rows, 61 | n_features=num_cols, 62 | n_informative=num_cols // 2, 63 | n_redundant=num_cols // 10, 64 | n_repeated=0, 65 | n_classes=num_classes, 66 | n_clusters_per_class=2, 67 | flip_y=1 - target, 68 | random_state=seed, 69 | ) 70 | else: 71 | x, y = make_regression( 72 | n_samples=num_rows, 73 | n_features=num_cols, 74 | n_informative=num_cols // 2, 75 | n_targets=1, 76 | noise=0.1, 77 | random_state=seed, 78 | ) 79 | 80 | filename = args.filename 81 | num_partitions = args.num_partitions 82 | 83 | data = pd.DataFrame(x, columns=[f"feature_{i}" for i in range(num_cols)]) 84 | 85 | rows_per_partition = len(data) // num_partitions  # np.repeat requires integer repeats 86 | 87 | partition_arr = np.repeat(np.arange(num_partitions), repeats=rows_per_partition) 88 | if len(partition_arr) < len(data): 89 | # If this was not evenly divided, append 90 | missing = len(data) - len(partition_arr) 91 | partition_arr = np.append(partition_arr, np.arange(missing)) 92 | 93 | partition = pd.Series(partition_arr, copy=False, dtype=np.int32) 94 | 95 | data["labels"] = y 96 | data["partition"] = partition 97 | 98 | os.makedirs(filename, 0o755, exist_ok=True) 99 | 100 | # Write partition-wise to avoid OOM errors 101 | for i in range(num_partitions): 102 | part = data[partition_arr == i] 103 | part.to_parquet( 104 | filename, 105 | partition_cols=["partition"], 106 | engine="pyarrow", 107 | partition_filename_cb=lambda key: f"part_{key[0]}.parquet", 108 | ) 109 | -------------------------------------------------------------------------------- /lightgbm_ray/tests/release/create_test_data.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import numpy as np 5 | from xgboost_ray.tests.utils import create_parquet 6 | 7 | if __name__ == "__main__": 8 | if "OMP_NUM_THREADS" in os.environ: 9 | del os.environ["OMP_NUM_THREADS"] 10 | 11 | parser = argparse.ArgumentParser(description="Create fake data.") 12 | parser.add_argument( 13 | "filename", type=str, default="/data/parted.parquet/", help="output path" 14 | ) 15 | parser.add_argument( 16 | "-r", "--num-rows", required=False, type=int, default=1e8, help="num rows" 17 | ) 18 | parser.add_argument( 19 | "-p", 20 | "--num-partitions", 21 | required=False, 22 | type=int, 23 | default=100, 24 | help="num partitions", 25 | ) 26 | parser.add_argument( 27 | "-c", 28 | "--num-cols", 29 | required=False, 30 | type=int, 31 | default=4, 32 | help="num columns (features)", 33 | ) 34 | parser.add_argument( 35 | "-C", "--num-classes", required=False, type=int, default=2, help="num classes" 36 | ) 37 | parser.add_argument( 38 | "-s", "--seed", required=False, type=int, default=1234, help="random seed" 39 | ) 40 | 41 | args = parser.parse_args() 42 | 43 | np.random.seed(args.seed) 44 | create_parquet( 45 | args.filename, 46 | num_rows=int(args.num_rows), 47 | num_partitions=int(args.num_partitions), 48 | num_features=int(args.num_cols), 49 | num_classes=int(args.num_classes), 50 | ) 51 | -------------------------------------------------------------------------------- /lightgbm_ray/tests/release/custom_objective_metric.py: -------------------------------------------------------------------------------- 1 | import ray 2 | 3 | from lightgbm_ray.tests.test_lightgbm_api import LightGBMAPITest 4 | 5 | 6 | class LightGBMDistributedAPITest(LightGBMAPITest): 7 | def 
_init_ray(self): 8 | if not ray.is_initialized(): 9 | ray.init(address="auto") 10 | 11 | 12 | if __name__ == "__main__": 13 | import sys 14 | 15 | import pytest 16 | 17 | sys.exit(pytest.main(["-v", f"{__file__}::LightGBMDistributedAPITest"])) 18 | -------------------------------------------------------------------------------- /lightgbm_ray/tests/release/run_e2e_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ ! -f "./.anyscale.yaml" ]; then 4 | echo "Anyscale project not initialized. Please run 'anyscale init'" 5 | exit 1 6 | fi 7 | 8 | NOW=$(date +%s) 9 | export SESSION_NAME="lightgbm_ray_ci_gpu_${NOW}" 10 | export NUM_WORKERS=3 11 | export LIGHTGBM_RAY_PACKAGE="git+https://github.com/ray-project/lightgbm_ray.git@${GITHUB_SHA:-master}#lightgbm_ray" 12 | export NO_TMUX=1 13 | 14 | ./start_gpu_cluster.sh 15 | ./submit_cpu_gpu_benchmark.sh 4 100 100 --gpu --file /data/classification.parquet 16 | anyscale down "${SESSION_NAME}" 17 | -------------------------------------------------------------------------------- /lightgbm_ray/tests/release/setup_lightgbm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pip install pytest 4 | # Uninstall any existing lightgbm_ray installations 5 | pip uninstall -y lightgbm_ray || true 6 | 7 | # Install lightgbm_ray package 8 | pip install -U "${LIGHTGBM_RAY_PACKAGE:-lightgbm_ray}" 9 | 10 | # Create test dataset 11 | sudo mkdir -p /data || true 12 | sudo chown ray:1000 /data || true 13 | rm -rf /data/classification.parquet || true 14 | cp -R /tmp/ray_tmp_mount/lightgbm_tests ~/lightgbm_tests || echo "Copy failed" 15 | python ~/lightgbm_tests/create_test_data.py /data/classification.parquet --seed 1234 --num-rows 1000000 --num-cols 40 --num-partitions 100 --num-classes 2 16 | -------------------------------------------------------------------------------- /lightgbm_ray/tests/release/start_cpu_cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ ! -f "./.anyscale.yaml" ]; then 4 | echo "Anyscale project not initialized. Please run 'anyscale init'" 5 | exit 1 6 | fi 7 | 8 | export LIGHTGBM_RAY_PACKAGE="${LIGHTGBM_RAY_PACKAGE:-lightgbm_ray}" 9 | export NUM_WORKERS="${NUM_WORKERS:-3}" 10 | 11 | SESSION_NAME=${SESSION_NAME:-lightgbm_ray_release_cpu_$(date +%s)} 12 | 13 | echo "Starting CPU cluster with ${NUM_WORKERS} worker nodes (plus the head node)" 14 | echo "This will install lightgbm_ray using the following package: ${LIGHTGBM_RAY_PACKAGE}" 15 | 16 | CMD="anyscale up --cloud-name anyscale_default_cloud --config cluster_cpu.yaml ${SESSION_NAME}" 17 | 18 | echo "Running: ${CMD}" 19 | ${CMD} 20 | -------------------------------------------------------------------------------- /lightgbm_ray/tests/release/start_gpu_cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ ! -f "./.anyscale.yaml" ]; then 4 | echo "Anyscale project not initialized. 
Please run 'anyscale init'" 5 | exit 1 6 | fi 7 | 8 | export LIGHTGBM_RAY_PACKAGE="${LIGHTGBM_RAY_PACKAGE:-lightgbm_ray}" 9 | export NUM_WORKERS="${NUM_WORKERS:-3}" 10 | 11 | SESSION_NAME=${SESSION_NAME:-lightgbm_ray_release_gpu_$(date +%s)} 12 | 13 | echo "Starting GPU cluster with ${NUM_WORKERS} worker nodes (plus the head node)" 14 | echo "This will install lightgbm_ray using the following package: ${LIGHTGBM_RAY_PACKAGE}" 15 | 16 | CMD="anyscale up --cloud-name anyscale_default_cloud --config cluster_gpu.yaml ${SESSION_NAME}" 17 | 18 | echo "Running: ${CMD}" 19 | ${CMD} 20 | -------------------------------------------------------------------------------- /lightgbm_ray/tests/release/submit_cpu_gpu_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ ! -f "./.anyscale.yaml" ]; then 4 | echo "Anyscale project not initialized. Please run 'anyscale init'" 5 | exit 1 6 | fi 7 | 8 | ANYSCALE_CMD="python ~/lightgbm_tests/benchmark_cpu_gpu.py $*" 9 | 10 | SESSION_STR="" 11 | if [ -n "${SESSION_NAME}" ]; then 12 | SESSION_STR="--session-name ${SESSION_NAME}" 13 | fi 14 | 15 | TMUX="--tmux" 16 | if [ "${NO_TMUX}" = "1" ]; then 17 | TMUX="" 18 | fi 19 | 20 | CMD="anyscale exec ${TMUX} ${SESSION_STR} -- ${ANYSCALE_CMD}" 21 | 22 | echo "Running: ${CMD}" 23 | ${CMD} 24 | -------------------------------------------------------------------------------- /lightgbm_ray/tests/test_client.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | import ray 5 | from ray.util.client.ray_client_helpers import ray_start_client_server 6 | 7 | 8 | @pytest.fixture 9 | def start_client_server_4_cpus(): 10 | ray.init(num_cpus=4) 11 | with ray_start_client_server() as client: 12 | yield client 13 | 14 | 15 | @pytest.fixture 16 | def start_client_server_5_cpus(): 17 | ray.init(num_cpus=5) 18 | with ray_start_client_server() as client: 19 | yield client 20 | 21 | 22 | @pytest.fixture 23 | def start_client_server_5_cpus_modin(monkeypatch): 24 | monkeypatch.setenv("__MODIN_AUTOIMPORT_PANDAS__", "1") 25 | ray.init(num_cpus=5, runtime_env={"env_vars": {"__MODIN_AUTOIMPORT_PANDAS__": "1"}}) 26 | with ray_start_client_server() as client: 27 | yield client 28 | 29 | 30 | def test_simple_train(start_client_server_4_cpus): 31 | assert ray.util.client.ray.is_connected() 32 | from lightgbm_ray.examples.simple import main 33 | 34 | main(num_actors=2, cpus_per_actor=2) 35 | 36 | 37 | @pytest.mark.skipif(os.environ.get("TUNE", "0") != "1", reason="Skipping Tune tests") 38 | def test_simple_tune(start_client_server_4_cpus): 39 | assert ray.util.client.ray.is_connected() 40 | from lightgbm_ray.examples.simple_tune import main 41 | 42 | main(cpus_per_actor=2, num_actors=1, num_samples=4) 43 | 44 | 45 | def test_simple_dask(start_client_server_5_cpus): 46 | assert ray.util.client.ray.is_connected() 47 | from lightgbm_ray.examples.simple_dask import main 48 | 49 | main(cpus_per_actor=2, num_actors=2) 50 | 51 | 52 | def test_simple_modin(start_client_server_5_cpus_modin): 53 | assert ray.util.client.ray.is_connected() 54 | from lightgbm_ray.examples.simple_modin import main 55 | 56 | main(cpus_per_actor=2, num_actors=2) 57 | 58 | 59 | if __name__ == "__main__": 60 | import sys 61 | 62 | import pytest # noqa: F811 63 | 64 | sys.exit(pytest.main(["-v", __file__])) 65 | -------------------------------------------------------------------------------- /lightgbm_ray/tests/test_end_to_end.py: 
-------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import shutil 4 | import tempfile 5 | import unittest 6 | 7 | import lightgbm as lgbm 8 | import numpy as np 9 | import ray 10 | from ray.exceptions import RayActorError, RayTaskError 11 | from xgboost_ray.callback import DistributedCallback 12 | 13 | from lightgbm_ray import RayDMatrix, RayParams, RayShardingMode, predict, train 14 | from lightgbm_ray.main import RayXGBoostTrainingError 15 | 16 | # from sklearn.utils import shuffle 17 | 18 | logging.getLogger("lightgbm_ray.main").setLevel(logging.DEBUG) 19 | 20 | 21 | def _make_callback(tmpdir: str) -> DistributedCallback: 22 | class TestDistributedCallback(DistributedCallback): 23 | logdir = tmpdir 24 | 25 | def on_init(self, actor, *args, **kwargs): 26 | log_file = os.path.join(self.logdir, f"rank_{actor.rank}.log") 27 | actor.log_fp = open(log_file, "at") 28 | actor.log_fp.write(f"Actor {actor.rank}: Init\n") 29 | actor.log_fp.flush() 30 | 31 | def before_data_loading(self, actor, data, *args, **kwargs): 32 | actor.log_fp.write(f"Actor {actor.rank}: Before loading\n") 33 | actor.log_fp.flush() 34 | 35 | def after_data_loading(self, actor, data, *args, **kwargs): 36 | actor.log_fp.write(f"Actor {actor.rank}: After loading\n") 37 | actor.log_fp.flush() 38 | 39 | def before_train(self, actor, *args, **kwargs): 40 | actor.log_fp.write(f"Actor {actor.rank}: Before train\n") 41 | actor.log_fp.flush() 42 | 43 | def after_train(self, actor, result_dict, *args, **kwargs): 44 | actor.log_fp.write(f"Actor {actor.rank}: After train\n") 45 | actor.log_fp.flush() 46 | 47 | def before_predict(self, actor, *args, **kwargs): 48 | actor.log_fp.write(f"Actor {actor.rank}: Before predict\n") 49 | actor.log_fp.flush() 50 | 51 | def after_predict(self, actor, predictions, *args, **kwargs): 52 | actor.log_fp.write(f"Actor {actor.rank}: After predict\n") 53 | actor.log_fp.flush() 54 | 55 | return TestDistributedCallback() 56 | 57 | 58 | class LGBMRayEndToEndTest(unittest.TestCase): 59 | """In this test suite we validate Ray-LightGBM multi-class prediction. 60 | 61 | First, we validate that LightGBM is able to achieve 100% accuracy on 62 | a simple training task. 63 | 64 | Then we split the dataset into two halves. These halves don't have access 65 | to all relevant data, so they overfit on their respective data. I.e. the first 66 | half always predicts feature 2 -> label 2, while the second half always 67 | predicts feature 2 -> label 3. 68 | 69 | We then train using Ray LightGBM. 
Again both halves will be trained 70 | separately, but because the distributed tree learner aggregates histograms across actors, they should end up being 71 | able to achieve 100% accuracy, again.""" 72 | 73 | def setUp(self): 74 | repeat = 64 # Repeat data a couple of times for stability 75 | self.x = np.array( 76 | [ 77 | [1, 0, 0, 0], # Feature 0 -> Label 0 78 | [0, 1, 0, 0], # Feature 1 -> Label 1 79 | [0, 0, 1, 1], # Feature 2+3 -> Label 2 80 | [0, 0, 1, 0], # Feature 2+!3 -> Label 3 81 | ] 82 | * repeat 83 | ) 84 | self.y = np.array([0, 1, 2, 3] * repeat) 85 | 86 | # self.x, self.y = shuffle(self.x, self.y, random_state=1) 87 | 88 | self.params = { 89 | "boosting": "gbdt", 90 | "objective": "multiclass", 91 | "num_class": 4, 92 | "random_state": 1, 93 | "tree_learner": "data", 94 | "deterministic": True, 95 | } 96 | 97 | def tearDown(self): 98 | ray.shutdown() 99 | 100 | def testSingleTraining(self): 101 | """Test that LightGBM learns to predict full matrix""" 102 | dtrain = lgbm.Dataset(self.x, self.y) 103 | bst = lgbm.train(self.params, dtrain, num_boost_round=2) 104 | 105 | pred_y = np.argmax(bst.predict(self.x), axis=1) 106 | self.assertSequenceEqual(list(self.y), list(pred_y)) 107 | 108 | def testHalfTraining(self): 109 | """Test that LightGBM learns to predict half matrices individually""" 110 | x_first = self.x[::2] 111 | y_first = self.y[::2] 112 | 113 | x_second = self.x[1::2] 114 | y_second = self.y[1::2] 115 | 116 | # Test case: The first model only sees feature 2 --> label 2 117 | # and the second model only sees feature 2 --> label 3 118 | test_X = np.array([[0, 0, 1, 1], [0, 0, 1, 0]]) 119 | test_y_first = [2, 2] 120 | test_y_second = [3, 3] 121 | 122 | # First half 123 | dtrain = lgbm.Dataset(x_first, y_first) 124 | bst = lgbm.train(self.params, dtrain, num_boost_round=2) 125 | 126 | pred_y = np.argmax(bst.predict(x_first), axis=1) 127 | self.assertSequenceEqual(list(y_first), list(pred_y)) 128 | 129 | pred_test = np.argmax(bst.predict(test_X), axis=1) 130 | self.assertSequenceEqual(test_y_first, list(pred_test)) 131 | 132 | # Second half 133 | dtrain = lgbm.Dataset(x_second, y_second) 134 | bst = lgbm.train(self.params, dtrain, num_boost_round=2) 135 | 136 | pred_y = np.argmax(bst.predict(x_second), axis=1) 137 | self.assertSequenceEqual(list(y_second), list(pred_y)) 138 | 139 | pred_test = np.argmax(bst.predict(test_X), axis=1) 140 | self.assertSequenceEqual(test_y_second, list(pred_test)) 141 | 142 | def _testJointTraining(self, cpus_per_actor): 143 | ray.init(num_cpus=4, num_gpus=0, include_dashboard=False) 144 | 145 | bst = train( 146 | self.params, 147 | RayDMatrix(self.x, self.y, sharding=RayShardingMode.BATCH), 148 | num_boost_round=50, 149 | ray_params=RayParams(num_actors=2, cpus_per_actor=cpus_per_actor), 150 | ) 151 | 152 | self.assertEqual(bst.booster_.current_iteration(), 50) 153 | 154 | pred_y = bst.predict(self.x) 155 | pred_y = np.argmax(pred_y, axis=1) 156 | self.assertSequenceEqual(list(self.y), list(pred_y)) 157 | 158 | pred_y = predict( 159 | bst, 160 | RayDMatrix(self.x), 161 | ray_params=RayParams(num_actors=2, cpus_per_actor=cpus_per_actor), 162 | ) 163 | pred_y = np.argmax(pred_y, axis=1) 164 | self.assertSequenceEqual(list(self.y), list(pred_y)) 165 | 166 | pred_y = predict( 167 | bst.booster_, 168 | RayDMatrix(self.x), 169 | ray_params=RayParams(num_actors=2, cpus_per_actor=cpus_per_actor), 170 | ) 171 | pred_y = np.argmax(pred_y, axis=1) 172 | self.assertSequenceEqual(list(self.y), list(pred_y)) 173 | 174 | def testJointTraining(self): 175 | """Train with Ray. 
The data will be split, but the trees 176 | should be combined together and find the true model.""" 177 | return self._testJointTraining(cpus_per_actor=2) 178 | 179 | def testJointTrainingDefaultRayParams(self): 180 | """Train with Ray. The data will be split, but the trees 181 | should be combined together and find the true model.""" 182 | return self._testJointTraining(cpus_per_actor=0) 183 | 184 | def testCpusPerActorEqualTo1RaisesException(self): 185 | ray.init(num_cpus=4, num_gpus=0, include_dashboard=False) 186 | with self.assertRaisesRegex(ValueError, "cpus_per_actor is set to less than 2"): 187 | train( 188 | self.params, 189 | RayDMatrix(self.x, self.y), 190 | num_boost_round=50, 191 | ray_params=RayParams(num_actors=2, cpus_per_actor=1), 192 | ) 193 | 194 | def testBothEvalsAndValidSetsRaisesException(self): 195 | ray.init(num_cpus=4, num_gpus=0, include_dashboard=False) 196 | with self.assertRaisesRegex( 197 | ValueError, "Specifying both `evals` and `valid_sets` is ambiguous" 198 | ): 199 | data = (RayDMatrix(self.x, self.y),) 200 | train( 201 | self.params, 202 | data, 203 | num_boost_round=50, 204 | ray_params=RayParams(num_actors=2), 205 | evals=[(data, "eval")], 206 | valid_sets=[data], 207 | ) 208 | 209 | def testTrainPredict(self, init=True, remote=None, **ray_param_dict): 210 | """Train with evaluation and predict""" 211 | if init: 212 | ray.init(num_cpus=8, num_gpus=0, include_dashboard=False) 213 | 214 | dtrain = RayDMatrix(self.x, self.y, sharding=RayShardingMode.BATCH) 215 | 216 | params = self.params 217 | 218 | evals_result = {} 219 | bst = train( 220 | params, 221 | dtrain, 222 | num_boost_round=38, 223 | ray_params=RayParams( 224 | num_actors=2, 225 | cpus_per_actor=1, 226 | allow_less_than_two_cpus=True, 227 | **ray_param_dict, 228 | ), 229 | evals=[(dtrain, "dtrain")], 230 | evals_result=evals_result, 231 | _remote=remote, 232 | ) 233 | 234 | self.assertTrue("dtrain" in evals_result) 235 | 236 | evals_result = {} 237 | bst = train( 238 | params, 239 | dtrain, 240 | num_boost_round=38, 241 | ray_params=RayParams( 242 | num_actors=2, 243 | cpus_per_actor=1, 244 | allow_less_than_two_cpus=True, 245 | **ray_param_dict, 246 | ), 247 | valid_sets=[dtrain], 248 | valid_names=["dtrain"], 249 | evals_result=evals_result, 250 | _remote=remote, 251 | ) 252 | 253 | self.assertTrue("dtrain" in evals_result) 254 | 255 | x_mat = RayDMatrix(self.x) 256 | pred_y = predict( 257 | bst, 258 | x_mat, 259 | ray_params=RayParams( 260 | num_actors=2, 261 | cpus_per_actor=1, 262 | allow_less_than_two_cpus=True, 263 | **ray_param_dict, 264 | ), 265 | _remote=remote, 266 | ) 267 | 268 | self.assertEqual(pred_y.shape[1], len(np.unique(self.y))) 269 | pred_y = np.argmax(pred_y, axis=1) 270 | 271 | self.assertSequenceEqual(list(self.y), list(pred_y)) 272 | 273 | def testTrainPredictRemote(self): 274 | """Train with evaluation and predict in a remote call""" 275 | self.testTrainPredict(init=True, remote=True) 276 | 277 | def testTrainPredictClient(self): 278 | """Train with evaluation and predict in a client session""" 279 | if ray.__version__ <= "1.2.0": 280 | self.skipTest("Ray client mocks do not work in Ray <= 1.2.0") 281 | from ray.util.client.ray_client_helpers import ray_start_client_server 282 | 283 | # (yard1) this hangs when num_cpus=2 284 | ray.init(num_cpus=8, num_gpus=0, include_dashboard=False) 285 | self.assertFalse(ray.util.client.ray.is_connected()) 286 | with ray_start_client_server(): 287 | self.assertTrue(ray.util.client.ray.is_connected()) 288 | 289 | 
self.testTrainPredict(init=False, remote=None) 290 | 291 | def testDistributedCallbacksTrainPredict(self, init=True, remote=False): 292 | """Test distributed callbacks for train/predict""" 293 | tmpdir = tempfile.mkdtemp() 294 | test_callback = _make_callback(tmpdir) 295 | 296 | self.testTrainPredict( 297 | init=init, remote=remote, distributed_callbacks=[test_callback] 298 | ) 299 | rank_0_log_file = os.path.join(tmpdir, "rank_0.log") 300 | rank_1_log_file = os.path.join(tmpdir, "rank_1.log") 301 | self.assertTrue(os.path.exists(rank_1_log_file)) 302 | 303 | rank_0_log = open(rank_0_log_file, "rt").read() 304 | self.assertEqual( 305 | rank_0_log, 306 | "Actor 0: Init\n" 307 | "Actor 0: Before loading\n" 308 | "Actor 0: After loading\n" 309 | "Actor 0: Before train\n" 310 | "Actor 0: After train\n" 311 | "Actor 0: Init\n" 312 | "Actor 0: Before loading\n" 313 | "Actor 0: After loading\n" 314 | "Actor 0: Before train\n" 315 | "Actor 0: After train\n" 316 | "Actor 0: Init\n" 317 | "Actor 0: Before loading\n" 318 | "Actor 0: After loading\n" 319 | "Actor 0: Before predict\n" 320 | "Actor 0: After predict\n", 321 | ) 322 | shutil.rmtree(tmpdir) 323 | 324 | def testDistributedCallbacksTrainPredictClient(self): 325 | """Test distributed callbacks for train/predict via Ray client""" 326 | 327 | if ray.__version__ <= "1.2.0": 328 | self.skipTest("Ray client mocks do not work in Ray <= 1.2.0") 329 | from ray.util.client.ray_client_helpers import ray_start_client_server 330 | 331 | ray.init(num_cpus=8, num_gpus=0, include_dashboard=False) 332 | self.assertFalse(ray.util.client.ray.is_connected()) 333 | with ray_start_client_server(): 334 | self.assertTrue(ray.util.client.ray.is_connected()) 335 | 336 | self.testDistributedCallbacksTrainPredict(init=False, remote=None) 337 | 338 | def testFailPrintErrors(self): 339 | """Test that LightGBM training errors are propagated""" 340 | x = np.random.uniform(0, 1, size=(100, 4)) 341 | y = np.random.randint(0, 2, size=100) 342 | 343 | train_set = RayDMatrix(x, y) 344 | 345 | try: 346 | train( 347 | { 348 | **self.params, 349 | **{"num_class": 2, "metric": ["multi_logloss", "multi_error"]}, 350 | }, # This will error 351 | train_set, 352 | evals=[(train_set, "train")], 353 | ray_params=RayParams( 354 | num_actors=1, cpus_per_actor=2, max_actor_restarts=0 355 | ), 356 | ) 357 | except RuntimeError as exc: 358 | self.assertTrue(exc.__cause__) 359 | self.assertTrue(isinstance(exc.__cause__, RayActorError)) 360 | 361 | self.assertTrue(exc.__cause__.__cause__) 362 | self.assertTrue(isinstance(exc.__cause__.__cause__, RayTaskError)) 363 | 364 | self.assertTrue(exc.__cause__.__cause__.cause) 365 | self.assertTrue( 366 | isinstance(exc.__cause__.__cause__.cause, RayXGBoostTrainingError) 367 | ) 368 | 369 | self.assertIn( 370 | "label and prediction size not match", str(exc.__cause__.__cause__) 371 | ) 372 | 373 | 374 | class LGBMRayEndToEndTestVoting(LGBMRayEndToEndTest): 375 | def setUp(self): 376 | super().setUp() 377 | self.params["tree_learner"] = "voting" 378 | 379 | 380 | if __name__ == "__main__": 381 | import sys 382 | 383 | import pytest 384 | 385 | sys.exit(pytest.main(["-v", __file__])) 386 | -------------------------------------------------------------------------------- /lightgbm_ray/tests/test_fault_tolerance.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import tempfile 4 | import time 5 | import unittest 6 | from unittest.mock import DEFAULT, patch 7 | 8 | import lightgbm 9 | 
import numpy as np 10 | import ray 11 | from lightgbm import LGBMModel 12 | from sklearn.utils import shuffle 13 | from xgboost_ray.session import get_actor_rank, put_queue 14 | from xgboost_ray.tests.utils import flatten_obj 15 | 16 | from lightgbm_ray import RayDMatrix, RayParams, train 17 | 18 | 19 | def get_num_trees(model_or_booster): 20 | if isinstance(model_or_booster, LGBMModel): 21 | return model_or_booster.booster_.current_iteration() 22 | return model_or_booster.current_iteration() 23 | 24 | 25 | def _kill_callback(die_lock_file: str, actor_rank: int = 0, fail_iteration: int = 6): 26 | """Returns a callback to kill an actor process. 27 | 28 | Args: 29 | die_lock_file: A file lock used to prevent race conditions 30 | when killing the actor. 31 | actor_rank: The rank of the actor to kill. 32 | fail_iteration: The iteration after which the actor is killed. 33 | 34 | """ 35 | 36 | def _callback(env): 37 | if get_actor_rank() == actor_rank: 38 | put_queue((env.iteration, time.time())) 39 | if ( 40 | get_actor_rank() == actor_rank 41 | and env.iteration == fail_iteration 42 | and not os.path.exists(die_lock_file) 43 | ): 44 | 45 | # Get PID 46 | pid = os.getpid() 47 | print(f"Killing process: {pid}") 48 | with open(die_lock_file, "wt") as fp: 49 | fp.write("") 50 | 51 | time.sleep(2) 52 | print(f"Testing: Rank {get_actor_rank()} will now die.") 53 | os.kill(pid, 9) 54 | 55 | _callback.order = 10 # type: ignore 56 | return _callback 57 | 58 | 59 | def _checkpoint_callback(frequency: int = 1, before_iteration_=False): 60 | """Returns a callback to checkpoint a model. 61 | 62 | Args: 63 | frequency: The interval at which checkpointing occurs. If 64 | frequency is set to n, checkpointing occurs every n iterations. 65 | before_iteration_: If True, checkpoint before the iteration 66 | begins. Else, checkpoint after the iteration ends. 67 | 68 | """ 69 | 70 | def _callback(env): 71 | if env.iteration % frequency == 0: 72 | put_queue(env.model.model_to_string()) 73 | 74 | _callback.before_iteration = before_iteration_ 75 | return _callback 76 | 77 | 78 | def _fail_callback(die_lock_file: str, actor_rank: int = 0, fail_iteration: int = 6): 79 | """Returns a callback to cause a LightGBM actor to fail training. 80 | 81 | Args: 82 | die_lock_file: A file lock used to prevent race conditions 83 | when causing the actor to fail. 84 | actor_rank: The rank of the actor to fail. 85 | fail_iteration: The iteration after which the training for 86 | the specified actor fails. 87 | 88 | """ 89 | 90 | def _callback(env): 91 | if get_actor_rank() == actor_rank: 92 | put_queue((env.iteration, time.time())) 93 | if ( 94 | get_actor_rank() == actor_rank 95 | and env.iteration == fail_iteration 96 | and not os.path.exists(die_lock_file) 97 | ): 98 | 99 | with open(die_lock_file, "wt") as fp: 100 | fp.write("") 101 | time.sleep(2) 102 | import sys 103 | 104 | print(f"Testing: Rank {get_actor_rank()} will now fail.") 105 | sys.exit(1) 106 | 107 | return _callback 108 | 109 | 110 | class LightGBMRayFaultToleranceTest(unittest.TestCase): 111 | """In this test suite we validate fault tolerance when a Ray actor dies. 112 | 113 | For this, we set up a callback that makes one worker die exactly once. 
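The kill callback writes a lock file the first time it fires, so the restarted actor is not killed again and training can resume from its last checkpoint.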
114 | """ 115 | 116 | def setUp(self): 117 | repeat = 64 # Repeat data a couple of times for stability 118 | self.x = np.array( 119 | [ 120 | [1, 0, 0, 0], # Feature 0 -> Label 0 121 | [0, 1, 0, 0], # Feature 1 -> Label 1 122 | [0, 0, 1, 1], # Feature 2+3 -> Label 2 123 | [0, 0, 1, 0], # Feature 2+!3 -> Label 3 124 | ] 125 | * repeat 126 | ) 127 | self.y = np.array([0, 1, 2, 3] * repeat) 128 | 129 | self.x, self.y = shuffle(self.x, self.y, random_state=1) 130 | 131 | self.params = { 132 | "nthread": 2, 133 | "max_depth": 2, 134 | "num_leaves": 2, 135 | "tree_learner": "data", 136 | "objective": "multiclass", 137 | "num_class": 4, 138 | "random_state": 1, 139 | "deterministic": True, 140 | "time_out": 1, 141 | } 142 | 143 | self.tmpdir = str(tempfile.mkdtemp()) 144 | 145 | self.die_lock_file = "/tmp/died_worker.lock" 146 | if os.path.exists(self.die_lock_file): 147 | os.remove(self.die_lock_file) 148 | 149 | self.die_lock_file_2 = "/tmp/died_worker_2.lock" 150 | if os.path.exists(self.die_lock_file_2): 151 | os.remove(self.die_lock_file_2) 152 | 153 | def tearDown(self) -> None: 154 | if os.path.exists(self.tmpdir): 155 | shutil.rmtree(self.tmpdir) 156 | ray.shutdown() 157 | 158 | if os.path.exists(self.die_lock_file): 159 | os.remove(self.die_lock_file) 160 | 161 | if os.path.exists(self.die_lock_file_2): 162 | os.remove(self.die_lock_file_2) 163 | 164 | def testTrainingContinuationKilled(self): 165 | """This should continue after one actor died.""" 166 | ray.init(num_cpus=4, num_gpus=0, log_to_driver=True) 167 | additional_results = {} 168 | keep_actors = {} 169 | 170 | def keep(actors, *args, **kwargs): 171 | keep_actors["actors"] = actors.copy() 172 | return DEFAULT 173 | 174 | with patch("lightgbm_ray.main._shutdown") as mocked: 175 | mocked.side_effect = keep 176 | bst = train( 177 | self.params, 178 | RayDMatrix(self.x, self.y), 179 | callbacks=[_kill_callback(self.die_lock_file)], 180 | num_boost_round=50, 181 | ray_params=RayParams( 182 | max_actor_restarts=1, num_actors=2, cpus_per_actor=2 183 | ), 184 | additional_results=additional_results, 185 | ) 186 | 187 | self.assertEqual(50, get_num_trees(bst)) 188 | 189 | pred_y = bst.predict(self.x) 190 | pred_y = np.argmax(pred_y, axis=1) 191 | self.assertSequenceEqual(list(self.y), list(pred_y)) 192 | print(f"Got correct predictions: {pred_y}") 193 | 194 | actors = keep_actors["actors"] 195 | # End with two working actors 196 | self.assertTrue(actors[0]) 197 | self.assertTrue(actors[1]) 198 | 199 | # Two workers finished, so N=64*4 200 | self.assertEqual(additional_results["total_n"], 64 * 4) 201 | 202 | def testTrainingStop(self): 203 | """This should now stop training after one actor died.""" 204 | # The `train()` function raises a RuntimeError 205 | ray.init(num_cpus=4, num_gpus=0, log_to_driver=True) 206 | with self.assertRaises(RuntimeError): 207 | train( 208 | self.params, 209 | RayDMatrix(self.x, self.y), 210 | callbacks=[_kill_callback(self.die_lock_file)], 211 | num_boost_round=20, 212 | ray_params=RayParams(max_actor_restarts=0, num_actors=2), 213 | ) 214 | 215 | def testCheckpointContinuationValidity(self): 216 | """Test that checkpoints are stored and loaded correctly""" 217 | 218 | ray.init(num_cpus=4, num_gpus=0, log_to_driver=True) 219 | # Train once, get checkpoint via callback returns 220 | res_1 = {} 221 | train( 222 | self.params, 223 | RayDMatrix(self.x, self.y), 224 | callbacks=[_checkpoint_callback(frequency=1, before_iteration_=False)], 225 | num_boost_round=2, 226 | ray_params=RayParams(num_actors=2, 
cpus_per_actor=2), 227 | additional_results=res_1, 228 | ) 229 | last_checkpoint_1 = res_1["callback_returns"][0][-1] 230 | 231 | lc1 = lightgbm.Booster(model_str=last_checkpoint_1) 232 | 233 | # Start new training run, starting from existing model 234 | res_2 = {} 235 | train( 236 | self.params, 237 | RayDMatrix(self.x, self.y), 238 | callbacks=[ 239 | _checkpoint_callback(frequency=1, before_iteration_=True), 240 | _checkpoint_callback(frequency=1, before_iteration_=False), 241 | ], 242 | num_boost_round=4, 243 | ray_params=RayParams(num_actors=2, cpus_per_actor=2), 244 | additional_results=res_2, 245 | init_model=lc1, 246 | ) 247 | first_checkpoint_2 = res_2["callback_returns"][0][0] 248 | last_checkpoint_2 = res_2["callback_returns"][0][-1] 249 | 250 | fcp_bst = lightgbm.Booster(model_str=first_checkpoint_2) 251 | 252 | lcp_bst = lightgbm.Booster(model_str=last_checkpoint_2) 253 | 254 | # Training should not have proceeded for the first checkpoint, 255 | # so trees should be equal 256 | self.assertEqual(lc1.current_iteration(), fcp_bst.current_iteration()) 257 | 258 | # Training should have proceeded for the last checkpoint, 259 | # so trees should not be equal 260 | self.assertNotEqual(fcp_bst.model_to_string(), lcp_bst.model_to_string()) 261 | 262 | def testSameResultWithAndWithoutError(self): 263 | """Get the same model with and without errors during training.""" 264 | 265 | ray.init(num_cpus=5, num_gpus=0, log_to_driver=True) 266 | # Run training 267 | print("test no error") 268 | bst_noerror = train( 269 | self.params, 270 | RayDMatrix(self.x, self.y), 271 | num_boost_round=10, 272 | ray_params=RayParams(max_actor_restarts=0, num_actors=2, cpus_per_actor=2), 273 | ) 274 | 275 | print("test part 1") 276 | bst_2part_1 = train( 277 | self.params, 278 | RayDMatrix(self.x, self.y), 279 | num_boost_round=5, 280 | ray_params=RayParams(max_actor_restarts=0, num_actors=2, cpus_per_actor=2), 281 | ) 282 | 283 | print("test part 2") 284 | bst_2part_2 = train( 285 | self.params, 286 | RayDMatrix(self.x, self.y), 287 | num_boost_round=5, 288 | ray_params=RayParams(max_actor_restarts=0, num_actors=2, cpus_per_actor=2), 289 | init_model=bst_2part_1, 290 | ) 291 | 292 | print("test error") 293 | res_error = {} 294 | bst_error = train( 295 | self.params, 296 | RayDMatrix(self.x, self.y), 297 | callbacks=[_fail_callback(self.die_lock_file, fail_iteration=7)], 298 | num_boost_round=10, 299 | ray_params=RayParams( 300 | max_actor_restarts=1, 301 | num_actors=2, 302 | checkpoint_frequency=5, 303 | cpus_per_actor=2, 304 | ), 305 | additional_results=res_error, 306 | ) 307 | 308 | self.assertEqual( 309 | bst_error.booster_.current_iteration(), 310 | bst_noerror.booster_.current_iteration(), 311 | ) 312 | self.assertEqual( 313 | bst_2part_2.booster_.current_iteration(), 314 | bst_noerror.booster_.current_iteration(), 315 | ) 316 | 317 | flat_noerror = flatten_obj({"tree": bst_noerror.booster_.dump_model()}) 318 | flat_error = flatten_obj({"tree": bst_error.booster_.dump_model()}) 319 | flat_2part = flatten_obj({"tree": bst_2part_2.booster_.dump_model()}) 320 | 321 | for key in flat_noerror: 322 | self.assertAlmostEqual(flat_noerror[key], flat_error[key], places=4) 323 | self.assertAlmostEqual(flat_noerror[key], flat_2part[key], places=4) 324 | 325 | # We fail at iteration 7, but checkpoints are saved at iteration 5 326 | # Thus we have two additional returns here. 
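# (Worked example: with num_boost_round=10 the callback fires once per boosting iteration, i.e. 10 queue returns in the error-free case; the failure after iteration 7 restarts training from the checkpoint saved at iteration 5, so iterations 6 and 7 run twice, giving 10 + 2 = 12 returns.)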
327 | print("Callback returns:", res_error["callback_returns"][0]) 328 | self.assertEqual(len(res_error["callback_returns"][0]), 10 + 2) 329 | 330 | 331 | if __name__ == "__main__": 332 | import sys 333 | 334 | import pytest 335 | 336 | sys.exit(pytest.main(["-v", __file__])) 337 | -------------------------------------------------------------------------------- /lightgbm_ray/tests/test_lightgbm.py: -------------------------------------------------------------------------------- 1 | """Tests for lightgbm-ray, based on lightgbm.dask tests""" 2 | 3 | # The MIT License (MIT) 4 | 5 | # Copyright (c) Microsoft Corporation 6 | 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | 14 | # The above copyright notice and this permission notice shall be included in 15 | # all copies or substantial portions of the Software. 16 | 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | 25 | # File based on: 26 | # https://github.com/microsoft/LightGBM/blob/c3b9363d02564625332583e166e3ab3135f436e3/tests/python_package_test/test_dask.py 27 | 28 | # License: 29 | # https://github.com/microsoft/LightGBM/blob/c3b9363d02564625332583e166e3ab3135f436e3/LICENSE 30 | 31 | import itertools 32 | import unittest 33 | 34 | import lightgbm as lgb 35 | import numpy as np 36 | import pandas as pd 37 | import ray 38 | import sklearn.utils.estimator_checks as sklearn_checks 39 | from parameterized import parameterized 40 | from sklearn.datasets import make_blobs, make_regression 41 | from sklearn.metrics import accuracy_score, r2_score 42 | from sklearn.model_selection import train_test_split 43 | from sklearn.utils import _safe_indexing 44 | 45 | from lightgbm_ray import RayDMatrix, RayParams, RayShardingMode 46 | from lightgbm_ray.sklearn import RayLGBMClassifier, RayLGBMRegressor 47 | 48 | data_output = [ 49 | "array", 50 | "dataframe", 51 | "dataframe-with-categorical", 52 | "raydmatrix-interleaved", # "raydmatrix-batch" 53 | ] 54 | data_output_local = [x for x in data_output if "raydmatrix" not in x] 55 | boosting_types = ["gbdt"] # "dart", "goss", "rf"] 56 | distributed_training_algorithms = ["data", "voting"] 57 | 58 | 59 | def sklearn_checks_to_run(): 60 | check_names = [ 61 | "check_estimator_get_tags_default_keys", 62 | "check_get_params_invariance", 63 | "check_set_params", 64 | ] 65 | checks = [] 66 | for check_name in check_names: 67 | check_func = getattr(sklearn_checks, check_name, None) 68 | if check_func: 69 | checks.append(check_func) 70 | return checks 71 | 72 | 73 | estimators_to_test = [RayLGBMClassifier, RayLGBMRegressor] 74 | 75 | 76 | def _create_data(objective, n_samples=2000, output="array", **kwargs): 77 | if 
objective.endswith("classification"): 78 | if objective == "binary-classification": 79 | centers = [[-4, -4], [4, 4]] 80 | elif objective == "multiclass-classification": 81 | centers = [[-4, -4], [4, 4], [-4, 4]] 82 | else: 83 | raise ValueError(f"Unknown classification task '{objective}'") 84 | X, y = make_blobs(n_samples=n_samples, centers=centers, random_state=42) 85 | elif objective == "regression": 86 | X, y = make_regression( 87 | n_samples=n_samples, n_features=4, n_informative=2, random_state=42 88 | ) 89 | # elif objective == "ranking": 90 | # return _create_ranking_data( 91 | # n_samples=n_samples, 92 | # output=output, 93 | # chunk_size=chunk_size, 94 | # **kwargs 95 | # ) 96 | else: 97 | raise ValueError(f"Unknown objective '{objective}'") 98 | rnd = np.random.RandomState(42) 99 | weights = rnd.random(X.shape[0]) * 0.01 100 | 101 | def convert_data(X, y, weights): 102 | if output == "array": 103 | dX = X 104 | dy = y 105 | dw = weights 106 | elif output.startswith("dataframe"): 107 | X_df = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])]) 108 | if output == "dataframe-with-categorical": 109 | num_cat_cols = 2 110 | for i in range(num_cat_cols): 111 | col_name = f"cat_col{i}" 112 | cat_values = rnd.choice(["a", "b"], X.shape[0]) 113 | cat_series = pd.Series(cat_values, dtype="category") 114 | X_df[col_name] = cat_series 115 | X = np.hstack((X, cat_series.cat.codes.values.reshape(-1, 1))) 116 | 117 | # make one categorical feature relevant to the target 118 | cat_col_is_a = X_df["cat_col0"] == "a" 119 | if objective == "regression": 120 | y = np.where(cat_col_is_a, y, 2 * y) 121 | elif objective == "binary-classification": 122 | y = np.where(cat_col_is_a, y, 1 - y) 123 | elif objective == "multiclass-classification": 124 | n_classes = 3 125 | y = np.where(cat_col_is_a, y, (1 + y) % n_classes) 126 | y_df = pd.Series(y, name="target") 127 | dX = X_df 128 | dy = y_df 129 | dw = pd.Series(weights) 130 | elif output.startswith("raydmatrix"): 131 | sharding = { 132 | "raydmatrix-interleaved": RayShardingMode.INTERLEAVED, 133 | "raydmatrix-batch": RayShardingMode.BATCH, 134 | } 135 | dX = RayDMatrix(X, y, weights, sharding=sharding[output]) 136 | dy = None 137 | dw = None 138 | else: 139 | raise ValueError(f"Unknown output type '{output}'") 140 | return dX, dy, dw 141 | 142 | train_idx, test_idx = train_test_split( 143 | np.arange(0, len(X)), 144 | test_size=0.5, 145 | stratify=y if objective.endswith("classification") else None, 146 | random_state=42, 147 | shuffle=True, 148 | ) 149 | 150 | if output.startswith("raydmatrix"): 151 | dX, dy, dw = convert_data(X[train_idx], y[train_idx], weights[train_idx]) 152 | dX_test, dy_test, dw_test = convert_data( 153 | X[test_idx], y[test_idx], weights[test_idx] 154 | ) 155 | else: 156 | dX, dy, dw = convert_data(X, y, weights) 157 | dX_test = _safe_indexing(dX, test_idx) 158 | dy_test = _safe_indexing(dy, test_idx) 159 | dw_test = _safe_indexing(dw, test_idx) 160 | dX = _safe_indexing(dX, train_idx) 161 | dy = _safe_indexing(dy, train_idx) 162 | dw = _safe_indexing(dw, train_idx) 163 | 164 | return ( 165 | X[train_idx], 166 | y[train_idx], 167 | weights[train_idx], 168 | None, 169 | dX, 170 | dy, 171 | dw, 172 | None, 173 | dX_test, 174 | dy_test, 175 | dw_test, 176 | ) 177 | 178 | 179 | class LGBMRayTest(unittest.TestCase): 180 | def setUp(self): 181 | self.ray_params = RayParams(num_actors=2, cpus_per_actor=2) 182 | 183 | def tearDown(self): 184 | ray.shutdown() 185 | 186 | @parameterized.expand( 187 | list( 188 | 
itertools.product( 189 | data_output, 190 | ["binary-classification", "multiclass-classification"], 191 | boosting_types, 192 | distributed_training_algorithms, 193 | ) 194 | ) 195 | ) 196 | def testClassifier(self, output, task, boosting_type, tree_learner): 197 | ray.init(num_cpus=4, num_gpus=0) 198 | 199 | print(output, task, boosting_type, tree_learner) 200 | 201 | X, y, w, _, dX, dy, dw, _, dX_test, dy_test, dw_test = _create_data( 202 | objective=task, output=output 203 | ) 204 | 205 | eval_weights = [dw_test] 206 | if dy_test is None: 207 | dy_test = "test" 208 | eval_weights = None 209 | eval_set = [(dX_test, dy_test)] 210 | 211 | if "raydmatrix" in output: 212 | lX = X 213 | ly = y 214 | lw = w 215 | else: 216 | lX = dX 217 | ly = dy 218 | lw = dw 219 | 220 | params = { 221 | "boosting_type": boosting_type, 222 | "tree_learner": tree_learner, 223 | "n_estimators": 50, 224 | "num_leaves": 31, 225 | "random_state": 1, 226 | "deterministic": True, 227 | } 228 | if boosting_type == "rf": 229 | params.update( 230 | { 231 | "bagging_freq": 1, 232 | "bagging_fraction": 0.9, 233 | } 234 | ) 235 | elif boosting_type == "goss": 236 | params["top_rate"] = 0.5 237 | 238 | ray_classifier = RayLGBMClassifier(**params) 239 | ray_classifier = ray_classifier.fit( 240 | dX, 241 | dy, 242 | sample_weight=dw, 243 | ray_params=self.ray_params, 244 | eval_set=eval_set, 245 | eval_sample_weight=eval_weights, 246 | ) 247 | ray_classifier = ray_classifier.fit( 248 | dX, 249 | dy, 250 | sample_weight=dw, 251 | ray_params=self.ray_params, 252 | ) 253 | p1 = ray_classifier.predict(dX, ray_params=self.ray_params) 254 | p1_proba = ray_classifier.predict_proba(dX, ray_params=self.ray_params) 255 | p1_pred_leaf = ray_classifier.predict( 256 | dX, pred_leaf=True, ray_params=self.ray_params 257 | ) 258 | p1_local = ray_classifier.to_local().predict(lX) 259 | s1 = accuracy_score(ly, p1) 260 | 261 | local_classifier = lgb.LGBMClassifier(**params) 262 | local_classifier.fit( 263 | lX, 264 | ly, 265 | sample_weight=lw, 266 | ) 267 | p2 = local_classifier.predict(lX) 268 | p2_proba = local_classifier.predict_proba(lX) 269 | s2 = local_classifier.score(lX, ly) 270 | 271 | if boosting_type == "rf": 272 | # https://github.com/microsoft/LightGBM/issues/4118 273 | self.assertTrue(np.allclose(s1, s2, atol=0.01)) 274 | self.assertTrue(np.allclose(p1_proba, p2_proba, atol=0.8)) 275 | else: 276 | self.assertTrue(np.allclose(s1, s2)) 277 | self.assertTrue(np.allclose(p1, p2)) 278 | self.assertTrue(np.allclose(p1, ly)) 279 | self.assertTrue(np.allclose(p2, ly)) 280 | self.assertTrue(np.allclose(p1_proba, p2_proba, atol=0.1)) 281 | self.assertTrue(np.allclose(p1_local, p2)) 282 | self.assertTrue(np.allclose(p1_local, ly)) 283 | 284 | # pref_leaf values should have the right shape 285 | # and values that look like valid tree nodes 286 | pred_leaf_vals = p1_pred_leaf 287 | assert pred_leaf_vals.shape == ( 288 | lX.shape[0], 289 | ray_classifier.booster_.num_trees(), 290 | ) 291 | assert np.max(pred_leaf_vals) <= params["num_leaves"] 292 | assert np.min(pred_leaf_vals) >= 0 293 | assert len(np.unique(pred_leaf_vals)) <= params["num_leaves"] 294 | 295 | # be sure LightGBM actually used at least one categorical column, 296 | # and that it was correctly treated as a categorical feature 297 | if output == "dataframe-with-categorical": 298 | cat_cols = [col for col in dX.columns if dX.dtypes[col].name == "category"] 299 | tree_df = ray_classifier.booster_.trees_to_dataframe() 300 | node_uses_cat_col = 
tree_df["split_feature"].isin(cat_cols) 301 | assert node_uses_cat_col.sum() > 0 302 | assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == "==" 303 | 304 | @parameterized.expand( 305 | list( 306 | itertools.product( 307 | data_output_local, 308 | ["binary-classification", "multiclass-classification"], 309 | ) 310 | ) 311 | ) 312 | def testClassifierEarlyStopping(self, output, task): 313 | ray.init(num_cpus=4, num_gpus=0) 314 | 315 | print(output, task) 316 | 317 | X, y, w, _, dX, dy, dw, _, dX_test, dy_test, dw_test = _create_data( 318 | objective=task, output=output 319 | ) 320 | 321 | eval_weights = [dw_test] 322 | if dy_test is None: 323 | dy_test = "test" 324 | eval_weights = None 325 | eval_set = [(dX_test, dy_test)] 326 | 327 | if "raydmatrix" in output: 328 | lX = X 329 | ly = y 330 | lw = w 331 | else: 332 | lX = dX 333 | ly = dy 334 | lw = dw 335 | 336 | n_estimators = 400 337 | params = { 338 | "n_estimators": n_estimators, 339 | "num_leaves": 31, 340 | "random_state": 1, 341 | "deterministic": True, 342 | } 343 | 344 | callbacks = [lgb.early_stopping(1)] 345 | 346 | ray_classifier = RayLGBMClassifier(**params) 347 | ray_classifier = ray_classifier.fit( 348 | dX, 349 | dy, 350 | sample_weight=dw, 351 | ray_params=self.ray_params, 352 | eval_set=eval_set, 353 | eval_sample_weight=eval_weights, 354 | callbacks=callbacks, 355 | ) 356 | 357 | self.assertLess( 358 | len(list(ray_classifier.evals_result_["valid_0"].values())[0]), n_estimators 359 | ) 360 | 361 | p1 = ray_classifier.predict(dX, ray_params=self.ray_params) 362 | p1_proba = ray_classifier.predict_proba(dX, ray_params=self.ray_params) 363 | p1_pred_leaf = ray_classifier.predict( 364 | dX, pred_leaf=True, ray_params=self.ray_params 365 | ) 366 | p1_local = ray_classifier.to_local().predict(lX) 367 | s1 = accuracy_score(ly, p1) 368 | 369 | local_classifier = lgb.LGBMClassifier(**params) 370 | local_classifier.fit( 371 | lX, 372 | ly, 373 | sample_weight=lw, 374 | eval_set=eval_set, 375 | eval_sample_weight=eval_weights, 376 | callbacks=callbacks, 377 | ) 378 | p2 = local_classifier.predict(lX) 379 | p2_proba = local_classifier.predict_proba(lX) 380 | s2 = local_classifier.score(lX, ly) 381 | 382 | self.assertTrue(np.allclose(s1, s2)) 383 | self.assertTrue(np.allclose(p1, p2)) 384 | self.assertTrue(np.allclose(p1, ly)) 385 | self.assertTrue(np.allclose(p2, ly)) 386 | self.assertTrue(np.allclose(p1_proba, p2_proba, atol=0.1)) 387 | self.assertTrue(np.allclose(p1_local, p2)) 388 | self.assertTrue(np.allclose(p1_local, ly)) 389 | 390 | # pref_leaf values should have the right shape 391 | # and values that look like valid tree nodes 392 | pred_leaf_vals = p1_pred_leaf 393 | assert pred_leaf_vals.shape == ( 394 | lX.shape[0], 395 | ray_classifier.booster_.num_trees(), 396 | ) 397 | assert np.max(pred_leaf_vals) <= params["num_leaves"] 398 | assert np.min(pred_leaf_vals) >= 0 399 | assert len(np.unique(pred_leaf_vals)) <= params["num_leaves"] 400 | 401 | # be sure LightGBM actually used at least one categorical column, 402 | # and that it was correctly treated as a categorical feature 403 | if output == "dataframe-with-categorical": 404 | cat_cols = [col for col in dX.columns if dX.dtypes[col].name == "category"] 405 | tree_df = ray_classifier.booster_.trees_to_dataframe() 406 | node_uses_cat_col = tree_df["split_feature"].isin(cat_cols) 407 | assert node_uses_cat_col.sum() > 0 408 | assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == "==" 409 | 410 | @parameterized.expand( 411 | list( 412 | 
itertools.product( 413 | data_output, 414 | ["binary-classification", "multiclass-classification"], 415 | ) 416 | ) 417 | ) 418 | def testClassifierPredContrib(self, output, task): 419 | ray.init(num_cpus=4, num_gpus=0) 420 | 421 | X, y, w, _, dX, dy, dw, _, dX_test, dy_test, dw_test = _create_data( 422 | objective=task, output=output 423 | ) 424 | 425 | params = { 426 | "n_estimators": 10, 427 | "num_leaves": 10, 428 | "random_state": 1, 429 | "deterministic": True, 430 | } 431 | 432 | ray_classifier = RayLGBMClassifier(tree_learner="data", **params) 433 | ray_classifier = ray_classifier.fit( 434 | dX, dy, sample_weight=dw, ray_params=self.ray_params 435 | ) 436 | preds_with_contrib = ray_classifier.predict( 437 | dX, pred_contrib=True, ray_params=self.ray_params 438 | ) 439 | 440 | local_classifier = lgb.LGBMClassifier(**params) 441 | if "raydmatrix" in output: 442 | lX = X 443 | ly = y 444 | lw = w 445 | else: 446 | lX = dX 447 | ly = dy 448 | lw = dw 449 | local_classifier.fit(lX, ly, sample_weight=lw) 450 | local_preds_with_contrib = local_classifier.predict(lX, pred_contrib=True) 451 | 452 | # be sure LightGBM actually used at least one categorical column, 453 | # and that it was correctly treated as a categorical feature 454 | if output == "dataframe-with-categorical": 455 | cat_cols = [col for col in dX.columns if dX.dtypes[col].name == "category"] 456 | tree_df = ray_classifier.booster_.trees_to_dataframe() 457 | node_uses_cat_col = tree_df["split_feature"].isin(cat_cols) 458 | assert node_uses_cat_col.sum() > 0 459 | assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == "==" 460 | 461 | # shape depends on whether it is binary or multiclass classification 462 | num_features = ray_classifier.n_features_ 463 | num_classes = ray_classifier.n_classes_ 464 | if num_classes == 2: 465 | expected_num_cols = num_features + 1 466 | else: 467 | expected_num_cols = (num_features + 1) * num_classes 468 | 469 | # * shape depends on whether it is binary or multiclass classification 470 | # * matrix for binary classification is of the form [feature_contrib, 471 | # base_value], 472 | # for multi-class it's [feat_contrib_class1, base_value_class1, 473 | # feat_contrib_class2, base_value_class2, etc.] 
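# * illustrative example (numbers not taken from this test's data): with 4 features and 3 classes the matrix has (4 + 1) * 3 = 15 columns, and the base value for class i sits at column num_features * (i + 1) + i, i.e. columns 4, 9 and 14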
474 | # * contrib outputs for distributed training are different than from 475 | # local training, so we can just test 476 | # that the output has the right shape and base values are in the 477 | # right position 478 | assert preds_with_contrib.shape[1] == expected_num_cols 479 | assert preds_with_contrib.shape == local_preds_with_contrib.shape 480 | 481 | if num_classes == 2: 482 | assert len(np.unique(preds_with_contrib[:, num_features])) == 1 483 | else: 484 | for i in range(num_classes): 485 | base_value_col = num_features * (i + 1) + i 486 | assert len(np.unique(preds_with_contrib[:, base_value_col])) == 1 487 | 488 | @parameterized.expand( 489 | list( 490 | itertools.product( 491 | data_output, 492 | boosting_types, 493 | distributed_training_algorithms, 494 | ) 495 | ) 496 | ) 497 | def testRegressor(self, output, boosting_type, tree_learner): 498 | ray.init(num_cpus=4, num_gpus=0) 499 | 500 | X, y, w, _, dX, dy, dw, _, dX_test, dy_test, dw_test = _create_data( 501 | objective="regression", output=output 502 | ) 503 | 504 | eval_weights = [dw_test] 505 | if dy_test is None: 506 | dy_test = "test" 507 | eval_weights = None 508 | eval_set = [(dX_test, dy_test)] 509 | 510 | if "raydmatrix" in output: 511 | lX = X 512 | ly = y 513 | lw = w 514 | else: 515 | lX = dX 516 | ly = dy 517 | lw = dw 518 | 519 | params = { 520 | "boosting_type": boosting_type, 521 | "random_state": 42, 522 | "num_leaves": 31, 523 | "n_estimators": 20, 524 | "deterministic": True, 525 | } 526 | if boosting_type == "rf": 527 | params.update( 528 | { 529 | "bagging_freq": 1, 530 | "bagging_fraction": 0.9, 531 | } 532 | ) 533 | 534 | ray_regressor = RayLGBMRegressor(tree=tree_learner, **params) 535 | ray_regressor = ray_regressor.fit( 536 | dX, 537 | dy, 538 | sample_weight=dw, 539 | ray_params=self.ray_params, 540 | eval_set=eval_set, 541 | eval_sample_weight=eval_weights, 542 | ) 543 | ray_regressor = ray_regressor.fit( 544 | dX, 545 | dy, 546 | sample_weight=dw, 547 | ray_params=self.ray_params, 548 | ) 549 | p1 = ray_regressor.predict(dX, ray_params=self.ray_params) 550 | p1_pred_leaf = ray_regressor.predict( 551 | dX, pred_leaf=True, ray_params=self.ray_params 552 | ) 553 | 554 | s1 = r2_score(ly, p1) 555 | p1_local = ray_regressor.to_local().predict(lX) 556 | s1_local = ray_regressor.to_local().score(lX, ly) 557 | 558 | local_regressor = lgb.LGBMRegressor(**params) 559 | local_regressor.fit( 560 | lX, 561 | ly, 562 | sample_weight=lw, 563 | ) 564 | s2 = local_regressor.score(lX, ly) 565 | p2 = local_regressor.predict(lX) 566 | 567 | # Scores should be the same 568 | self.assertTrue(np.allclose(s1, s2, atol=0.01)) 569 | self.assertTrue(np.allclose(s1, s1_local)) 570 | 571 | # Predictions should be roughly the same. 
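# (p1 is the distributed prediction across Ray actors, while p1_local comes from the same booster pulled to the driver via to_local(), so both should agree within np.allclose's default tolerances.)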
572 | self.assertTrue(np.allclose(p1, p1_local)) 573 | 574 | # pref_leaf values should have the right shape 575 | # and values that look like valid tree nodes 576 | pred_leaf_vals = p1_pred_leaf 577 | assert pred_leaf_vals.shape == (lX.shape[0], ray_regressor.booster_.num_trees()) 578 | assert np.max(pred_leaf_vals) <= params["num_leaves"] 579 | assert np.min(pred_leaf_vals) >= 0 580 | assert len(np.unique(pred_leaf_vals)) <= params["num_leaves"] 581 | 582 | self.assertTrue(np.allclose(p2, ly, rtol=0.5, atol=50.0)) 583 | self.assertTrue(np.allclose(p1, ly, rtol=0.5, atol=50.0)) 584 | 585 | # be sure LightGBM actually used at least one categorical column, 586 | # and that it was correctly treated as a categorical feature 587 | if output == "dataframe-with-categorical": 588 | cat_cols = [col for col in dX.columns if dX.dtypes[col].name == "category"] 589 | tree_df = ray_regressor.booster_.trees_to_dataframe() 590 | node_uses_cat_col = tree_df["split_feature"].isin(cat_cols) 591 | assert node_uses_cat_col.sum() > 0 592 | assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == "==" 593 | 594 | @parameterized.expand(data_output_local) 595 | def testRegressorEarlyStopping(self, output): 596 | ray.init(num_cpus=4, num_gpus=0) 597 | 598 | X, y, w, _, dX, dy, dw, _, dX_test, dy_test, dw_test = _create_data( 599 | objective="regression", output=output 600 | ) 601 | 602 | eval_weights = [dw_test] 603 | if dy_test is None: 604 | dy_test = "test" 605 | eval_weights = None 606 | eval_set = [(dX_test, dy_test)] 607 | 608 | if "raydmatrix" in output: 609 | lX = X 610 | ly = y 611 | lw = w 612 | else: 613 | lX = dX 614 | ly = dy 615 | lw = dw 616 | 617 | n_estimators = 400 618 | params = { 619 | "random_state": 42, 620 | "num_leaves": 31, 621 | "n_estimators": n_estimators, 622 | "deterministic": True, 623 | } 624 | 625 | callbacks = [lgb.early_stopping(1)] 626 | 627 | ray_regressor = RayLGBMRegressor(**params) 628 | ray_regressor = ray_regressor.fit( 629 | dX, 630 | dy, 631 | sample_weight=dw, 632 | ray_params=self.ray_params, 633 | eval_set=eval_set, 634 | eval_sample_weight=eval_weights, 635 | callbacks=callbacks, 636 | ) 637 | 638 | self.assertLess( 639 | len(list(ray_regressor.evals_result_["valid_0"].values())[0]), n_estimators 640 | ) 641 | 642 | p1 = ray_regressor.predict(dX, ray_params=self.ray_params) 643 | p1_pred_leaf = ray_regressor.predict( 644 | dX, pred_leaf=True, ray_params=self.ray_params 645 | ) 646 | 647 | s1 = r2_score(ly, p1) 648 | p1_local = ray_regressor.to_local().predict(lX) 649 | s1_local = ray_regressor.to_local().score(lX, ly) 650 | 651 | local_regressor = lgb.LGBMRegressor(**params) 652 | local_regressor.fit( 653 | lX, 654 | ly, 655 | sample_weight=lw, 656 | eval_set=eval_set, 657 | eval_sample_weight=eval_weights, 658 | callbacks=callbacks, 659 | ) 660 | s2 = local_regressor.score(lX, ly) 661 | p2 = local_regressor.predict(lX) 662 | 663 | # Scores should be the same 664 | self.assertTrue(np.allclose(s1, s2, atol=0.01)) 665 | self.assertTrue(np.allclose(s1, s1_local)) 666 | 667 | # Predictions should be roughly the same. 
668 |         self.assertTrue(np.allclose(p1, p1_local))
669 |
670 |         # pred_leaf values should have the right shape
671 |         # and values that look like valid tree nodes
672 |         pred_leaf_vals = p1_pred_leaf
673 |         assert pred_leaf_vals.shape == (lX.shape[0], ray_regressor.booster_.num_trees())
674 |         assert np.max(pred_leaf_vals) <= params["num_leaves"]
675 |         assert np.min(pred_leaf_vals) >= 0
676 |         assert len(np.unique(pred_leaf_vals)) <= params["num_leaves"]
677 |
678 |         self.assertTrue(np.allclose(p2, ly, rtol=0.5, atol=50.0))
679 |         self.assertTrue(np.allclose(p1, ly, rtol=0.5, atol=50.0))
680 |
681 |         # be sure LightGBM actually used at least one categorical column,
682 |         # and that it was correctly treated as a categorical feature
683 |         if output == "dataframe-with-categorical":
684 |             cat_cols = [col for col in dX.columns if dX.dtypes[col].name == "category"]
685 |             tree_df = ray_regressor.booster_.trees_to_dataframe()
686 |             node_uses_cat_col = tree_df["split_feature"].isin(cat_cols)
687 |             assert node_uses_cat_col.sum() > 0
688 |             assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == "=="
689 |
690 |     @parameterized.expand(data_output)
691 |     def testRegressorPredContrib(self, output):
692 |         ray.init(num_cpus=4, num_gpus=0)
693 |
694 |         X, y, w, _, dX, dy, dw, _, dX_test, dy_test, dw_test = _create_data(
695 |             objective="regression", output=output
696 |         )
697 |
698 |         if "raydmatrix" in output:
699 |             lX = X
700 |             ly = y
701 |             lw = w
702 |         else:
703 |             lX = dX
704 |             ly = dy
705 |             lw = dw
706 |
707 |         params = {
708 |             "n_estimators": 10,
709 |             "num_leaves": 10,
710 |             "random_state": 1,
711 |             "deterministic": True,
712 |         }
713 |
714 |         ray_regressor = RayLGBMRegressor(tree_learner="data", **params)
715 |         ray_regressor = ray_regressor.fit(
716 |             dX, dy, sample_weight=dw, ray_params=self.ray_params
717 |         )
718 |         preds_with_contrib = ray_regressor.predict(
719 |             dX, pred_contrib=True, ray_params=self.ray_params
720 |         )
721 |
722 |         local_regressor = lgb.LGBMRegressor(**params)
723 |         local_regressor.fit(lX, ly, sample_weight=lw)
724 |         local_preds_with_contrib = local_regressor.predict(lX, pred_contrib=True)
725 |
726 |         # contrib outputs for distributed training are different from those
727 |         # from local training, so we just test
728 |         # that the output has the right shape and base values are in
729 |         # the right position
730 |         num_features = lX.shape[1]
731 |         assert preds_with_contrib.shape[1] == num_features + 1
732 |         assert preds_with_contrib.shape == local_preds_with_contrib.shape
733 |
734 |         # be sure LightGBM actually used at least one categorical column,
735 |         # and that it was correctly treated as a categorical feature
736 |         if output == "dataframe-with-categorical":
737 |             cat_cols = [col for col in dX.columns if dX.dtypes[col].name == "category"]
738 |             tree_df = ray_regressor.booster_.trees_to_dataframe()
739 |             node_uses_cat_col = tree_df["split_feature"].isin(cat_cols)
740 |             assert node_uses_cat_col.sum() > 0
741 |             assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == "=="
742 |
743 |     @parameterized.expand(list(itertools.product(data_output, [0.1, 0.5, 0.9])))
744 |     def testRegressorQuantile(self, output, alpha):
745 |         ray.init(num_cpus=4, num_gpus=0)
746 |
747 |         X, y, w, _, dX, dy, dw, _, dX_test, dy_test, dw_test = _create_data(
748 |             objective="regression", output=output
749 |         )
750 |
751 |         params = {
752 |             "objective": "quantile",
753 |             "alpha": alpha,
754 |             "random_state": 42,
755 |             "n_estimators": 10,
756 |             "num_leaves": 10,
757 |             "deterministic": True,
758 |         }
759 |
760 |         if "raydmatrix" in output:
761 |             lX = X
762 |             ly = y
763 |             lw = w
764 |         else:
765 |             lX = dX
766 |             ly = dy
767 |             lw = dw
768 |
769 |         ray_regressor = RayLGBMRegressor(tree_learner_type="data_parallel", **params)
770 |         ray_regressor = ray_regressor.fit(
771 |             dX, dy, sample_weight=dw, ray_params=self.ray_params
772 |         )
773 |         p1 = ray_regressor.predict(dX, ray_params=self.ray_params)
774 |         q1 = np.count_nonzero(ly < p1) / ly.shape[0]
775 |
776 |         local_regressor = lgb.LGBMRegressor(**params)
777 |         local_regressor.fit(lX, ly, sample_weight=lw)
778 |         p2 = local_regressor.predict(lX)
779 |         q2 = np.count_nonzero(ly < p2) / ly.shape[0]
780 |
781 |         # Quantiles should be right
782 |         np.testing.assert_allclose(q1, alpha, atol=0.2)
783 |         np.testing.assert_allclose(q2, alpha, atol=0.2)
784 |
785 |         # be sure LightGBM actually used at least one categorical column,
786 |         # and that it was correctly treated as a categorical feature
787 |         if output == "dataframe-with-categorical":
788 |             cat_cols = [col for col in dX.columns if dX.dtypes[col].name == "category"]
789 |             tree_df = ray_regressor.booster_.trees_to_dataframe()
790 |             node_uses_cat_col = tree_df["split_feature"].isin(cat_cols)
791 |             assert node_uses_cat_col.sum() > 0
792 |             assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == "=="
793 |
794 |     @parameterized.expand(
795 |         list(
796 |             itertools.product(
797 |                 estimators_to_test,
798 |                 sklearn_checks_to_run(),
799 |             )
800 |         )
801 |     )
802 |     def testSklearnIntegration(self, estimator, check):
803 |         estimator = estimator()
804 |         estimator.set_params(local_listen_port=18000, time_out=5)
805 |         name = type(estimator).__name__
806 |         check(name, estimator)
807 |
808 |
809 | if __name__ == "__main__":
810 |     import sys
811 |
812 |     import pytest
813 |
814 |     sys.exit(pytest.main(["-v", __file__]))
815 |
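(Editor's note: the pred_contrib assertions in the file above rely on a general LightGBM invariant: with pred_contrib=True, each output row holds one contribution per feature plus a trailing base value, and contributions plus base value sum to the model's raw prediction. A minimal, local-only sketch of that invariant; the toy data and parameter values here are illustrative and not taken from this repo.)

import lightgbm as lgb
import numpy as np
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=200, n_features=5, random_state=0)
model = lgb.LGBMRegressor(n_estimators=10, num_leaves=10, random_state=0).fit(X, y)

contrib = model.predict(X, pred_contrib=True)
# One column per feature plus the base (expected) value in the last column.
assert contrib.shape == (X.shape[0], X.shape[1] + 1)
# The base value is constant across rows for single-output regression ...
assert len(np.unique(contrib[:, -1])) == 1
# ... and contributions plus base value reconstruct the prediction.
np.testing.assert_allclose(contrib.sum(axis=1), model.predict(X), rtol=1e-4, atol=1e-6)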
--------------------------------------------------------------------------------
/lightgbm_ray/tests/test_lightgbm_api.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from typing import Tuple
3 |
4 | import lightgbm
5 | import numpy as np
6 | import ray
7 | from lightgbm.basic import _ConfigAliases
8 | from lightgbm.callback import CallbackEnv
9 | from xgboost_ray.session import put_queue
10 |
11 | from lightgbm_ray import RayDMatrix, RayParams, RayShardingMode, train
12 | from lightgbm_ray.tune import _TuneLGBMRank0Mixin
13 |
14 |
15 | def gradient(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
16 |     return (np.log1p(y_pred) - np.log1p(y_true)) / (y_pred + 1)
17 |
18 |
19 | def hessian(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
20 |     return (-np.log1p(y_pred) + np.log1p(y_true) + 1) / np.power(y_pred + 1, 2)
21 |
22 |
23 | def squared_log(
24 |     y_true: np.ndarray, y_pred: np.ndarray
25 | ) -> Tuple[np.ndarray, np.ndarray]:
26 |     y_pred[y_pred < -1] = -1 + 1e-6
27 |     grad = gradient(y_pred, y_true)
28 |     hess = hessian(y_pred, y_true)
29 |     return grad, hess
30 |
31 |
32 | def rmsle(y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[str, float, bool]:
33 |     y_pred[y_pred < -1] = -1 + 1e-6
34 |     elements = np.power(np.log1p(y_true) - np.log1p(y_pred), 2)
35 |     return "PyRMSLE", float(np.sqrt(np.sum(elements) / len(y_true))), False
36 |
37 |
38 | class LightGBMAPITest(unittest.TestCase):
39 |     """This test suite validates core LightGBM API functionality."""
40 |
41 |     def setUp(self):
42 |         repeat = 128  # Repeat data a couple of times for stability
43 |         self.x = np.array(
44 |             [
45 |                 [1, 0, 0, 0],  # Feature 0 -> Label 0
46 |                 [0, 1, 0, 0],  # Feature 1 -> Label 1
47 |                 [0, 0, 1, 1],  # Feature 2+3 -> Label 0
48 |                 [0, 0, 1, 0],  # Feature 2+!3 -> Label 1
49 |             ]
50 |             * repeat
51 |         )
52 |         self.y = np.array([0, 1, 0, 1] * repeat)
53 |
54 |         self.params = {
55 |             "nthread": 2,
56 |             "objective": "binary",
57 |             "random_state": 1000,
58 |             "deterministic": True,
59 |         }
60 |
61 |         self.kwargs = {}
62 |
63 |     def tearDown(self) -> None:
64 |         ray.shutdown()
65 |
66 |     def _init_ray(self):
67 |         ray.init(num_cpus=4, num_gpus=0)
68 |
69 |     def testNumBoostRoundsValidation(self):
70 |         """Ensure that an exception is thrown if num_iterations is passed
71 |         as a parameter."""
72 |         self._init_ray()
73 |
74 |         for param_alias in _ConfigAliases.get("num_iterations"):
75 |             with self.assertRaisesRegex(ValueError, "num_boost_round"):
76 |                 params = self.params.copy()
77 |                 params[param_alias] = 10
78 |                 train(
79 |                     params,
80 |                     RayDMatrix(self.x, self.y, sharding=RayShardingMode.BATCH),
81 |                     ray_params=RayParams(num_actors=2),
82 |                     **self.kwargs,
83 |                 )
84 |
85 |     def testCustomObjectiveFunction(self):
86 |         """Ensure that custom objective functions work.
87 |
88 |         Runs a custom objective function with pure LightGBM and
89 |         LightGBM on Ray and compares the prediction outputs."""
90 |         self._init_ray()
91 |
92 |         params = self.params.copy()
93 |         params["objective"] = squared_log
94 |
95 |         model_lgbm = lightgbm.LGBMModel(**params).fit(self.x, self.y)
96 |
97 |         model_ray = train(
98 |             params,
99 |             RayDMatrix(self.x, self.y, sharding=RayShardingMode.BATCH),
100 |             ray_params=RayParams(num_actors=2),
101 |             num_boost_round=100,
102 |             **self.kwargs,
103 |         )
104 |
105 |         pred_y_lgbm = np.round(model_lgbm.predict(self.x))
106 |         pred_y_ray = np.round(model_ray.predict(self.x))
107 |
108 |         self.assertSequenceEqual(list(pred_y_lgbm), list(pred_y_ray))
109 |         self.assertSequenceEqual(list(self.y.astype(float)), list(pred_y_ray * -1))
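        # (Editor's note: `squared_log` above calls `gradient`/`hessian`
        # with swapped arguments, which flips the sign of the objective, so
        # the booster effectively learns the negative of the labels; the
        # `* -1` in the assertion above accounts for that.)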
110 |
111 |     def testCustomMetricFunction(self):
112 |         """Ensure that custom metric functions work.
113 |
114 |         Runs a custom metric function with pure LightGBM and
115 |         LightGBM on Ray and compares the evaluation results."""
116 |         self._init_ray()
117 |
118 |         params = self.params.copy()
119 |         params["objective"] = squared_log
120 |
121 |         model_lgbm = lightgbm.LGBMModel(**params).fit(
122 |             self.x,
123 |             self.y,
124 |             eval_metric=[rmsle],
125 |             eval_set=[(self.x, self.y)],
126 |             eval_names=["dtrain"],
127 |         )
128 |         evals_result_lgbm = model_lgbm.evals_result_
129 |
130 |         dtrain_ray = RayDMatrix(self.x, self.y, sharding=RayShardingMode.BATCH)
131 |         evals_result_ray = {}
132 |         train(
133 |             params,
134 |             dtrain_ray,
135 |             ray_params=RayParams(num_actors=2),
136 |             eval_metric=[rmsle],
137 |             evals=[(dtrain_ray, "dtrain")],
138 |             evals_result=evals_result_ray,
139 |             num_boost_round=100,
140 |             **self.kwargs,
141 |         )
142 |
143 |         print(evals_result_ray["dtrain"]["PyRMSLE"])
144 |         print(evals_result_lgbm["dtrain"]["PyRMSLE"])
145 |
146 |         self.assertTrue(
147 |             np.allclose(
148 |                 evals_result_lgbm["dtrain"]["PyRMSLE"],
149 |                 evals_result_ray["dtrain"]["PyRMSLE"],
150 |                 atol=0.1,
151 |             )
152 |         )
153 |
154 |     def testCallbacks(self):
155 |         self._init_ray()
156 |
157 |         class _Callback(_TuneLGBMRank0Mixin):
158 |             def __call__(self, env: CallbackEnv) -> None:
159 |                 print(f"My rank: {self.is_rank_0}")
160 |                 put_queue(("rank", self.is_rank_0))
161 |
162 |         callback = _Callback()
163 |
164 |         additional_results = {}
165 |         train(
166 |             self.params,
167 |             RayDMatrix(self.x, self.y),
168 |             ray_params=RayParams(num_actors=2),
169 |             callbacks=[callback],
170 |             additional_results=additional_results,
171 |             **self.kwargs,
172 |         )
173 |
174 |         self.assertEqual(len(additional_results["callback_returns"]), 2)
175 |         self.assertTrue(
176 |             all(rank is True for (_, rank) in additional_results["callback_returns"][0])
177 |         )
178 |         self.assertTrue(
179 |             all(
180 |                 rank is False for (_, rank) in additional_results["callback_returns"][1]
181 |             )
182 |         )
183 |
184 |
185 | if __name__ == "__main__":
186 |     import sys
187 |
188 |     import pytest
189 |
190 |     sys.exit(pytest.main(["-v", __file__]))
191 |
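(Editor's note: the `squared_log` objective in the file above implements the derivatives of the loss L(y, p) = 0.5 * (log1p(p) - log1p(y))**2, where `gradient` is dL/dp and `hessian` is d2L/dp2. A standalone finite-difference check of that correspondence, for illustration only:)

import numpy as np


def loss(y_true, y_pred):
    return 0.5 * (np.log1p(y_pred) - np.log1p(y_true)) ** 2


def gradient(y_true, y_pred):
    return (np.log1p(y_pred) - np.log1p(y_true)) / (y_pred + 1)


def hessian(y_true, y_pred):
    return (-np.log1p(y_pred) + np.log1p(y_true) + 1) / np.power(y_pred + 1, 2)


y_true, y_pred, eps = 3.0, 2.0, 1e-6
# Central differences of the loss and of the gradient should match the
# analytic gradient and hessian, respectively.
num_grad = (loss(y_true, y_pred + eps) - loss(y_true, y_pred - eps)) / (2 * eps)
num_hess = (gradient(y_true, y_pred + eps) - gradient(y_true, y_pred - eps)) / (2 * eps)
assert abs(num_grad - gradient(y_true, y_pred)) < 1e-8
assert abs(num_hess - hessian(y_true, y_pred)) < 1e-8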
--------------------------------------------------------------------------------
/lightgbm_ray/tests/test_tune.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | import tempfile
4 | import unittest
5 | from unittest.mock import MagicMock, patch
6 |
7 | import numpy as np
8 | import ray
9 | from ray import tune
10 | from ray.tune.integration.lightgbm import (
11 |     TuneReportCheckpointCallback as OrigTuneReportCheckpointCallback,
12 | )
13 |
14 | from lightgbm_ray import RayDMatrix, RayParams, RayShardingMode, train
15 | from lightgbm_ray.tune import TuneReportCheckpointCallback, _try_add_tune_callback
16 |
17 |
18 | class LightGBMRayTuneTest(unittest.TestCase):
19 |     def setUp(self):
20 |         repeat = 64  # Repeat data a couple of times for stability
21 |         x = np.array(
22 |             [
23 |                 [1, 0, 0, 0],  # Feature 0 -> Label 0
24 |                 [0, 1, 0, 0],  # Feature 1 -> Label 1
25 |                 [0, 0, 1, 1],  # Feature 2+3 -> Label 2
26 |                 [0, 0, 1, 0],  # Feature 2+!3 -> Label 3
27 |             ]
28 |             * repeat
29 |         )
30 |         y = np.array([0, 1, 2, 3] * repeat)
31 |
32 |         self.params = {
33 |             "lgbm": {
34 |                 "boosting": "gbdt",
35 |                 "objective": "multiclass",
36 |                 "num_class": 4,
37 |                 "random_state": 1,
38 |                 "tree_learner": "data",
39 |                 "metrics": ["multi_logloss", "multi_error"],
40 |             },
41 |             "num_boost_round": tune.choice([1, 3]),
42 |         }
43 |
44 |         def train_func(ray_params, callbacks=None):
45 |             def _inner_train(config):
46 |                 train_set = RayDMatrix(x, y, sharding=RayShardingMode.BATCH)
47 |                 train(
48 |                     config["lgbm"],
49 |                     dtrain=train_set,
50 |                     ray_params=ray_params,
51 |                     num_boost_round=config["num_boost_round"],
52 |                     evals=[(train_set, "train")],
53 |                     callbacks=callbacks,
54 |                 )
55 |
56 |             return _inner_train
57 |
58 |         self.train_func = train_func
59 |         self.experiment_dir = tempfile.mkdtemp()
60 |
61 |     def tearDown(self):
62 |         ray.shutdown()
63 |         shutil.rmtree(self.experiment_dir)
64 |
65 |     # noinspection PyTypeChecker
66 |     @patch.dict(os.environ, {"TUNE_RESULT_DELIM": "/"})
67 |     def testNumIters(self, init=True):
68 |         """Test that the number of reported tune results is correct"""
69 |         if init:
70 |             ray.init(num_cpus=8)
71 |         ray_params = RayParams(cpus_per_actor=2, num_actors=2)
72 |         params = self.params.copy()
73 |         params["num_boost_round"] = tune.grid_search([1, 3])
74 |         analysis = tune.run(
75 |             self.train_func(ray_params),
76 |             config=params,
77 |             resources_per_trial=ray_params.get_tune_resources(),
78 |             num_samples=1,
79 |         )
80 |
81 |         print(analysis.results_df.columns)
82 |         self.assertSequenceEqual(
83 |             list(analysis.results_df["training_iteration"]),
84 |             list(analysis.results_df["config/num_boost_round"]),
85 |         )
86 |
87 |     def testNumItersClient(self):
88 |         """Test ray client mode"""
89 |         ray.init(num_cpus=8)
90 |         if ray.__version__ <= "1.2.0":
91 |             self.skipTest("Ray client mocks do not work in Ray <= 1.2.0")
92 |
93 |         from ray.util.client.ray_client_helpers import ray_start_client_server
94 |
95 |         self.assertFalse(ray.util.client.ray.is_connected())
96 |         with ray_start_client_server():
97 |             self.assertTrue(ray.util.client.ray.is_connected())
98 |             self.testNumIters(init=False)
99 |
100 |     def testReplaceTuneCheckpoints(self):
101 |         """Test if ray.tune.integration.lightgbm callbacks are replaced"""
102 |         ray.init(num_cpus=4)
103 |
104 |         # Report and checkpointing callback
105 |         in_cp = [OrigTuneReportCheckpointCallback(metrics="met")]
106 |         in_dict = {"callbacks": in_cp}
107 |
108 |         with patch("ray.train.get_context") as mocked:
109 |             mocked.return_value = MagicMock(return_value=True)
110 |             _try_add_tune_callback(in_dict)
111 |
112 |         replaced = in_dict["callbacks"][0]
113 |         self.assertTrue(isinstance(replaced, TuneReportCheckpointCallback))
114 |
115 |         self.assertSequenceEqual(replaced._metrics, ["met"])
116 |
117 |     def testEndToEndCheckpointing(self):
118 |         ray.init(num_cpus=4)
119 |         ray_params = RayParams(cpus_per_actor=2, num_actors=1)
120 |         analysis = tune.run(
121 |             self.train_func(
122 |                 ray_params, callbacks=[TuneReportCheckpointCallback(frequency=1)]
123 |             ),
124 |             config=self.params,
125 |             resources_per_trial=ray_params.get_tune_resources(),
126 |             num_samples=1,
127 |             metric="train-multi_logloss",
128 |             mode="min",
129 |             log_to_file=True,
130 |             local_dir=self.experiment_dir,
131 |         )
132 |
133 |         self.assertTrue(os.path.exists(analysis.best_checkpoint.path))
134 |
135 |     def testEndToEndCheckpointingOrigTune(self):
136 |         ray.init(num_cpus=4)
137 |         ray_params = RayParams(cpus_per_actor=2, num_actors=1)
138 |         analysis = tune.run(
139 |             self.train_func(
140 |                 ray_params, callbacks=[OrigTuneReportCheckpointCallback(frequency=1)]
141 |             ),
142 |             config=self.params,
143 |             resources_per_trial=ray_params.get_tune_resources(),
144 |             num_samples=1,
145 |             metric="train-multi_logloss",
146 |             mode="min",
147 |             log_to_file=True,
148 |             local_dir=self.experiment_dir,
149 |         )
150 |
151 |         self.assertTrue(os.path.exists(analysis.best_checkpoint.path))
152 |
153 |
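# (Editor's note: the two end-to-end tests above are intentionally
# near-identical: they verify that checkpointing works both with the
# lightgbm_ray callback and with the original ray.tune callback, which
# `_try_add_tune_callback` replaces transparently at train time.)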
"__main__": 155 | import sys 156 | 157 | import pytest 158 | 159 | sys.exit(pytest.main(["-v", __file__])) 160 | -------------------------------------------------------------------------------- /lightgbm_ray/tune.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Dict 3 | 4 | import ray 5 | from lightgbm.basic import Booster 6 | from lightgbm.callback import CallbackEnv 7 | from ray.util.annotations import PublicAPI 8 | from xgboost_ray.session import put_queue 9 | from xgboost_ray.util import force_on_current_node 10 | 11 | try: 12 | import ray.train 13 | import ray.tune 14 | except (ImportError, ModuleNotFoundError) as e: 15 | raise RuntimeError( 16 | "Ray Train and Ray Tune are required dependencies of `lightgbm_ray.tune` " 17 | 'Please install with: `pip install "ray[train]"`' 18 | ) from e 19 | 20 | 21 | from ray.tune.integration.lightgbm import TuneReportCallback as OrigTuneReportCallback 22 | from ray.tune.integration.lightgbm import ( 23 | TuneReportCheckpointCallback as OrigTuneReportCheckpointCallback, 24 | ) 25 | 26 | 27 | class _TuneLGBMRank0Mixin: 28 | """Mixin to allow for dynamic setting of rank so that only 29 | one actor actually fires the callback""" 30 | 31 | @property 32 | def is_rank_0(self) -> bool: 33 | try: 34 | return self._is_rank_0 35 | except AttributeError: 36 | return True 37 | 38 | @is_rank_0.setter 39 | def is_rank_0(self, val: bool): 40 | self._is_rank_0 = val 41 | 42 | 43 | class TuneReportCheckpointCallback( 44 | _TuneLGBMRank0Mixin, OrigTuneReportCheckpointCallback 45 | ): 46 | def __call__(self, env: CallbackEnv): 47 | if self.is_rank_0: 48 | put_queue( 49 | lambda: super(TuneReportCheckpointCallback, self).__call__(env=env) 50 | ) 51 | 52 | 53 | class TuneReportCallback(_TuneLGBMRank0Mixin, OrigTuneReportCallback): 54 | def __new__(cls: type, *args, **kwargs): 55 | # TODO(justinvyu): [code_removal] Remove in Ray 2.11. 56 | raise DeprecationWarning( 57 | "`TuneReportCallback` is deprecated. " 58 | "Use `ray.tune.integration.lightgbm.TuneReportCheckpointCallback` instead." 59 | ) 60 | 61 | 62 | def _try_add_tune_callback(kwargs: Dict): 63 | ray_train_context_initialized = ( 64 | ray.train.get_context().get_trial_resources() is not None 65 | ) 66 | if ray_train_context_initialized: 67 | callbacks = kwargs.get("callbacks", []) or [] 68 | new_callbacks = [] 69 | has_tune_callback = False 70 | 71 | REPLACE_MSG = ( 72 | "Replaced `{orig}` with `{target}`. If you want to " 73 | "avoid this warning, pass `{target}` as a callback " 74 | "directly in your calls to `lightgbm_ray.train()`." 75 | ) 76 | 77 | for cb in callbacks: 78 | if isinstance(cb, TuneReportCheckpointCallback): 79 | has_tune_callback = True 80 | new_callbacks.append(cb) 81 | elif isinstance(cb, OrigTuneReportCheckpointCallback): 82 | orig_metrics = cb._metrics 83 | orig_frequency = cb._frequency 84 | 85 | replace_cb = TuneReportCheckpointCallback( 86 | metrics=orig_metrics, 87 | frequency=orig_frequency, 88 | ) 89 | new_callbacks.append(replace_cb) 90 | logging.warning( 91 | REPLACE_MSG.format( 92 | orig="ray.tune.integration.lightgbm." 
93 | "TuneReportCheckpointCallback", 94 | target="lightgbm_ray.tune.TuneReportCheckpointCallback", 95 | ) 96 | ) 97 | has_tune_callback = True 98 | else: 99 | new_callbacks.append(cb) 100 | 101 | if not has_tune_callback: 102 | new_callbacks.append(TuneReportCheckpointCallback(frequency=0)) 103 | 104 | kwargs["callbacks"] = new_callbacks 105 | return True 106 | else: 107 | return False 108 | 109 | 110 | @PublicAPI(stability="beta") 111 | def load_model(model_path): 112 | """Loads the model stored in the provided model_path. 113 | 114 | If using Ray Client, this will automatically handle loading the path on 115 | the server by using a Ray task. 116 | 117 | Returns: 118 | lightgbm.Booster object of the model stored in the provided model_path 119 | 120 | """ 121 | 122 | def load_model_fn(model_path): 123 | best_bst = Booster(model_file=model_path) 124 | return best_bst 125 | 126 | # Load the model checkpoint. 127 | if ray.util.client.ray.is_connected(): 128 | # If using Ray Client, the best model is saved on the server. 129 | # So we have to wrap the model loading in a ray task. 130 | remote_load = ray.remote(load_model_fn) 131 | remote_load = force_on_current_node(remote_load) 132 | bst = ray.get(remote_load.remote(model_path)) 133 | else: 134 | bst = load_model_fn(model_path) 135 | 136 | return bst 137 | -------------------------------------------------------------------------------- /lightgbm_ray/util.py: -------------------------------------------------------------------------------- 1 | import errno 2 | import gc 3 | import socket 4 | from contextlib import closing 5 | 6 | from lightgbm.basic import _LIB, _safe_call 7 | 8 | 9 | class lgbm_network_free: 10 | """Context to ensure LGBM_NetworkFree() is called 11 | (makes sure network is cleaned and ports are 12 | opened even if training fails).""" 13 | 14 | def __init__(self, model) -> None: 15 | self.model = model 16 | return 17 | 18 | def __enter__(self) -> None: 19 | return 20 | 21 | def __exit__(self, type, value, traceback): 22 | try: 23 | self.model._Booster.free_network() 24 | except Exception: 25 | pass 26 | _safe_call(_LIB.LGBM_NetworkFree()) 27 | # doesn't clean up properly without gc collect 28 | gc.collect() 29 | 30 | 31 | def find_free_port() -> int: 32 | """Find random free port.""" 33 | with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: 34 | s.bind(("", 0)) 35 | s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) 36 | return s.getsockname()[1] 37 | 38 | 39 | def is_port_free(port: int) -> bool: 40 | """Check if port is free""" 41 | with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: 42 | try: 43 | s.bind(("", port)) 44 | except socket.error as e: 45 | if e.errno == errno.EADDRINUSE: 46 | return False 47 | raise e 48 | return True 49 | -------------------------------------------------------------------------------- /requirements/lint-requirements.txt: -------------------------------------------------------------------------------- 1 | flake8==3.9.1 2 | flake8-comprehensions==3.10.1 3 | flake8-quotes==2.0.0 4 | flake8-bugbear==21.9.2 5 | black==22.10.0 6 | isort==5.10.1 7 | importlib-metadata==4.13.0 8 | -------------------------------------------------------------------------------- /requirements/test-requirements.txt: -------------------------------------------------------------------------------- 1 | packaging 2 | parameterized 3 | petastorm 4 | pytest 5 | pyarrow<15.0.0 6 | ray[tune, data, default] 7 | scikit-learn 8 | # modin==0.23.1.post0 is not compatible with lightgbm_ray py38 9 
| modin<=0.23.1; python_version == '3.8' 10 | # modin==0.26.0 is not compatible with lightgbm_ray py39+ 11 | modin<0.26.0; python_version > '3.8' 12 | git+https://github.com/ray-project/xgboost_ray.git 13 | 14 | #workaround for now 15 | protobuf<4.0.0 16 | tensorboardX==2.2 17 | -------------------------------------------------------------------------------- /run_ci_examples.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | TUNE=1 6 | 7 | for i in "$@" 8 | do 9 | echo "$i" 10 | case "$i" in 11 | --no-tune) 12 | TUNE=0 13 | ;; 14 | *) 15 | echo "unknown arg, $i" 16 | exit 1 17 | ;; 18 | esac 19 | done 20 | 21 | pushd lightgbm_ray/examples/ || exit 1 22 | ray stop || true 23 | echo "================" 24 | echo "Running examples" 25 | echo "================" 26 | echo "running readme.py" && python readme.py 27 | echo "running readme_sklearn_api.py" && python readme_sklearn_api.py 28 | echo "running simple.py" && python simple.py --smoke-test 29 | echo "running simple_predict.py" && python simple_predict.py 30 | echo "running simple_dask.py" && python simple_dask.py --smoke-test 31 | echo "running simple_modin.py" && python simple_modin.py --smoke-test 32 | echo "running simple_ray_dataset.py" && python simple_ray_dataset.py --smoke-test 33 | 34 | if [ "$TUNE" = "1" ]; then 35 | echo "running simple_tune.py" && python simple_tune.py --smoke-test 36 | else 37 | echo "skipping tune example" 38 | fi 39 | 40 | echo "running train_on_test_data.py" && python train_on_test_data.py --smoke-test 41 | popd 42 | 43 | pushd lightgbm_ray/tests 44 | echo "running examples with Ray Client" 45 | python -m pytest -v --durations=0 -x test_client.py 46 | popd || exit 1 -------------------------------------------------------------------------------- /run_ci_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TUNE=1 4 | 5 | for i in "$@" 6 | do 7 | echo "$i" 8 | case "$i" in 9 | --no-tune) 10 | TUNE=0 11 | ;; 12 | *) 13 | echo "unknown arg, $i" 14 | exit 1 15 | ;; 16 | esac 17 | done 18 | 19 | pushd lightgbm_ray/tests || exit 1 20 | echo "=============" 21 | echo "Running tests" 22 | echo "=============" 23 | END_STATUS=0 24 | if ! python -m pytest -v --durations=0 -x "test_lightgbm_api.py" ; then exit 1; fi 25 | if ! python -m pytest -v --durations=0 -x "test_end_to_end.py" ; then exit 1; fi 26 | if ! python -m pytest -v -s --durations=0 -x "test_fault_tolerance.py" ; then exit 1; fi 27 | if ! python -m pytest -v --durations=0 -x "test_lightgbm.py" ; then exit 1; fi 28 | 29 | if [ "$TUNE" = "1" ]; then 30 | if ! python -m pytest -v --durations=0 -x "test_tune.py" ; then exit 1; fi 31 | else 32 | echo "skipping tune tests" 33 | fi 34 | 35 | #echo "running smoke test on benchmark_cpu_gpu.py" && if ! 
python release/benchmark_cpu_gpu.py 2 10 20 --smoke-test; then END_STATUS=1; fi
36 | popd || exit 1
37 |
38 | if [ "$END_STATUS" = "1" ]; then
39 |     echo "At least one test has failed, exiting with code 1"
40 | fi
41 | exit "$END_STATUS"
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages, setup
2 |
3 | setup(
4 |     name="lightgbm_ray",
5 |     packages=find_packages(where=".", include=["lightgbm_ray*"]),
6 |     version="0.1.10",
7 |     author="Ray Team",
8 |     description="A Ray backend for distributed LightGBM",
9 |     license="Apache 2.0",
10 |     long_description="A distributed backend for LightGBM built on top of "
11 |     "the distributed computing framework Ray.",
12 |     url="https://github.com/ray-project/lightgbm_ray",
13 |     install_requires=["lightgbm>=3.2.1", "xgboost_ray>=0.1.12", "packaging"],
14 | )
15 |
--------------------------------------------------------------------------------
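(Editor's note: for orientation, a minimal end-to-end sketch of the package assembled above, using only the public API exercised in the tests; the synthetic data and parameter values are illustrative:)

import numpy as np
import ray

from lightgbm_ray import RayDMatrix, RayParams, train

ray.init(num_cpus=2)

# Toy binary classification data: label depends on the first feature.
x = np.random.rand(1000, 4)
y = (x[:, 0] > 0.5).astype(int)

model = train(
    {"objective": "binary", "random_state": 42},
    RayDMatrix(x, y),
    ray_params=RayParams(num_actors=2, cpus_per_actor=1),
    num_boost_round=10,
)
print(model.predict(x[:5]))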