├── .flake8 ├── .github └── workflows │ ├── gpu.yaml │ └── test.yaml ├── .gitignore ├── LICENSE ├── README.md ├── examples ├── format.sh ├── requirements ├── lint-requirements.txt └── test-requirements.txt ├── run_ci_examples.sh ├── run_ci_tests.sh ├── setup.py └── xgboost_ray ├── __init__.py ├── callback.py ├── compat ├── __init__.py └── tracker.py ├── data_sources ├── __init__.py ├── _distributed.py ├── csv.py ├── dask.py ├── data_source.py ├── modin.py ├── numpy.py ├── object_store.py ├── pandas.py ├── parquet.py ├── partitioned.py ├── petastorm.py └── ray_dataset.py ├── elastic.py ├── examples ├── __init__.py ├── create_test_data.py ├── higgs.py ├── higgs_parquet.py ├── readme.py ├── readme_sklearn_api.py ├── simple.py ├── simple_dask.py ├── simple_modin.py ├── simple_objectstore.py ├── simple_partitioned.py ├── simple_predict.py ├── simple_ray_dataset.py ├── simple_tune.py ├── train_on_test_data.py └── train_with_ml_dataset.py ├── main.py ├── matrix.py ├── session.py ├── sklearn.py ├── tests ├── __init__.py ├── conftest.py ├── env_info.sh ├── fault_tolerance.py ├── release │ ├── benchmark_cpu_gpu.py │ ├── benchmark_ft.py │ ├── cluster_cpu.yaml │ ├── cluster_ft.yaml │ ├── cluster_gpu.yaml │ ├── create_learnable_data.py │ ├── create_test_data.py │ ├── custom_objective_metric.py │ ├── run_e2e_gpu.sh │ ├── setup_xgboost.sh │ ├── start_cpu_cluster.sh │ ├── start_ft_cluster.sh │ ├── start_gpu_cluster.sh │ ├── submit_cpu_gpu_benchmark.sh │ ├── submit_ft_benchmark.sh │ ├── tune_cluster.yaml │ └── tune_placement.py ├── test_client.py ├── test_colocation.py ├── test_data_source.py ├── test_end_to_end.py ├── test_fault_tolerance.py ├── test_matrix.py ├── test_sklearn.py ├── test_sklearn_matrix.py ├── test_tune.py ├── test_xgboost_api.py └── utils.py ├── tune.py ├── util.py └── xgb.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | inline-quotes = " 4 | ignore = 5 | C408 6 | C417 7 | E121 8 | E123 9 | E126 10 | E203 11 | E226 12 | E24 13 | E704 14 | W503 15 | W504 16 | W605 17 | I 18 | N 19 | B001 20 | B002 21 | B003 22 | B004 23 | B005 24 | B007 25 | B008 26 | B009 27 | B010 28 | B011 29 | B012 30 | B013 31 | B014 32 | B015 33 | B016 34 | B017 35 | avoid-escape = no 36 | # Error E731 is ignored because of the migration from YAPF to Black. 37 | # See https://github.com/ray-project/ray/issues/21315 for more information. 
38 | per-file-ignores = 39 | rllib/evaluation/worker_set.py:E731 40 | rllib/evaluation/sampler.py:E731 41 | -------------------------------------------------------------------------------- /.github/workflows/gpu.yaml: -------------------------------------------------------------------------------- 1 | name: GPU on manual trigger 2 | 3 | on: 4 | workflow_dispatch 5 | 6 | jobs: 7 | test_gpu: 8 | runs-on: ubuntu-latest 9 | timeout-minutes: 20 10 | steps: 11 | - uses: actions/checkout@v3 12 | - name: Set up Python 3.8 13 | uses: actions/setup-python@v3 14 | with: 15 | python-version: 3.8 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | python -m pip install -U anyscale pyyaml 20 | - name: Print environment info 21 | run: | 22 | ./xgboost_ray/tests/env_info.sh 23 | - name: Set anyscale project 24 | env: 25 | ANYSCALE_PROJECT: ${{ secrets.ANYSCALE_PROJECT }} 26 | run: | 27 | echo "project_id: ${ANYSCALE_PROJECT}" > ./xgboost_ray/tests/release/.anyscale.yaml 28 | - name: Run end to end GPU test 29 | env: 30 | ANYSCALE_CLI_TOKEN: ${{ secrets.ANYSCALE_CLI_TOKEN }} 31 | run: | 32 | pushd ./xgboost_ray/tests/release 33 | ./run_e2e_gpu.sh 34 | popd || true 35 | -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: pytest on push 2 | 3 | on: 4 | push: 5 | pull_request: 6 | schedule: 7 | - cron: "0 5 * * *" 8 | 9 | jobs: 10 | test_lint: 11 | runs-on: ubuntu-latest 12 | timeout-minutes: 3 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Set up Python 3.8 16 | uses: actions/setup-python@v3 17 | with: 18 | python-version: 3.8 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | python -m pip install codecov 23 | if [ -f requirements/lint-requirements.txt ]; then python -m pip install -r requirements/lint-requirements.txt; fi 24 | - name: Print environment info 25 | run: | 26 | ./xgboost_ray/tests/env_info.sh 27 | - name: Run format script 28 | run: | 29 | ls -alp 30 | ./format.sh --all 31 | 32 | test_linux_ray_master: 33 | runs-on: ubuntu-latest 34 | timeout-minutes: 160 35 | strategy: 36 | matrix: 37 | python-version: ["3.8", "3.9", "3.10"] 38 | include: 39 | - python-version: "3.8" 40 | ray-wheel: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl 41 | - python-version: "3.9" 42 | ray-wheel: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp39-cp39-manylinux2014_x86_64.whl 43 | - python-version: "3.10" 44 | ray-wheel: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl 45 | steps: 46 | - uses: actions/checkout@v3 47 | - name: Set up Python ${{ matrix.python-version }} 48 | uses: actions/setup-python@v3 49 | with: 50 | python-version: ${{ matrix.python-version }} 51 | - name: Install dependencies 52 | run: | 53 | python -m pip install --upgrade pip 54 | python -m pip install codecov 55 | python -m pip install -U ${{ matrix.ray-wheel }} 56 | if [ -f requirements/test-requirements.txt ]; then python -m pip install -r requirements/test-requirements.txt; fi 57 | - name: Install package 58 | run: | 59 | python -m pip install -e . 
60 | - name: Print environment info 61 | run: | 62 | ./xgboost_ray/tests/env_info.sh 63 | - name: Run tests 64 | uses: nick-invision/retry@v2 65 | with: 66 | timeout_minutes: 45 67 | max_attempts: 3 68 | command: bash ./run_ci_tests.sh 69 | - name: Run examples 70 | uses: nick-invision/retry@v2 71 | with: 72 | timeout_minutes: 10 73 | max_attempts: 3 74 | command: bash ./run_ci_examples.sh 75 | 76 | test_linux_ray_release: 77 | runs-on: ubuntu-latest 78 | timeout-minutes: 160 79 | strategy: 80 | matrix: 81 | python-version: ["3.8", "3.9", "3.10"] 82 | steps: 83 | - uses: actions/checkout@v3 84 | - name: Set up Python ${{ matrix.python-version }} 85 | uses: actions/setup-python@v3 86 | with: 87 | python-version: ${{ matrix.python-version }} 88 | - name: Install dependencies 89 | run: | 90 | python -m pip install --upgrade pip 91 | python -m pip install codecov 92 | python -m pip install -U ray 93 | if [ -f requirements/test-requirements.txt ]; then python -m pip install -r requirements/test-requirements.txt; fi 94 | - name: Install package 95 | run: | 96 | python -m pip install -e . 97 | - name: Print environment info 98 | run: | 99 | ./xgboost_ray/tests/env_info.sh 100 | - name: Run tests 101 | uses: nick-invision/retry@v2 102 | with: 103 | timeout_minutes: 45 104 | max_attempts: 3 105 | command: bash ./run_ci_tests.sh 106 | - name: Run examples 107 | uses: nick-invision/retry@v2 108 | with: 109 | timeout_minutes: 10 110 | max_attempts: 3 111 | command: bash ./run_ci_examples.sh 112 | 113 | test_linux_compat: 114 | # Test compatibility when some optional libraries are missing 115 | # Test runs on latest ray release 116 | runs-on: ubuntu-latest 117 | timeout-minutes: 160 118 | strategy: 119 | matrix: 120 | python-version: ["3.8", "3.9", "3.10"] 121 | steps: 122 | - uses: actions/checkout@v3 123 | - name: Set up Python ${{ matrix.python-version }} 124 | uses: actions/setup-python@v3 125 | with: 126 | python-version: ${{ matrix.python-version }} 127 | - name: Install dependencies 128 | run: | 129 | python -m pip install --upgrade pip 130 | python -m pip install codecov 131 | python -m pip install -U ray 132 | if [ -f requirements/test-requirements.txt ]; then python -m pip install -r requirements/test-requirements.txt; fi 133 | - name: Uninstall unavailable dependencies 134 | # Disables modin and Ray Tune (via tabulate) 135 | run: | 136 | python -m pip uninstall -y modin 137 | python -m pip uninstall -y tabulate 138 | - name: Install package 139 | run: | 140 | python -m pip install -e . 141 | - name: Print environment info 142 | run: | 143 | ./xgboost_ray/tests/env_info.sh 144 | - name: Run tests 145 | uses: nick-invision/retry@v2 146 | with: 147 | timeout_minutes: 45 148 | max_attempts: 3 149 | command: bash ./run_ci_tests.sh --no-tune 150 | - name: Run examples 151 | uses: nick-invision/retry@v2 152 | with: 153 | timeout_minutes: 10 154 | max_attempts: 3 155 | command: bash ./run_ci_examples.sh --no-tune 156 | 157 | test_linux_cutting_edge: 158 | # Tests on cutting edge, i.e. 
latest Ray master, latest XGBoost master 159 | runs-on: ubuntu-latest 160 | timeout-minutes: 160 161 | strategy: 162 | matrix: 163 | # no new versions for xgboost are published for 3.6 164 | python-version: ["3.8", "3.9", "3.10"] 165 | include: 166 | - python-version: "3.8" 167 | ray-wheel: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl 168 | - python-version: "3.9" 169 | ray-wheel: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp39-cp39-manylinux2014_x86_64.whl 170 | - python-version: "3.10" 171 | ray-wheel: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl 172 | steps: 173 | - uses: actions/checkout@v3 174 | - name: Set up Python ${{ matrix.python-version }} 175 | uses: actions/setup-python@v3 176 | with: 177 | python-version: ${{ matrix.python-version }} 178 | - name: Install dependencies 179 | run: | 180 | python -m pip install --upgrade pip 181 | python -m pip install codecov 182 | python -m pip install -U ${{ matrix.ray-wheel }} 183 | if [ -f requirements/test-requirements.txt ]; then python -m pip install -r requirements/test-requirements.txt; fi 184 | - name: Install Ubuntu system dependencies 185 | run: | 186 | sudo apt-get install -y --no-install-recommends ninja-build 187 | - name: Install package 188 | run: | 189 | python -m pip install -e . 190 | - name: Clone XGBoost repo 191 | uses: actions/checkout@v3 192 | with: 193 | repository: dmlc/xgboost 194 | path: xgboost 195 | submodules: true 196 | - name: Install XGBoost from source 197 | shell: bash -l {0} 198 | run: | 199 | pushd ${GITHUB_WORKSPACE}/xgboost/python-package 200 | python --version 201 | python setup.py sdist 202 | pip install -v ./dist/xgboost-*.tar.gz 203 | popd 204 | - name: Print environment info 205 | run: | 206 | ./xgboost_ray/tests/env_info.sh 207 | - name: Run tests 208 | uses: nick-invision/retry@v2 209 | with: 210 | timeout_minutes: 45 211 | max_attempts: 3 212 | command: bash ./run_ci_tests.sh 213 | - name: Run examples 214 | uses: nick-invision/retry@v2 215 | with: 216 | timeout_minutes: 10 217 | max_attempts: 3 218 | command: bash ./run_ci_examples.sh 219 | 220 | test_linux_xgboost_legacy: 221 | # Tests on XGBoost 0.90 and latest Ray release 222 | runs-on: ubuntu-latest 223 | timeout-minutes: 160 224 | strategy: 225 | matrix: 226 | python-version: [3.8] 227 | steps: 228 | - uses: actions/checkout@v3 229 | - name: Set up Python ${{ matrix.python-version }} 230 | uses: actions/setup-python@v3 231 | with: 232 | python-version: ${{ matrix.python-version }} 233 | - name: Install dependencies 234 | run: | 235 | python -m pip install --upgrade pip 236 | python -m pip install codecov 237 | python -m pip install -U ray 238 | if [ -f requirements/test-requirements.txt ]; then python -m pip install -r requirements/test-requirements.txt; fi 239 | - name: Install package 240 | run: | 241 | python -m pip install -e . 
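      # Note (editorial): the xgboost==0.90 pin in the next step is installed
      # *after* the package itself, so it is not overridden by the newer
      # xgboost that `pip install -e .` resolves for the `xgboost>=0.90`
      # requirement.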
242 | - name: Install legacy XGBoost 243 | run: | 244 | python -m pip install xgboost==0.90 245 | - name: Print environment info 246 | run: | 247 | ./xgboost_ray/tests/env_info.sh 248 | - name: Run tests 249 | uses: nick-invision/retry@v2 250 | with: 251 | timeout_minutes: 45 252 | max_attempts: 3 253 | command: bash ./run_ci_tests.sh 254 | - name: Run examples 255 | uses: nick-invision/retry@v2 256 | with: 257 | timeout_minutes: 10 258 | max_attempts: 3 259 | command: bash ./run_ci_examples.sh 260 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python byte code files 2 | *.pyc 3 | python/.eggs 4 | 5 | # Backup files 6 | *.bak 7 | 8 | # Emacs temporary files 9 | *~ 10 | *# 11 | 12 | # Debug symbols 13 | *.pdb 14 | 15 | # Visual Studio files 16 | /packages 17 | *.suo 18 | *.user 19 | *.VC.db 20 | *.VC.opendb 21 | 22 | # Protobuf-generated files 23 | *_pb2.py 24 | *.pb.h 25 | *.pb.cc 26 | 27 | # Ray cluster configuration 28 | scripts/nodes.txt 29 | 30 | # OS X folder attributes 31 | .DS_Store 32 | 33 | # Debug files 34 | *.dSYM/ 35 | *.su 36 | 37 | # Python setup files 38 | *.egg-info 39 | 40 | # Compressed files 41 | *.gz 42 | 43 | # Datasets from examples 44 | **/MNIST_data/ 45 | **/cifar-10-batches-bin/ 46 | 47 | # Generated documentation files 48 | /doc/_build 49 | /doc/source/_static/thumbs 50 | /doc/source/tune/generated_guides/ 51 | 52 | # User-specific stuff: 53 | .idea/ 54 | 55 | # Pytest Cache 56 | **/.pytest_cache 57 | **/.cache 58 | .benchmarks 59 | python-driver-* 60 | 61 | # Vscode 62 | .vscode/ 63 | 64 | *.iml 65 | 66 | # python virtual env 67 | venv 68 | 69 | # pyenv version file 70 | .python-version 71 | 72 | # Vim 73 | .*.swp 74 | *.swp 75 | tags 76 | 77 | # Emacs 78 | .#* 79 | 80 | # tools 81 | tools/prometheus* 82 | 83 | # ray project files 84 | project-id 85 | .mypy_cache/ 86 | 87 | # XGBoost models from examples 88 | *.xgb 89 | 90 | # Downloaded test data 91 | *.csv 92 | *.csv.gz 93 | *.parquet 94 | 95 | # Byte-compiled files 96 | __pycache__/ -------------------------------------------------------------------------------- /examples: -------------------------------------------------------------------------------- 1 | xgboost_ray/examples -------------------------------------------------------------------------------- /format.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Black + Clang formatter (if installed). This script formats all changed files from the last mergebase. 3 | # You are encouraged to run this locally before pushing changes for review. 4 | 5 | # Cause the script to exit if a single command fails 6 | set -euo pipefail 7 | 8 | FLAKE8_VERSION_REQUIRED="3.9.1" 9 | BLACK_VERSION_REQUIRED="22.10.0" 10 | SHELLCHECK_VERSION_REQUIRED="0.7.1" 11 | ISORT_VERSION_REQUIRED="5.10.1" 12 | 13 | check_python_command_exist() { 14 | VERSION="" 15 | case "$1" in 16 | black) 17 | VERSION=$BLACK_VERSION_REQUIRED 18 | ;; 19 | flake8) 20 | VERSION=$FLAKE8_VERSION_REQUIRED 21 | ;; 22 | isort) 23 | VERSION=$ISORT_VERSION_REQUIRED 24 | ;; 25 | *) 26 | echo "$1 is not a required dependency" 27 | exit 1 28 | esac 29 | if ! [ -x "$(command -v "$1")" ]; then 30 | echo "$1 not installed. Install the python package with: pip install $1==$VERSION" 31 | exit 1 32 | fi 33 | } 34 | 35 | check_docstyle() { 36 | echo "Checking docstyle..." 
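  # The grep below flags docstring argument lines of the form
  # "name (Type): ...", i.e. pydoc args that redundantly repeat the type.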
37 | violations=$(git ls-files | grep '.py$' | xargs grep -E '^[ ]+[a-z_]+ ?\([a-zA-Z]+\): ' | grep -v 'str(' | grep -v noqa || true) 38 | if [[ -n "$violations" ]]; then 39 | echo 40 | echo "=== Found Ray docstyle violations ===" 41 | echo "$violations" 42 | echo 43 | echo "Per the Google pydoc style, omit types from pydoc args as they are redundant: https://docs.ray.io/en/latest/ray-contribute/getting-involved.html#code-style " 44 | echo "If this is a false positive, you can add a '# noqa' comment to the line to ignore." 45 | exit 1 46 | fi 47 | return 0 48 | } 49 | 50 | check_python_command_exist black 51 | check_python_command_exist flake8 52 | check_python_command_exist isort 53 | 54 | # this stops git rev-parse from failing if we run this from the .git directory 55 | builtin cd "$(dirname "${BASH_SOURCE:-$0}")" 56 | 57 | ROOT="$(git rev-parse --show-toplevel)" 58 | builtin cd "$ROOT" || exit 1 59 | 60 | # NOTE(edoakes): black version differs based on installation method: 61 | # Option 1) 'black, 21.12b0 (compiled: no)' 62 | # Option 2) 'black, version 21.12b0' 63 | # For newer versions (at least 22.10.0), a second line is printed which must be dropped: 64 | # 65 | # black, 22.10.0 (compiled: yes) 66 | # Python (CPython) 3.9.13 67 | BLACK_VERSION_STR=$(black --version) 68 | if [[ "$BLACK_VERSION_STR" == *"compiled"* ]] 69 | then 70 | BLACK_VERSION=$(echo "$BLACK_VERSION_STR" | head -n 1 | awk '{print $2}') 71 | else 72 | BLACK_VERSION=$(echo "$BLACK_VERSION_STR" | head -n 1 | awk '{print $3}') 73 | fi 74 | FLAKE8_VERSION=$(flake8 --version | head -n 1 | awk '{print $1}') 75 | ISORT_VERSION=$(isort --version | grep VERSION | awk '{print $2}') 76 | 77 | # params: tool name, tool version, required version 78 | tool_version_check() { 79 | if [ "$2" != "$3" ]; then 80 | echo "WARNING: Ray uses $1 $3, You currently are using $2. This might generate different results." 81 | fi 82 | } 83 | 84 | tool_version_check "flake8" "$FLAKE8_VERSION" "$FLAKE8_VERSION_REQUIRED" 85 | tool_version_check "black" "$BLACK_VERSION" "$BLACK_VERSION_REQUIRED" 86 | tool_version_check "isort" "$ISORT_VERSION" "$ISORT_VERSION_REQUIRED" 87 | 88 | if command -v shellcheck >/dev/null; then 89 | SHELLCHECK_VERSION=$(shellcheck --version | awk '/^version:/ {print $2}') 90 | tool_version_check "shellcheck" "$SHELLCHECK_VERSION" "$SHELLCHECK_VERSION_REQUIRED" 91 | else 92 | echo "INFO: Ray uses shellcheck for shell scripts, which is not installed. You may install shellcheck=$SHELLCHECK_VERSION_REQUIRED with your system package manager." 93 | fi 94 | 95 | if command -v clang-format >/dev/null; then 96 | CLANG_FORMAT_VERSION=$(clang-format --version | awk '{print $3}') 97 | tool_version_check "clang-format" "$CLANG_FORMAT_VERSION" "12.0.0" 98 | else 99 | echo "WARNING: clang-format is not installed!" 100 | fi 101 | 102 | if [[ $(flake8 --version) != *"flake8_quotes"* ]]; then 103 | echo "WARNING: Ray uses flake8 with flake8_quotes. Might error without it. Install with: pip install flake8-quotes" 104 | fi 105 | 106 | if [[ $(flake8 --version) != *"flake8-bugbear"* ]]; then 107 | echo "WARNING: Ray uses flake8 with flake8-bugbear. Might error without it. Install with: pip install flake8-bugbear" 108 | fi 109 | 110 | SHELLCHECK_FLAGS=( 111 | --exclude=1090 # "Can't follow non-constant source. Use a directive to specify location." 112 | --exclude=1091 # "Not following {file} due to some error" 113 | --exclude=2207 # "Prefer mapfile or read -a to split command output (or quote to avoid splitting)." 
-- these aren't compatible with macOS's old Bash 114 | ) 115 | 116 | 117 | BLACK_EXCLUDES=( 118 | '--force-exclude' 119 | 'python/ray/cloudpickle/*|'` 120 | `'python/build/*|'` 121 | `'python/ray/core/src/ray/gcs/*|'` 122 | `'python/ray/thirdparty_files/*|'` 123 | `'python/ray/_private/thirdparty/*|'` 124 | `'python/ray/serve/tests/test_config_files/syntax_error\.py' 125 | ) 126 | 127 | GIT_LS_EXCLUDES=( 128 | ':(exclude)python/ray/cloudpickle/' 129 | ':(exclude)python/ray/_private/runtime_env/_clonevirtualenv.py' 130 | ) 131 | 132 | # TODO(barakmich): This should be cleaned up. I've at least excised the copies 133 | # of these arguments to this location, but the long-term answer is to actually 134 | # make a flake8 config file 135 | FLAKE8_PYX_IGNORES="--ignore=C408,E121,E123,E126,E211,E225,E226,E227,E24,E704,E999,W503,W504,W605" 136 | 137 | shellcheck_scripts() { 138 | shellcheck "${SHELLCHECK_FLAGS[@]}" "$@" 139 | } 140 | 141 | # Format specified files 142 | format_files() { 143 | local shell_files=() python_files=() bazel_files=() 144 | 145 | local name 146 | for name in "$@"; do 147 | local base="${name%.*}" 148 | local suffix="${name#"${base}"}" 149 | 150 | local shebang="" 151 | read -r shebang < "${name}" || true 152 | case "${shebang}" in 153 | '#!'*) 154 | shebang="${shebang#/usr/bin/env }" 155 | shebang="${shebang%% *}" 156 | shebang="${shebang##*/}" 157 | ;; 158 | esac 159 | 160 | if [ "${base}" = "WORKSPACE" ] || [ "${base}" = "BUILD" ] || [ "${suffix}" = ".BUILD" ] || [ "${suffix}" = ".bazel" ] || [ "${suffix}" = ".bzl" ]; then 161 | bazel_files+=("${name}") 162 | elif [ -z "${suffix}" ] && [ "${shebang}" != "${shebang#python}" ] || [ "${suffix}" != "${suffix#.py}" ]; then 163 | python_files+=("${name}") 164 | elif [ -z "${suffix}" ] && [ "${shebang}" != "${shebang%sh}" ] || [ "${suffix}" != "${suffix#.sh}" ]; then 165 | shell_files+=("${name}") 166 | else 167 | echo "error: failed to determine file type: ${name}" 1>&2 168 | return 1 169 | fi 170 | done 171 | 172 | if [ 0 -lt "${#python_files[@]}" ]; then 173 | isort "${python_files[@]}" 174 | black "${python_files[@]}" 175 | fi 176 | 177 | if command -v shellcheck >/dev/null; then 178 | if shellcheck --shell=sh --format=diff - < /dev/null; then 179 | if [ 0 -lt "${#shell_files[@]}" ]; then 180 | local difference 181 | difference="$(shellcheck_scripts --format=diff "${shell_files[@]}" || true && printf "-")" 182 | difference="${difference%-}" 183 | printf "%s" "${difference}" | patch -p1 184 | fi 185 | else 186 | echo "error: this version of shellcheck does not support diffs" 187 | fi 188 | fi 189 | } 190 | 191 | format_all_scripts() { 192 | command -v flake8 &> /dev/null; 193 | HAS_FLAKE8=$? 194 | 195 | # Run isort before black to fix imports and let black deal with file format. 196 | echo "$(date)" "isort...." 197 | git ls-files -- '*.py' "${GIT_LS_EXCLUDES[@]}" | xargs -P 10 \ 198 | isort 199 | echo "$(date)" "Black...." 200 | git ls-files -- '*.py' "${GIT_LS_EXCLUDES[@]}" | xargs -P 10 \ 201 | black "${BLACK_EXCLUDES[@]}" 202 | if [ "$HAS_FLAKE8" -eq 0 ]; then 203 | echo "$(date)" "Flake8...." 
204 | git ls-files -- '*.py' "${GIT_LS_EXCLUDES[@]}" | xargs -P 5 \ 205 | flake8 --config=.flake8 206 | fi 207 | 208 | if command -v shellcheck >/dev/null; then 209 | local shell_files non_shell_files 210 | non_shell_files=($(git ls-files -- ':(exclude)*.sh')) 211 | shell_files=($(git ls-files -- '*.sh')) 212 | if [ 0 -lt "${#non_shell_files[@]}" ]; then 213 | shell_files+=($(git --no-pager grep -l -- '^#!\(/usr\)\?/bin/\(env \+\)\?\(ba\)\?sh' "${non_shell_files[@]}" || true)) 214 | fi 215 | if [ 0 -lt "${#shell_files[@]}" ]; then 216 | echo "$(date)" "shellcheck scripts...." 217 | shellcheck_scripts "${shell_files[@]}" 218 | fi 219 | fi 220 | } 221 | 222 | # Format files that differ from main branch. Ignores dirs that are not slated 223 | # for autoformat yet. 224 | format_changed() { 225 | # The `if` guard ensures that the list of filenames is not empty, which 226 | # could cause the formatter to receive 0 positional arguments, making 227 | # Black error. 228 | # 229 | # `diff-filter=ACRM` and $MERGEBASE is to ensure we only format files that 230 | # exist on both branches. 231 | MERGEBASE="$(git merge-base upstream/master HEAD)" 232 | 233 | if ! git diff --diff-filter=ACRM --quiet --exit-code "$MERGEBASE" -- '*.py' &>/dev/null; then 234 | git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.py' | xargs -P 5 \ 235 | isort 236 | fi 237 | 238 | if ! git diff --diff-filter=ACRM --quiet --exit-code "$MERGEBASE" -- '*.py' &>/dev/null; then 239 | git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.py' | xargs -P 5 \ 240 | black "${BLACK_EXCLUDES[@]}" 241 | if which flake8 >/dev/null; then 242 | git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.py' | xargs -P 5 \ 243 | flake8 --config=.flake8 244 | fi 245 | fi 246 | 247 | if ! git diff --diff-filter=ACRM --quiet --exit-code "$MERGEBASE" -- '*.pyx' '*.pxd' '*.pxi' &>/dev/null; then 248 | if which flake8 >/dev/null; then 249 | git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.pyx' '*.pxd' '*.pxi' | xargs -P 5 \ 250 | flake8 --config=.flake8 "$FLAKE8_PYX_IGNORES" 251 | fi 252 | fi 253 | 254 | if which clang-format >/dev/null; then 255 | if ! git diff --diff-filter=ACRM --quiet --exit-code "$MERGEBASE" -- '*.cc' '*.h' &>/dev/null; then 256 | git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.cc' '*.h' | xargs -P 5 \ 257 | clang-format -i 258 | fi 259 | fi 260 | 261 | if command -v shellcheck >/dev/null; then 262 | local shell_files non_shell_files 263 | non_shell_files=($(git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- ':(exclude)*.sh')) 264 | shell_files=($(git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.sh')) 265 | if [ 0 -lt "${#non_shell_files[@]}" ]; then 266 | shell_files+=($(git --no-pager grep -l -- '^#!\(/usr\)\?/bin/\(env \+\)\?\(ba\)\?sh' "${non_shell_files[@]}" || true)) 267 | fi 268 | if [ 0 -lt "${#shell_files[@]}" ]; then 269 | shellcheck_scripts "${shell_files[@]}" 270 | fi 271 | fi 272 | } 273 | 274 | # This flag formats individual files. --files *must* be the first command line 275 | # arg to use this option. 276 | if [ "${1-}" == '--files' ]; then 277 | format_files "${@:2}" 278 | # If `--all` or `--all-scripts` are passed, then any further arguments are ignored. 279 | # Format the entire python directory and other scripts. 280 | elif [ "${1-}" == '--all-scripts' ]; then 281 | format_all_scripts "${@}" 282 | if [ -n "${FORMAT_SH_PRINT_DIFF-}" ]; then git --no-pager diff; fi 283 | # Format all Python, C++, Java and other script files. 
284 | elif [ "${1-}" == '--all' ]; then 285 | format_all_scripts "${@}" 286 | if [ -n "${FORMAT_SH_PRINT_DIFF-}" ]; then git --no-pager diff; fi 287 | else 288 | # Add the upstream remote if it doesn't exist 289 | if ! git remote -v | grep -q upstream; then 290 | git remote add 'upstream' 'https://github.com/ray-project/xgboost_ray.git' 291 | fi 292 | 293 | # Only fetch master since that's the branch we're diffing against. 294 | git fetch upstream master || true 295 | 296 | # Format only the files that changed in last commit. 297 | format_changed 298 | fi 299 | 300 | check_docstyle 301 | 302 | if ! git diff --quiet &>/dev/null; then 303 | echo 'Reformatted changed files. Please review and stage the changes.' 304 | echo 'Files updated:' 305 | echo 306 | 307 | git --no-pager diff --name-only 308 | 309 | exit 1 310 | fi 311 | -------------------------------------------------------------------------------- /requirements/lint-requirements.txt: -------------------------------------------------------------------------------- 1 | flake8==3.9.1 2 | flake8-comprehensions==3.10.1 3 | flake8-quotes==2.0.0 4 | flake8-bugbear==21.9.2 5 | black==22.10.0 6 | isort==5.10.1 7 | importlib-metadata==4.13.0 8 | -------------------------------------------------------------------------------- /requirements/test-requirements.txt: -------------------------------------------------------------------------------- 1 | packaging 2 | petastorm 3 | pytest 4 | pyarrow<15.0.0 5 | ray[tune, data, default] 6 | scikit-learn 7 | # modin==0.23.1.post0 is not compatible with xgboost_ray py38 8 | modin<=0.23.1; python_version == '3.8' 9 | # modin==0.26.0 is not compatible with xgboost_ray py39+ 10 | modin<0.26.0; python_version > '3.8' 11 | dask 12 | 13 | #workaround for now 14 | protobuf<4.0.0 15 | tensorboardX==2.2 16 | -------------------------------------------------------------------------------- /run_ci_examples.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | TUNE=1 5 | 6 | for i in "$@" 7 | do 8 | echo "$i" 9 | case "$i" in 10 | --no-tune) 11 | TUNE=0 12 | ;; 13 | *) 14 | echo "unknown arg, $i" 15 | exit 1 16 | ;; 17 | esac 18 | done 19 | 20 | pushd xgboost_ray/examples/ || exit 1 21 | ray stop || true 22 | echo "================" 23 | echo "Running examples" 24 | echo "================" 25 | echo "running readme.py" && python readme.py 26 | echo "running readme_sklearn_api.py" && python readme_sklearn_api.py 27 | echo "running simple.py" && python simple.py --smoke-test 28 | echo "running simple_predict.py" && python simple_predict.py 29 | echo "running simple_dask.py" && python simple_dask.py --smoke-test 30 | echo "running simple_modin.py" && python simple_modin.py --smoke-test 31 | echo "running simple_objectstore.py" && python simple_objectstore.py --smoke-test 32 | echo "running simple_ray_dataset.py" && python simple_objectstore.py --smoke-test 33 | echo "running simple_partitioned.py" && python simple_partitioned.py --smoke-test 34 | 35 | if [ "$TUNE" = "1" ]; then 36 | echo "running simple_tune.py" && python simple_tune.py --smoke-test 37 | else 38 | echo "skipping tune example" 39 | fi 40 | 41 | echo "running train_on_test_data.py" && python train_on_test_data.py --smoke-test 42 | popd 43 | 44 | pushd xgboost_ray/tests 45 | echo "running examples with Ray Client" 46 | python -m pytest -v --durations=0 -x test_client.py 47 | popd || exit 1 48 | -------------------------------------------------------------------------------- /run_ci_tests.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | TUNE=1 3 | 4 | for i in "$@" 5 | do 6 | echo "$i" 7 | case "$i" in 8 | --no-tune) 9 | TUNE=0 10 | ;; 11 | *) 12 | echo "unknown arg, $i" 13 | exit 1 14 | ;; 15 | esac 16 | done 17 | 18 | pushd xgboost_ray/tests || exit 1 19 | echo "=============" 20 | echo "Running tests" 21 | echo "=============" 22 | END_STATUS=0 23 | if ! python -m pytest -vv -s --log-cli-level=DEBUG --durations=0 -x "test_colocation.py" ; then END_STATUS=1; fi 24 | if ! python -m pytest -v --durations=0 -x "test_matrix.py" ; then END_STATUS=1; fi 25 | if ! python -m pytest -v --durations=0 -x "test_data_source.py" ; then END_STATUS=1; fi 26 | if ! python -m pytest -v --durations=0 -x "test_xgboost_api.py" ; then END_STATUS=1; fi 27 | if ! python -m pytest -v --durations=0 -x "test_fault_tolerance.py" ; then END_STATUS=1; fi 28 | if ! python -m pytest -v --durations=0 -x "test_end_to_end.py" ; then END_STATUS=1; fi 29 | if ! python -m pytest -v --durations=0 -x "test_sklearn.py" ; then END_STATUS=1; fi 30 | if ! python -m pytest -v --durations=0 -x "test_sklearn_matrix.py" ; then END_STATUS=1; fi 31 | 32 | if [ "$TUNE" = "1" ]; then 33 | if ! python -m pytest -v --durations=0 -x "test_tune.py" ; then END_STATUS=1; fi 34 | else 35 | echo "skipping tune tests" 36 | fi 37 | 38 | echo "running smoke test on benchmark_cpu_gpu.py" && if ! python release/benchmark_cpu_gpu.py 2 10 20 --smoke-test; then END_STATUS=1; fi 39 | popd || exit 1 40 | 41 | if [ "$END_STATUS" = "1" ]; then 42 | echo "At least one test has failed, exiting with code 1" 43 | fi 44 | exit "$END_STATUS" -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name="xgboost_ray", 5 | packages=find_packages(where=".", include=["xgboost_ray*"]), 6 | version="0.1.20", 7 | author="Ray Team", 8 | description="A Ray backend for distributed XGBoost", 9 | license="Apache 2.0", 10 | long_description="A distributed backend for XGBoost built on top of " 11 | "the distributed computing framework Ray.", 12 | url="https://github.com/ray-project/xgboost_ray", 13 | install_requires=[ 14 | "ray>=2.7", 15 | "numpy>=1.16", 16 | "pandas", 17 | "wrapt>=1.12.1", 18 | "xgboost>=0.90", 19 | "packaging", 20 | ], 21 | ) 22 | -------------------------------------------------------------------------------- /xgboost_ray/__init__.py: -------------------------------------------------------------------------------- 1 | from xgboost_ray.main import RayParams, predict, train 2 | from xgboost_ray.matrix import ( 3 | Data, 4 | RayDeviceQuantileDMatrix, 5 | RayDMatrix, 6 | RayFileType, 7 | RayShardingMode, 8 | combine_data, 9 | ) 10 | 11 | # workaround for legacy xgboost==0.90 12 | try: 13 | from xgboost_ray.sklearn import ( 14 | RayXGBClassifier, 15 | RayXGBRanker, 16 | RayXGBRegressor, 17 | RayXGBRFClassifier, 18 | RayXGBRFRegressor, 19 | ) 20 | except ImportError: 21 | pass 22 | 23 | __version__ = "0.1.20" 24 | 25 | __all__ = [ 26 | "__version__", 27 | "RayParams", 28 | "RayDMatrix", 29 | "RayDeviceQuantileDMatrix", 30 | "RayFileType", 31 | "RayShardingMode", 32 | "Data", 33 | "combine_data", 34 | "train", 35 | "predict", 36 | "RayXGBClassifier", 37 | "RayXGBRegressor", 38 | "RayXGBRFClassifier", 39 | "RayXGBRFRegressor", 40 | "RayXGBRanker", 41 | ] 42 | 
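The exports above form the package's drop-in replacements for `xgboost.train` and `xgboost.DMatrix`. As orientation, a minimal usage sketch (mirroring the pattern in `xgboost_ray/examples/readme.py`; the dataset and parameter values are illustrative, not prescriptive):

from sklearn.datasets import load_breast_cancer
from xgboost_ray import RayDMatrix, RayParams, train

# Wrap the training data in a RayDMatrix so it can be sharded across actors.
train_x, train_y = load_breast_cancer(return_X_y=True)
train_set = RayDMatrix(train_x, train_y)

evals_result = {}
bst = train(
    {"objective": "binary:logistic", "eval_metric": ["logloss", "error"]},
    train_set,
    evals_result=evals_result,
    evals=[(train_set, "train")],
    verbose_eval=False,
    # Two training actors with one CPU each; scale these to the cluster.
    ray_params=RayParams(num_actors=2, cpus_per_actor=1),
)
bst.save_model("model.xgb")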
-------------------------------------------------------------------------------- /xgboost_ray/callback.py: -------------------------------------------------------------------------------- 1 | import os 2 | from abc import ABC 3 | from typing import TYPE_CHECKING, Any, Dict, Sequence, Union 4 | 5 | import pandas as pd 6 | from ray.util.annotations import DeveloperAPI, PublicAPI 7 | 8 | if TYPE_CHECKING: 9 | from xgboost_ray.main import RayXGBoostActor 10 | from xgboost_ray.matrix import RayDMatrix 11 | 12 | 13 | @PublicAPI(stability="beta") 14 | class DistributedCallback(ABC): 15 | """Distributed callbacks for RayXGBoostActors. 16 | 17 | The hooks of these callbacks are executed on the remote Ray actors 18 | at different points in time. They can be used to set environment 19 | variables or to prepare the training/prediction environment in other 20 | ways. Distributed callback objects are de-serialized on each actor 21 | and are then independent of each other - changing the state of one 22 | callback will not alter the state of the other copies on different actors. 23 | 24 | Callbacks can be passed to xgboost_ray via 25 | :class:`RayParams <xgboost_ray.main.RayParams>` using the 26 | ``distributed_callbacks`` parameter. 27 | """ 28 | 29 | def on_init(self, actor: "RayXGBoostActor", *args, **kwargs): 30 | pass 31 | 32 | def before_data_loading( 33 | self, actor: "RayXGBoostActor", data: "RayDMatrix", *args, **kwargs 34 | ): 35 | pass 36 | 37 | def after_data_loading( 38 | self, actor: "RayXGBoostActor", data: "RayDMatrix", *args, **kwargs 39 | ): 40 | pass 41 | 42 | def before_train(self, actor: "RayXGBoostActor", *args, **kwargs): 43 | pass 44 | 45 | def after_train(self, actor: "RayXGBoostActor", result_dict: Dict, *args, **kwargs): 46 | pass 47 | 48 | def before_predict(self, actor: "RayXGBoostActor", *args, **kwargs): 49 | pass 50 | 51 | def after_predict( 52 | self, 53 | actor: "RayXGBoostActor", 54 | predictions: Union[pd.Series, pd.DataFrame], 55 | *args, 56 | **kwargs 57 | ): 58 | pass 59 | 60 | 61 | @DeveloperAPI 62 | class DistributedCallbackContainer: 63 | def __init__(self, callbacks: Sequence[DistributedCallback]): 64 | self.callbacks = callbacks or [] 65 | 66 | def on_init(self, actor: "RayXGBoostActor", *args, **kwargs): 67 | for callback in self.callbacks: 68 | callback.on_init(actor, *args, **kwargs) 69 | 70 | def before_data_loading( 71 | self, actor: "RayXGBoostActor", data: "RayDMatrix", *args, **kwargs 72 | ): 73 | for callback in self.callbacks: 74 | callback.before_data_loading(actor, data, *args, **kwargs) 75 | 76 | def after_data_loading( 77 | self, actor: "RayXGBoostActor", data: "RayDMatrix", *args, **kwargs 78 | ): 79 | for callback in self.callbacks: 80 | callback.after_data_loading(actor, data, *args, **kwargs) 81 | 82 | def before_train(self, actor: "RayXGBoostActor", *args, **kwargs): 83 | for callback in self.callbacks: 84 | callback.before_train(actor, *args, **kwargs) 85 | 86 | def after_train(self, actor: "RayXGBoostActor", result_dict: Dict, *args, **kwargs): 87 | for callback in self.callbacks: 88 | callback.after_train(actor, result_dict, *args, **kwargs) 89 | 90 | def before_predict(self, actor: "RayXGBoostActor", *args, **kwargs): 91 | for callback in self.callbacks: 92 | callback.before_predict(actor, *args, **kwargs) 93 | 94 | def after_predict( 95 | self, 96 | actor: "RayXGBoostActor", 97 | predictions: Union[pd.Series, pd.DataFrame], 98 | *args, 99 | **kwargs 100 | ): 101 | for callback in self.callbacks: 102 | callback.after_predict(actor, predictions, *args, **kwargs) 103 | 
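# Usage sketch (hypothetical, not part of this module): per the
# DistributedCallback docstring above, instances are passed through
# ``RayParams``, e.g. with the EnvironmentCallback defined below:
#
#   from xgboost_ray import RayParams, train
#   train(
#       params,
#       dtrain,
#       ray_params=RayParams(
#           num_actors=2,
#           distributed_callbacks=[EnvironmentCallback({"MY_ENV_VAR": "1"})],
#       ),
#   )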
104 | 105 | class EnvironmentCallback(DistributedCallback): 106 | def __init__(self, env_dict: Dict[str, Any]): 107 | self.env_dict = env_dict 108 | 109 | def on_init(self, actor, *args, **kwargs): 110 | os.environ.update(self.env_dict) 111 | -------------------------------------------------------------------------------- /xgboost_ray/compat/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | if TYPE_CHECKING: 4 | from xgboost_ray.xgb import xgboost as xgb 5 | 6 | try: 7 | from xgboost.callback import TrainingCallback 8 | 9 | LEGACY_CALLBACK = False 10 | except ImportError: 11 | 12 | class TrainingCallback: 13 | def __init__(self): 14 | if hasattr(self, "before_iteration"): 15 | # XGBoost < 1.0 is looking up __dict__ to see if a 16 | # callback should be called before or after an iteration. 17 | # So here we move this to self._before_iteration and 18 | # overwrite the dict. 19 | self._before_iteration = getattr(self, "before_iteration") 20 | self.__dict__["before_iteration"] = True 21 | 22 | def __call__(self, callback_env: "xgb.core.CallbackEnv"): 23 | if hasattr(self, "_before_iteration"): 24 | self._before_iteration( 25 | model=callback_env.model, 26 | epoch=callback_env.iteration, 27 | evals_log=callback_env.evaluation_result_list, 28 | ) 29 | 30 | if hasattr(self, "after_iteration"): 31 | self.after_iteration( 32 | model=callback_env.model, 33 | epoch=callback_env.iteration, 34 | evals_log=callback_env.evaluation_result_list, 35 | ) 36 | 37 | def before_training(self, model): 38 | pass 39 | 40 | def after_training(self, model): 41 | pass 42 | 43 | LEGACY_CALLBACK = True 44 | 45 | try: 46 | from xgboost import RabitTracker 47 | except ImportError: 48 | from xgboost_ray.compat.tracker import RabitTracker 49 | 50 | __all__ = ["TrainingCallback", "RabitTracker"] 51 | -------------------------------------------------------------------------------- /xgboost_ray/data_sources/__init__.py: -------------------------------------------------------------------------------- 1 | from xgboost_ray.data_sources.csv import CSV 2 | from xgboost_ray.data_sources.dask import Dask 3 | from xgboost_ray.data_sources.data_source import DataSource, RayFileType 4 | from xgboost_ray.data_sources.modin import Modin 5 | from xgboost_ray.data_sources.numpy import Numpy 6 | from xgboost_ray.data_sources.object_store import ObjectStore 7 | from xgboost_ray.data_sources.pandas import Pandas 8 | from xgboost_ray.data_sources.parquet import Parquet 9 | from xgboost_ray.data_sources.partitioned import Partitioned 10 | from xgboost_ray.data_sources.petastorm import Petastorm 11 | from xgboost_ray.data_sources.ray_dataset import RayDataset 12 | 13 | data_sources = [ 14 | Numpy, 15 | Pandas, 16 | Partitioned, 17 | Modin, 18 | Dask, 19 | Petastorm, 20 | CSV, 21 | Parquet, 22 | ObjectStore, 23 | RayDataset, 24 | ] 25 | 26 | __all__ = [ 27 | "DataSource", 28 | "RayFileType", 29 | "Numpy", 30 | "Pandas", 31 | "Modin", 32 | "Dask", 33 | "Petastorm", 34 | "CSV", 35 | "Parquet", 36 | "ObjectStore", 37 | "RayDataset", 38 | "Partitioned", 39 | ] 40 | -------------------------------------------------------------------------------- /xgboost_ray/data_sources/_distributed.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import math 3 | from collections import defaultdict 4 | from typing import Any, Dict, Sequence 5 | 6 | import ray 7 | from ray.actor import ActorHandle 8 | 9 | 10 | def 
get_actor_rank_ips(actors: Sequence[ActorHandle]) -> Dict[int, str]: 11 | """Get a dict mapping from actor ranks to their IPs""" 12 | no_obj = ray.put(None) 13 | # Build a dict mapping actor ranks to their IP addresses 14 | actor_rank_ips: Dict[int, str] = dict( 15 | enumerate( 16 | ray.get( 17 | [actor.ip.remote() if actor is not None else no_obj for actor in actors] 18 | ) 19 | ) 20 | ) 21 | return actor_rank_ips 22 | 23 | 24 | def assign_partitions_to_actors( 25 | ip_to_parts: Dict[int, Any], actor_rank_ips: Dict[int, str] 26 | ) -> Dict[int, Sequence[Any]]: 27 | """Assign partitions from a distributed dataframe to actors. 28 | 29 | This function collects distributed partitions and evenly distributes 30 | them to actors, trying to minimize data transfer by respecting 31 | co-locality. 32 | 33 | This function currently does _not_ take partition sizes into account 34 | for distributing data. It assumes that all partitions have (more or less) 35 | the same length. 36 | 37 | Instead, partitions are evenly distributed. E.g. for 8 partitions and 3 38 | actors, each actor gets assigned 2 or 3 partitions. Which partitions are 39 | assigned depends on the data locality. 40 | 41 | The algorithm is as follows: For any number of data partitions, get the 42 | Ray object references to the shards and the IP addresses where they 43 | currently live. 44 | 45 | Calculate the minimum and maximum amount of partitions per actor. These 46 | numbers should differ by at most 1. Also calculate how many actors will 47 | get more partitions assigned than the other actors. 48 | 49 | First, each actor gets assigned up to ``max_parts_per_actor`` co-located 50 | partitions. Only up to ``num_actors_with_max_parts`` actors get the 51 | maximum number of partitions, the rest try to fill the minimum. 52 | 53 | The rest of the partitions (all of which cannot be assigned to a 54 | co-located actor) are assigned to actors until there are none left. 55 | """ 56 | num_partitions = sum(len(parts) for parts in ip_to_parts.values()) 57 | num_actors = len(actor_rank_ips) 58 | min_parts_per_actor = max(0, math.floor(num_partitions / num_actors)) 59 | max_parts_per_actor = max(1, math.ceil(num_partitions / num_actors)) 60 | num_actors_with_max_parts = num_partitions % num_actors 61 | 62 | # This is our result dict that maps actor objects to a list of partitions 63 | actor_to_partitions = defaultdict(list) 64 | 65 | # First we loop through the actors and assign them partitions from their 66 | # own IPs. Do this until each actor has `min_parts_per_actor` partitions 67 | partition_assigned = True 68 | while partition_assigned: 69 | partition_assigned = False 70 | 71 | # Loop through each actor once, assigning 72 | for rank, actor_ip in actor_rank_ips.items(): 73 | num_parts_left_on_ip = len(ip_to_parts[actor_ip]) 74 | num_actor_parts = len(actor_to_partitions[rank]) 75 | 76 | if num_parts_left_on_ip > 0 and num_actor_parts < max_parts_per_actor: 77 | if num_actor_parts >= min_parts_per_actor: 78 | # Only allow up to `num_actors_with_max_parts actors to 79 | # have the maximum number of partitions assigned. 80 | if num_actors_with_max_parts <= 0: 81 | continue 82 | num_actors_with_max_parts -= 1 83 | actor_to_partitions[rank].append(ip_to_parts[actor_ip].pop(0)) 84 | partition_assigned = True 85 | 86 | # The rest of the partitions, no matter where they are located, could not 87 | # be assigned to co-located actors. Thus, we assign them 88 | # to actors who still need partitions. 
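    # Worked example of the arithmetic above (illustrative): with 8
    # partitions and 3 actors, min_parts_per_actor = floor(8 / 3) = 2,
    # max_parts_per_actor = ceil(8 / 3) = 3, and num_actors_with_max_parts
    # = 8 % 3 = 2, so two actors end up with 3 partitions and one with 2.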
89 | rest_parts = list(itertools.chain(*ip_to_parts.values())) 90 | partition_assigned = True 91 | while len(rest_parts) > 0 and partition_assigned: 92 | partition_assigned = False 93 | for rank in actor_rank_ips: 94 | num_actor_parts = len(actor_to_partitions[rank]) 95 | if num_actor_parts < max_parts_per_actor: 96 | if num_actor_parts >= min_parts_per_actor: 97 | if num_actors_with_max_parts <= 0: 98 | continue 99 | num_actors_with_max_parts -= 1 100 | actor_to_partitions[rank].append(rest_parts.pop(0)) 101 | partition_assigned = True 102 | if len(rest_parts) <= 0: 103 | break 104 | 105 | if len(rest_parts) != 0: 106 | raise RuntimeError( 107 | "There are still partitions left to assign, but no actor " 108 | "has capacity for more. This is probably a bug. Please go " 109 | "to https://github.com/ray-project/xgboost_ray to report it." 110 | ) 111 | 112 | return actor_to_partitions 113 | -------------------------------------------------------------------------------- /xgboost_ray/data_sources/csv.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Iterable, Optional, Sequence, Union 2 | 3 | import pandas as pd 4 | 5 | from xgboost_ray.data_sources.data_source import DataSource, RayFileType 6 | from xgboost_ray.data_sources.pandas import Pandas 7 | 8 | 9 | class CSV(DataSource): 10 | """Read one or many CSV files.""" 11 | 12 | supports_central_loading = True 13 | supports_distributed_loading = True 14 | 15 | @staticmethod 16 | def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: 17 | return filetype == RayFileType.CSV 18 | 19 | @staticmethod 20 | def get_filetype(data: Any) -> Optional[RayFileType]: 21 | if data.endswith(".csv") or data.endswith("csv.gz"): 22 | return RayFileType.CSV 23 | return None 24 | 25 | @staticmethod 26 | def load_data( 27 | data: Union[str, Sequence[str]], 28 | ignore: Optional[Sequence[str]] = None, 29 | indices: Optional[Sequence[int]] = None, 30 | **kwargs 31 | ): 32 | if isinstance(data, Iterable) and not isinstance(data, str): 33 | shards = [] 34 | 35 | for i, shard in enumerate(data): 36 | if indices and i not in indices: 37 | continue 38 | shard_df = pd.read_csv(shard, **kwargs) 39 | shards.append(Pandas.load_data(shard_df, ignore=ignore)) 40 | return pd.concat(shards, copy=False) 41 | else: 42 | local_df = pd.read_csv(data, **kwargs) 43 | return Pandas.load_data(local_df, ignore=ignore) 44 | 45 | @staticmethod 46 | def get_n(data: Any): 47 | return len(list(data)) 48 | -------------------------------------------------------------------------------- /xgboost_ray/data_sources/dask.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from typing import Any, Dict, List, Optional, Sequence, Tuple, Union 3 | 4 | import pandas as pd 5 | import ray 6 | import wrapt 7 | from ray.actor import ActorHandle 8 | 9 | from xgboost_ray.data_sources._distributed import ( 10 | assign_partitions_to_actors, 11 | get_actor_rank_ips, 12 | ) 13 | from xgboost_ray.data_sources.data_source import DataSource, RayFileType 14 | 15 | try: 16 | import dask # noqa: F401 17 | from ray.util.dask import ray_dask_get 18 | 19 | DASK_INSTALLED = True 20 | except ImportError: 21 | DASK_INSTALLED = False 22 | 23 | 24 | def _assert_dask_installed(): 25 | if not DASK_INSTALLED: 26 | raise RuntimeError( 27 | "Tried to use Dask as a data source, but dask is not " 28 | "installed. This function shouldn't have been called. 
" 29 | "\nFIX THIS by installing dask: `pip install dask`. " 30 | "\nPlease also raise an issue on our GitHub: " 31 | "https://github.com/ray-project/xgboost_ray as this part of " 32 | "the code should not have been reached." 33 | ) 34 | 35 | 36 | @wrapt.decorator 37 | def ensure_ray_dask_initialized( 38 | func: Any, instance: Any, args: List[Any], kwargs: Any 39 | ) -> Any: 40 | _assert_dask_installed() 41 | dask.config.set(scheduler=ray_dask_get) 42 | return func(*args, **kwargs) 43 | 44 | 45 | class Dask(DataSource): 46 | """Read from distributed Dask dataframe. 47 | 48 | A `Dask dataframe `_ 49 | is a distributed drop-in replacement for pandas. 50 | 51 | Dask dataframes are stored on multiple actors, making them 52 | suitable for distributed loading. 53 | """ 54 | 55 | supports_central_loading = True 56 | supports_distributed_loading = True 57 | 58 | @staticmethod 59 | def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: 60 | if not DASK_INSTALLED: 61 | return False 62 | from dask.dataframe import DataFrame as DaskDataFrame 63 | from dask.dataframe import Series as DaskSeries 64 | 65 | return isinstance(data, (DaskDataFrame, DaskSeries)) 66 | 67 | @ensure_ray_dask_initialized 68 | @staticmethod 69 | def load_data( 70 | data: Any, # dask.pandas.DataFrame 71 | ignore: Optional[Sequence[str]] = None, 72 | indices: Optional[Union[Sequence[int], Sequence[int]]] = None, 73 | **kwargs 74 | ) -> pd.DataFrame: 75 | _assert_dask_installed() 76 | 77 | import dask.dataframe as dd 78 | 79 | if indices is not None and len(indices) > 0 and isinstance(indices[0], Tuple): 80 | # We got a list of partition IDs belonging to Dask partitions 81 | return dd.concat([data.partitions[i] for (i,) in indices]).compute() 82 | 83 | # Dask does not support iloc() for row selection, so we have to 84 | # compute a local pandas dataframe first 85 | local_df = data.compute() 86 | 87 | if indices: 88 | local_df = local_df.iloc[indices] 89 | 90 | if ignore: 91 | local_df = local_df[local_df.columns.difference(ignore)] 92 | 93 | return local_df 94 | 95 | @ensure_ray_dask_initialized 96 | @staticmethod 97 | def convert_to_series(data: Any) -> pd.Series: 98 | _assert_dask_installed() 99 | from dask.array import Array as DaskArray 100 | from dask.dataframe import DataFrame as DaskDataFrame 101 | from dask.dataframe import Series as DaskSeries 102 | 103 | if isinstance(data, DaskDataFrame): 104 | return pd.Series(data.compute().squeeze()) 105 | elif isinstance(data, DaskSeries): 106 | return data.compute() 107 | elif isinstance(data, DaskArray): 108 | return pd.Series(data.compute()) 109 | 110 | return DataSource.convert_to_series(data) 111 | 112 | @ensure_ray_dask_initialized 113 | @staticmethod 114 | def get_actor_shards( 115 | data: Any, actors: Sequence[ActorHandle] # dask.dataframe.DataFrame 116 | ) -> Tuple[Any, Optional[Dict[int, Any]]]: 117 | _assert_dask_installed() 118 | 119 | actor_rank_ips = get_actor_rank_ips(actors) 120 | 121 | # Get IPs and partitions 122 | ip_to_parts = get_ip_to_parts(data) 123 | 124 | return data, assign_partitions_to_actors(ip_to_parts, actor_rank_ips) 125 | 126 | @ensure_ray_dask_initialized 127 | @staticmethod 128 | def get_n(data: Any): 129 | """ 130 | For naive distributed loading we just return the number of rows 131 | here. 
Loading by shard is achieved via `get_actor_shards()` 132 | """ 133 | return len(data) 134 | 135 | 136 | def get_ip_to_parts(data: Any) -> Dict[int, Sequence[Any]]: 137 | persisted = data.persist(scheduler=ray_dask_get) 138 | name = persisted._name 139 | 140 | node_ids_to_node = {node["NodeID"]: node for node in ray.state.nodes()} 141 | 142 | # This is a hacky way to get the partition node IDs, and it's not 143 | # 100% accurate as the map task could get scheduled on a different node 144 | # (though Ray tries to keep locality). We need to use that until 145 | # ray.state.objects() or something like it is available again. 146 | partition_locations_df = persisted.map_partitions( 147 | lambda df: pd.DataFrame([ray.get_runtime_context().get_node_id()]) 148 | ).compute() 149 | partition_locations = [ 150 | partition_locations_df[0].iloc[i] for i in range(partition_locations_df.size) 151 | ] 152 | 153 | ip_to_parts = defaultdict(list) 154 | for (obj_name, pid), obj_ref in dask.base.collections_to_dsk([persisted]).items(): 155 | assert obj_name == name 156 | 157 | if isinstance(obj_ref, ray.ObjectRef): 158 | node_id = partition_locations[pid] 159 | node = node_ids_to_node.get(node_id, {}) 160 | ip = node.get("NodeManagerAddress", "_no_ip") 161 | else: 162 | ip = "_no_ip" 163 | 164 | # Pass tuples here (integers can be misinterpreted as row numbers) 165 | ip_to_parts[ip].append((pid,)) 166 | 167 | return ip_to_parts 168 | -------------------------------------------------------------------------------- /xgboost_ray/data_sources/data_source.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple, Union 3 | 4 | import pandas as pd 5 | from ray.actor import ActorHandle 6 | from ray.util.annotations import PublicAPI 7 | 8 | if TYPE_CHECKING: 9 | from xgboost_ray.xgb import xgboost as xgb 10 | 11 | 12 | @PublicAPI(stability="beta") 13 | class RayFileType(Enum): 14 | """Enum for different file types (used for overrides).""" 15 | 16 | CSV = 1 17 | PARQUET = 2 18 | PETASTORM = 3 19 | 20 | 21 | @PublicAPI(stability="beta") 22 | class DataSource: 23 | """Abstract class for data sources. 24 | 25 | xgboost_ray supports reading from various sources, such as files 26 | (e.g. CSV, Parquet) or distributed datasets (Modin). 27 | 28 | This abstract class defines an interface to read from these sources. 29 | New data sources can be added by implementing this interface. 30 | 31 | ``DataSource`` classes are not instantiated. Instead, static and 32 | class methods are called directly. 33 | """ 34 | 35 | supports_central_loading = True 36 | supports_distributed_loading = False 37 | needs_partitions = True 38 | 39 | @staticmethod 40 | def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: 41 | """Check if the supplied data matches this data source. 42 | 43 | Args: 44 | data: Dataset. 45 | filetype: RayFileType of the provided 46 | dataset. Some DataSource implementations might require 47 | that this is explicitly set (e.g. if multiple sources can 48 | read CSV files). 49 | 50 | Returns: 51 | Boolean indicating if this data source belongs to/is compatible 52 | with the data. 53 | """ 54 | return False 55 | 56 | @staticmethod 57 | def get_filetype(data: Any) -> Optional[RayFileType]: 58 | """Method to help infer the filetype. 
59 | 60 | Returns None if the supplied data type (usually a filename) 61 | is not covered by this data source, otherwise the filetype 62 | is returned. 63 | 64 | Args: 65 | data: Data set 66 | 67 | Returns: 68 | RayFileType or None. 69 | """ 70 | return None 71 | 72 | @staticmethod 73 | def load_data( 74 | data: Any, 75 | ignore: Optional[Sequence[str]] = None, 76 | indices: Optional[Sequence[Any]] = None, 77 | **kwargs 78 | ) -> pd.DataFrame: 79 | """ 80 | Load data into a pandas dataframe. 81 | 82 | Ignore specific columns, and optionally select specific indices. 83 | 84 | Args: 85 | data: Input data 86 | ignore: Column names to ignore 87 | indices: Indices to select. What an 88 | index indicates depends on the data source. 89 | 90 | Returns: 91 | Pandas DataFrame. 92 | """ 93 | raise NotImplementedError 94 | 95 | @staticmethod 96 | def update_feature_names(matrix: "xgb.DMatrix", feature_names: Optional[List[str]]): 97 | """Optionally update feature names before training/prediction 98 | 99 | Args: 100 | matrix: xgboost DMatrix object. 101 | feature_names: Feature names manually passed to the 102 | ``RayDMatrix`` object. 103 | 104 | """ 105 | pass 106 | 107 | @staticmethod 108 | def convert_to_series(data: Any) -> pd.Series: 109 | """Convert data from the data source type to a pandas series""" 110 | if isinstance(data, pd.DataFrame): 111 | return pd.Series(data.squeeze()) 112 | 113 | if not isinstance(data, pd.Series): 114 | return pd.Series(data) 115 | 116 | return data 117 | 118 | @classmethod 119 | def get_column( 120 | cls, data: pd.DataFrame, column: Any 121 | ) -> Tuple[pd.Series, Optional[Union[str, List]]]: 122 | """Helper method wrapping around convert to series. 123 | 124 | This method should usually not be overwritten. 125 | """ 126 | if isinstance(column, str) or isinstance(column, List): 127 | return data[column], column 128 | elif column is not None: 129 | return cls.convert_to_series(column), None 130 | return column, None 131 | 132 | @staticmethod 133 | def get_n(data: Any): 134 | """Get length of data source partitions for sharding.""" 135 | return len(data) 136 | 137 | @staticmethod 138 | def get_actor_shards( 139 | data: Any, actors: Sequence[ActorHandle] 140 | ) -> Tuple[Any, Optional[Dict[int, Any]]]: 141 | """Get a dict mapping actor ranks to shards. 142 | 143 | Args: 144 | data: Data to shard. 145 | 146 | Returns: 147 | Returns a tuple of which the first element indicates the new 148 | data object that will overwrite the existing data object 149 | in the RayDMatrix (e.g. when the object is not serializable). 150 | The second element is a dict mapping actor ranks to shards. 151 | These objects are usually passed to the ``load_data()`` method 152 | for distributed loading, so that method needs to be able to 153 | deal with the respective data. 
154 | """ 155 | return data, None 156 | -------------------------------------------------------------------------------- /xgboost_ray/data_sources/modin.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from typing import Any, Dict, Optional, Sequence, Tuple, Union 3 | 4 | import pandas as pd 5 | import ray 6 | from ray import ObjectRef 7 | from ray.actor import ActorHandle 8 | 9 | from xgboost_ray.data_sources._distributed import ( 10 | assign_partitions_to_actors, 11 | get_actor_rank_ips, 12 | ) 13 | from xgboost_ray.data_sources.data_source import DataSource, RayFileType 14 | from xgboost_ray.data_sources.object_store import ObjectStore 15 | 16 | try: 17 | import modin # noqa: F401 18 | from modin.config.envvars import Engine 19 | from modin.distributed.dataframe.pandas import unwrap_partitions # noqa: F401 20 | from modin.pandas import DataFrame as ModinDataFrame # noqa: F401 21 | from modin.pandas import Series as ModinSeries # noqa: F401 22 | from packaging.version import Version 23 | 24 | MODIN_INSTALLED = Version(modin.__version__) >= Version("0.9.0") 25 | 26 | # Check if importing the Ray engine leads to errors 27 | Engine().get() 28 | 29 | except (ImportError, AttributeError): 30 | MODIN_INSTALLED = False 31 | 32 | 33 | def _assert_modin_installed(): 34 | if not MODIN_INSTALLED: 35 | raise RuntimeError( 36 | "Tried to use Modin as a data source, but modin is not " 37 | "installed or it conflicts with the pandas version. " 38 | "This function shouldn't have been called. " 39 | "\nFIX THIS by installing modin: `pip install modin` " 40 | "and making sure that the installed pandas version is " 41 | "supported by modin." 42 | "\nPlease also raise an issue on our GitHub: " 43 | "https://github.com/ray-project/xgboost_ray as this part of " 44 | "the code should not have been reached." 45 | ) 46 | 47 | 48 | class Modin(DataSource): 49 | """Read from distributed Modin dataframe. 50 | 51 | `Modin `_ is a distributed 52 | drop-in replacement for pandas supporting Ray as a backend. 53 | 54 | Modin dataframes are stored on multiple actors, making them 55 | suitable for distributed loading. 56 | """ 57 | 58 | supports_central_loading = True 59 | supports_distributed_loading = True 60 | 61 | @staticmethod 62 | def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: 63 | if not MODIN_INSTALLED: 64 | return False 65 | # Has to be imported again. 
66 | from modin.pandas import DataFrame as ModinDataFrame # noqa: F811 67 | from modin.pandas import Series as ModinSeries # noqa: F811 68 | 69 | return isinstance(data, (ModinDataFrame, ModinSeries)) 70 | 71 | @staticmethod 72 | def load_data( 73 | data: Any, # modin.pandas.DataFrame 74 | ignore: Optional[Sequence[str]] = None, 75 | indices: Optional[Union[Sequence[int], Sequence[ObjectRef]]] = None, 76 | **kwargs 77 | ) -> pd.DataFrame: 78 | _assert_modin_installed() 79 | 80 | if ( 81 | indices is not None 82 | and len(indices) > 0 83 | and isinstance(indices[0], ObjectRef) 84 | ): 85 | # We got a list of ObjectRefs belonging to Modin partitions 86 | return ObjectStore.load_data(data=indices, indices=None, ignore=ignore) 87 | 88 | local_df = data 89 | if indices: 90 | local_df = local_df.iloc[indices] 91 | 92 | local_df = local_df._to_pandas() 93 | 94 | if ignore: 95 | local_df = local_df[local_df.columns.difference(ignore)] 96 | 97 | return local_df 98 | 99 | @staticmethod 100 | def convert_to_series(data: Any) -> pd.Series: 101 | _assert_modin_installed() 102 | # Has to be imported again. 103 | from modin.pandas import DataFrame as ModinDataFrame # noqa: F811 104 | from modin.pandas import Series as ModinSeries # noqa: F811 105 | 106 | if isinstance(data, ModinDataFrame): 107 | return pd.Series(data._to_pandas().squeeze()) 108 | elif isinstance(data, ModinSeries): 109 | return data._to_pandas() 110 | 111 | return DataSource.convert_to_series(data) 112 | 113 | @staticmethod 114 | def get_actor_shards( 115 | data: Any, actors: Sequence[ActorHandle] # modin.pandas.DataFrame 116 | ) -> Tuple[Any, Optional[Dict[int, Any]]]: 117 | _assert_modin_installed() 118 | 119 | # Has to be imported again. 120 | from modin.distributed.dataframe.pandas import unwrap_partitions # noqa: F811 121 | 122 | actor_rank_ips = get_actor_rank_ips(actors) 123 | 124 | # Get IPs and partitions 125 | unwrapped = unwrap_partitions(data, axis=0, get_ip=True) 126 | ip_objs, part_objs = zip(*unwrapped) 127 | 128 | # Build a table mapping from IP to list of partitions 129 | ip_to_parts = defaultdict(list) 130 | for ip, part_obj in zip(ray.get(list(ip_objs)), part_objs): 131 | ip_to_parts[ip].append(part_obj) 132 | 133 | # Modin dataframes are not serializable, so pass None here 134 | # as the first return value 135 | return None, assign_partitions_to_actors(ip_to_parts, actor_rank_ips) 136 | 137 | @staticmethod 138 | def get_n(data: Any): 139 | """ 140 | For naive distributed loading we just return the number of rows 141 | here. 
Loading by shard is achieved via `get_actor_shards()` 142 | """ 143 | return len(data) 144 | -------------------------------------------------------------------------------- /xgboost_ray/data_sources/numpy.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, Any, List, Optional, Sequence 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from xgboost_ray.data_sources.data_source import DataSource, RayFileType 7 | from xgboost_ray.data_sources.pandas import Pandas 8 | 9 | if TYPE_CHECKING: 10 | from xgboost_ray.xgb import xgboost as xgb 11 | 12 | 13 | class Numpy(DataSource): 14 | """Read from numpy arrays.""" 15 | 16 | @staticmethod 17 | def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: 18 | return isinstance(data, np.ndarray) 19 | 20 | @staticmethod 21 | def update_feature_names(matrix: "xgb.DMatrix", feature_names: Optional[List[str]]): 22 | # Potentially unset feature names 23 | matrix.feature_names = feature_names 24 | 25 | @staticmethod 26 | def load_data( 27 | data: np.ndarray, 28 | ignore: Optional[Sequence[str]] = None, 29 | indices: Optional[Sequence[int]] = None, 30 | **kwargs, 31 | ) -> pd.DataFrame: 32 | local_df = pd.DataFrame(data, columns=[f"f{i}" for i in range(data.shape[1])]) 33 | return Pandas.load_data(local_df, ignore=ignore, indices=indices) 34 | -------------------------------------------------------------------------------- /xgboost_ray/data_sources/object_store.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional, Sequence 2 | 3 | import pandas as pd 4 | import ray 5 | from ray import ObjectRef 6 | 7 | from xgboost_ray.data_sources.data_source import DataSource, RayFileType 8 | from xgboost_ray.data_sources.pandas import Pandas 9 | 10 | 11 | class ObjectStore(DataSource): 12 | """Read pandas dataframes and series from ray object store.""" 13 | 14 | @staticmethod 15 | def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: 16 | if isinstance(data, Sequence): 17 | return all(isinstance(d, ObjectRef) for d in data) 18 | return isinstance(data, ObjectRef) 19 | 20 | @staticmethod 21 | def load_data( 22 | data: Sequence[ObjectRef], 23 | ignore: Optional[Sequence[str]] = None, 24 | indices: Optional[Sequence[int]] = None, 25 | **kwargs 26 | ) -> pd.DataFrame: 27 | if indices is not None: 28 | data = [data[i] for i in indices] 29 | 30 | local_df = ray.get(data) 31 | 32 | return Pandas.load_data(pd.concat(local_df, copy=False), ignore=ignore) 33 | 34 | @staticmethod 35 | def convert_to_series(data: Any) -> pd.Series: 36 | if isinstance(data, ObjectRef): 37 | data = ray.get(data) 38 | else: 39 | data = pd.concat(ray.get(data), copy=False) 40 | return DataSource.convert_to_series(data) 41 | -------------------------------------------------------------------------------- /xgboost_ray/data_sources/pandas.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional, Sequence 2 | 3 | import pandas as pd 4 | 5 | from xgboost_ray.data_sources.data_source import DataSource, RayFileType 6 | 7 | 8 | class Pandas(DataSource): 9 | """Read from pandas dataframes and series.""" 10 | 11 | @staticmethod 12 | def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: 13 | return isinstance(data, (pd.DataFrame, pd.Series)) 14 | 15 | @staticmethod 16 | def load_data( 17 | data: Any, 18 | ignore: Optional[Sequence[str]] = 
None, 19 | indices: Optional[Sequence[int]] = None, 20 | **kwargs 21 | ) -> pd.DataFrame: 22 | local_df = data 23 | 24 | if ignore: 25 | local_df = local_df[local_df.columns.difference(ignore)] 26 | 27 | if indices: 28 | return local_df.iloc[indices] 29 | 30 | return local_df 31 | -------------------------------------------------------------------------------- /xgboost_ray/data_sources/parquet.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Iterable, Optional, Sequence, Union 2 | 3 | import pandas as pd 4 | 5 | from xgboost_ray.data_sources.data_source import DataSource, RayFileType 6 | from xgboost_ray.data_sources.pandas import Pandas 7 | 8 | 9 | class Parquet(DataSource): 10 | """Read one or many Parquet files.""" 11 | 12 | supports_central_loading = True 13 | supports_distributed_loading = True 14 | 15 | @staticmethod 16 | def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: 17 | return filetype == RayFileType.PARQUET 18 | 19 | @staticmethod 20 | def get_filetype(data: Any) -> Optional[RayFileType]: 21 | if data.endswith(".parquet"): 22 | return RayFileType.PARQUET 23 | return None 24 | 25 | @staticmethod 26 | def load_data( 27 | data: Union[str, Sequence[str]], 28 | ignore: Optional[Sequence[str]] = None, 29 | indices: Optional[Sequence[int]] = None, 30 | **kwargs 31 | ) -> pd.DataFrame: 32 | if isinstance(data, Iterable) and not isinstance(data, str): 33 | shards = [] 34 | 35 | for i, shard in enumerate(data): 36 | if indices and i not in indices: 37 | continue 38 | 39 | shard_df = pd.read_parquet(shard, **kwargs) 40 | shards.append(Pandas.load_data(shard_df, ignore=ignore)) 41 | return pd.concat(shards, copy=False) 42 | else: 43 | local_df = pd.read_parquet(data, **kwargs) 44 | return Pandas.load_data(local_df, ignore=ignore) 45 | 46 | @staticmethod 47 | def get_n(data: Any): 48 | return len(list(data)) 49 | -------------------------------------------------------------------------------- /xgboost_ray/data_sources/partitioned.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from typing import Any, Dict, Optional, Sequence, Tuple 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from ray import ObjectRef 7 | from ray.actor import ActorHandle 8 | 9 | from xgboost_ray.data_sources._distributed import ( 10 | assign_partitions_to_actors, 11 | get_actor_rank_ips, 12 | ) 13 | from xgboost_ray.data_sources.data_source import DataSource, RayFileType 14 | from xgboost_ray.data_sources.numpy import Numpy 15 | from xgboost_ray.data_sources.pandas import Pandas 16 | 17 | 18 | class Partitioned(DataSource): 19 | """Read from distributed data structure implementing __partitioned__. 20 | 21 | __partitioned__ provides metadata about how the data is partitioned and 22 | distributed across several compute nodes, making objects that support it 23 | suitable for distributed loading.
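    Example (illustrative sketch; ``ref_0``/``ref_1`` and the IP values
    are hypothetical): a row-wise 2x1 partitioning of a 1000x4 structure
    could expose metadata of roughly this shape::

        data.__partitioned__ == {
            "shape": (1000, 4),
            "partition_tiling": (2, 1),
            "get": ray.get,
            "partitions": {
                (0, 0): {"start": (0, 0), "shape": (500, 4),
                         "data": ref_0, "location": ["10.0.0.1"]},
                (1, 0): {"start": (500, 0), "shape": (500, 4),
                         "data": ref_1, "location": ["10.0.0.2"]},
            },
        }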
24 | 25 | Also see the __partitioned__ spec: 26 | https://github.com/IntelPython/DPPY-Spec/blob/draft/partitioned/Partitioned.md 27 | """ 28 | 29 | supports_central_loading = True 30 | supports_distributed_loading = True 31 | 32 | @staticmethod 33 | def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: 34 | return hasattr(data, "__partitioned__") 35 | 36 | @staticmethod 37 | def load_data( 38 | data: Any, # __partitioned__ dict 39 | ignore: Optional[Sequence[str]] = None, 40 | indices: Optional[Sequence[ObjectRef]] = None, 41 | **kwargs 42 | ) -> pd.DataFrame: 43 | 44 | assert isinstance(data, dict), "Expected __partitioned__ dict" 45 | _get = data["get"] 46 | 47 | if indices is None or len(indices) == 0: 48 | tiling = data["partition_tiling"] 49 | ndims = len(tiling) 50 | # we need tuples to access partitions in the right order 51 | pos_suffix = (0,) * (ndims - 1) 52 | parts = data["partitions"] 53 | # get the full data, e.g. all shards/partitions 54 | local_df = [ 55 | _get(parts[(i,) + pos_suffix]["data"]) for i in range(tiling[0]) 56 | ] 57 | else: 58 | # here we got a list of futures for partitions 59 | local_df = _get(indices) 60 | 61 | if isinstance(local_df[0], pd.DataFrame): 62 | return Pandas.load_data(pd.concat(local_df, copy=False), ignore=ignore) 63 | else: 64 | return Numpy.load_data(np.concatenate(local_df), ignore=ignore) 65 | 66 | @staticmethod 67 | def get_actor_shards( 68 | data: Any, actors: Sequence[ActorHandle] # partitioned.pandas.DataFrame 69 | ) -> Tuple[Any, Optional[Dict[int, Any]]]: 70 | assert hasattr(data, "__partitioned__") 71 | 72 | actor_rank_ips = get_actor_rank_ips(actors) 73 | 74 | # Get accessor func and partitions 75 | parted = data.__partitioned__ 76 | parts = parted["partitions"] 77 | tiling = parted["partition_tiling"] 78 | ndims = len(tiling) 79 | if ndims < 1 or ndims > 2 or any(tiling[x] != 1 for x in range(1, ndims)): 80 | raise RuntimeError( 81 | "Only row-wise partitionings of 1d/2d structures supported." 82 | ) 83 | 84 | # Now build a table mapping from IP to list of partitions 85 | ip_to_parts = defaultdict(lambda: []) 86 | # we need tuples to access partitions in the right order 87 | pos_suffix = (0,) * (ndims - 1) 88 | for i in range(tiling[0]): 89 | part = parts[(i,) + pos_suffix] # this works for 1d and 2d 90 | ip_to_parts[part["location"][0]].append(part["data"]) 91 | # __partitioned__ is serializable, so pass it here 92 | # as the first return value 93 | ret = parted, assign_partitions_to_actors(ip_to_parts, actor_rank_ips) 94 | return ret 95 | 96 | @staticmethod 97 | def get_n(data: Any): 98 | """Get length of data source partitions for sharding.""" 99 | return data.__partitioned__["shape"][0] 100 | -------------------------------------------------------------------------------- /xgboost_ray/data_sources/petastorm.py: -------------------------------------------------------------------------------- 1 | from typing import Any, List, Optional, Sequence, Union 2 | 3 | import pandas as pd 4 | 5 | from xgboost_ray.data_sources.data_source import DataSource, RayFileType 6 | 7 | try: 8 | import petastorm 9 | 10 | PETASTORM_INSTALLED = True 11 | except ImportError: 12 | PETASTORM_INSTALLED = False 13 | 14 | 15 | def _assert_petastorm_installed(): 16 | if not PETASTORM_INSTALLED: 17 | raise RuntimeError( 18 | "Tried to use Petastorm as a data source, but petastorm is not " 19 | "installed. This function shouldn't have been called. " 20 | "\nFIX THIS by installing petastorm: `pip install petastorm`. 
" 21 | "\nPlease also raise an issue on our GitHub: " 22 | "https://github.com/ray-project/xgboost_ray as this part of " 23 | "the code should not have been reached." 24 | ) 25 | 26 | 27 | class Petastorm(DataSource): 28 | """Read with Petastorm. 29 | 30 | `Petastorm `_ is a machine learning 31 | training and evaluation library. 32 | 33 | This class accesses Petastorm's dataset loading interface for efficient 34 | loading of large datasets. 35 | """ 36 | 37 | supports_central_loading = True 38 | supports_distributed_loading = True 39 | 40 | @staticmethod 41 | def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: 42 | return PETASTORM_INSTALLED and filetype == RayFileType.PETASTORM 43 | 44 | @staticmethod 45 | def get_filetype(data: Any) -> Optional[RayFileType]: 46 | if not PETASTORM_INSTALLED: 47 | return None 48 | 49 | if not isinstance(data, List): 50 | data = [data] 51 | 52 | def _is_compatible(url: str): 53 | return url.endswith(".parquet") and ( 54 | url.startswith("s3://") 55 | or url.startswith("gs://") 56 | or url.startswith("hdfs://") 57 | or url.startswith("file://") 58 | ) 59 | 60 | if all(_is_compatible(url) for url in data): 61 | return RayFileType.PETASTORM 62 | 63 | return None 64 | 65 | @staticmethod 66 | def load_data( 67 | data: Union[str, Sequence[str]], 68 | ignore: Optional[Sequence[str]] = None, 69 | indices: Optional[Sequence[int]] = None, 70 | **kwargs 71 | ) -> pd.DataFrame: 72 | _assert_petastorm_installed() 73 | with petastorm.make_batch_reader(data) as reader: 74 | shards = [ 75 | pd.DataFrame(batch._asdict()) 76 | for i, batch in enumerate(reader) 77 | if not indices or i in indices 78 | ] 79 | 80 | local_df = pd.concat(shards, copy=False) 81 | 82 | if ignore: 83 | local_df = local_df[local_df.columns.difference(ignore)] 84 | 85 | return local_df 86 | 87 | @staticmethod 88 | def get_n(data: Any): 89 | return len(list(data)) 90 | -------------------------------------------------------------------------------- /xgboost_ray/data_sources/ray_dataset.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Optional, Sequence, Tuple, Union 2 | 3 | import pandas as pd 4 | import ray 5 | from ray.actor import ActorHandle 6 | 7 | from xgboost_ray.data_sources.data_source import DataSource, RayFileType 8 | from xgboost_ray.data_sources.pandas import Pandas 9 | 10 | try: 11 | import ray.data.dataset # noqa: F401 12 | 13 | RAY_DATASET_AVAILABLE = True 14 | except (ImportError, AttributeError): 15 | RAY_DATASET_AVAILABLE = False 16 | 17 | DATASET_TO_PANDAS_LIMIT = float("inf") 18 | 19 | 20 | def _assert_ray_data_available(): 21 | if not RAY_DATASET_AVAILABLE: 22 | raise RuntimeError( 23 | "Tried to use Ray datasets as a data source, but your version " 24 | "of Ray does not support it. " 25 | "\nFIX THIS by upgrading Ray: `pip install -U ray`. " 26 | "\nPlease also raise an issue on our GitHub: " 27 | "https://github.com/ray-project/xgboost_ray as this part of " 28 | "the code should not have been reached." 
29 | ) 30 | 31 | 32 | class RayDataset(DataSource): 33 | """Read from distributed Ray dataset.""" 34 | 35 | supports_central_loading = True 36 | supports_distributed_loading = True 37 | needs_partitions = False 38 | 39 | @staticmethod 40 | def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: 41 | if not RAY_DATASET_AVAILABLE: 42 | return False 43 | 44 | return isinstance(data, ray.data.dataset.Dataset) 45 | 46 | @staticmethod 47 | def load_data( 48 | data: "ray.data.dataset.Dataset", 49 | ignore: Optional[Sequence[str]] = None, 50 | indices: Optional[ 51 | Union[Sequence[int], Sequence["ray.data.dataset.Dataset"]] 52 | ] = None, 53 | **kwargs 54 | ) -> pd.DataFrame: 55 | _assert_ray_data_available() 56 | 57 | if indices is not None: 58 | if len(indices) > 0 and isinstance(indices[0], ray.data.dataset.Dataset): 59 | # We got a list of Datasets belonging to a partition 60 | data = indices 61 | else: 62 | data = [data[i] for i in indices] 63 | 64 | if isinstance(data, ray.data.dataset.Dataset): 65 | local_df = data.to_pandas(limit=DATASET_TO_PANDAS_LIMIT) 66 | else: 67 | local_df = pd.concat( 68 | [ds.to_pandas(limit=DATASET_TO_PANDAS_LIMIT) for ds in data], copy=False 69 | ) 70 | return Pandas.load_data(local_df, ignore=ignore) 71 | 72 | @staticmethod 73 | def convert_to_series( 74 | data: Union["ray.data.dataset.Dataset", Sequence["ray.data.dataset.Dataset"]] 75 | ) -> pd.Series: 76 | _assert_ray_data_available() 77 | 78 | if isinstance(data, ray.data.dataset.Dataset): 79 | data = data.to_pandas(limit=DATASET_TO_PANDAS_LIMIT) 80 | else: 81 | data = pd.concat( 82 | [ds.to_pandas(limit=DATASET_TO_PANDAS_LIMIT) for ds in data], copy=False 83 | ) 84 | return DataSource.convert_to_series(data) 85 | 86 | @staticmethod 87 | def get_actor_shards( 88 | data: "ray.data.dataset.Dataset", actors: Sequence[ActorHandle] 89 | ) -> Tuple[Any, Optional[Dict[int, Any]]]: 90 | _assert_ray_data_available() 91 | 92 | # We do not use our assign_partitions_to_actors as assignment of splits 93 | # to actors is handled by the locality_hints argument. 94 | 95 | dataset_splits = data.split( 96 | len(actors), 97 | equal=True, 98 | locality_hints=actors, 99 | ) 100 | 101 | return None, { 102 | i: [dataset_split] for i, dataset_split in enumerate(dataset_splits) 103 | } 104 | 105 | @staticmethod 106 | def get_n(data: "ray.data.dataset.Dataset"): 107 | """ 108 | Return the number of distributed blocks. 109 | """ 110 | return data._plan.initial_num_blocks() 111 | -------------------------------------------------------------------------------- /xgboost_ray/elastic.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import Callable, Dict, List, Optional, Tuple 3 | 4 | import ray 5 | 6 | from xgboost_ray.main import ( 7 | ENV, 8 | ActorHandle, 9 | RayParams, 10 | RayXGBoostActorAvailable, 11 | _create_actor, 12 | _PrepareActorTask, 13 | _TrainingState, 14 | logger, 15 | ) 16 | from xgboost_ray.matrix import RayDMatrix 17 | 18 | 19 | def _maybe_schedule_new_actors( 20 | training_state: _TrainingState, 21 | num_cpus_per_actor: int, 22 | num_gpus_per_actor: int, 23 | resources_per_actor: Optional[Dict], 24 | ray_params: RayParams, 25 | load_data: List[RayDMatrix], 26 | ) -> bool: 27 | """Schedule new actors for elastic training if resources are available. 28 | 29 | Potentially starts new actors and triggers data loading.""" 30 | 31 | # This is only enabled for elastic training; see the usage sketch below.
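    # Usage sketch (illustrative; assumes the ``RayParams`` fields used
    # below -- see ``RayParams`` in main.py for authoritative names and
    # defaults):
    #
    #     from xgboost_ray import RayParams, train
    #
    #     ray_params = RayParams(
    #         num_actors=4,
    #         elastic_training=True,  # keep training while actors are down
    #         max_actor_restarts=2,
    #     )
    #     bst = train(params, dtrain, ray_params=ray_params)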
32 | if not ray_params.elastic_training: 33 | return False 34 | 35 | missing_actor_ranks = [ 36 | rank 37 | for rank, actor in enumerate(training_state.actors) 38 | if actor is None and rank not in training_state.pending_actors 39 | ] 40 | 41 | # If all actors are alive, there is nothing to do. 42 | if not missing_actor_ranks: 43 | return False 44 | 45 | now = time.time() 46 | 47 | # Check periodically every n seconds. 48 | if ( 49 | now 50 | < training_state.last_resource_check_at + ENV.ELASTIC_RESTART_RESOURCE_CHECK_S 51 | ): 52 | return False 53 | 54 | training_state.last_resource_check_at = now 55 | 56 | new_pending_actors: Dict[int, Tuple[ActorHandle, _PrepareActorTask]] = {} 57 | for rank in missing_actor_ranks: 58 | # Actor rank should not be already pending 59 | if rank in training_state.pending_actors or rank in new_pending_actors: 60 | continue 61 | 62 | # Try to schedule this actor 63 | actor = _create_actor( 64 | rank=rank, 65 | num_actors=ray_params.num_actors, 66 | num_cpus_per_actor=num_cpus_per_actor, 67 | num_gpus_per_actor=num_gpus_per_actor, 68 | resources_per_actor=resources_per_actor, 69 | placement_group=training_state.placement_group, 70 | queue=training_state.queue, 71 | checkpoint_frequency=ray_params.checkpoint_frequency, 72 | distributed_callbacks=ray_params.distributed_callbacks, 73 | ) 74 | 75 | task = _PrepareActorTask( 76 | actor, 77 | queue=training_state.queue, 78 | stop_event=training_state.stop_event, 79 | load_data=load_data, 80 | ) 81 | 82 | new_pending_actors[rank] = (actor, task) 83 | logger.debug( 84 | f"Re-scheduled actor with rank {rank}. Waiting for " 85 | f"placement and data loading before promoting it " 86 | f"to training." 87 | ) 88 | if new_pending_actors: 89 | training_state.pending_actors.update(new_pending_actors) 90 | logger.info( 91 | f"Re-scheduled {len(new_pending_actors)} actors for " 92 | f"training. Once data loading finished, they will be " 93 | f"integrated into training again." 94 | ) 95 | return bool(new_pending_actors) 96 | 97 | 98 | def _update_scheduled_actor_states(training_state: _TrainingState): 99 | """Update status of scheduled actors in elastic training. 100 | 101 | If actors finished their preparation tasks, promote them to 102 | proper training actors (set the `training_state.actors` entry). 103 | 104 | Also schedule a `RayXGBoostActorAvailable` exception so that training 105 | is restarted with the new actors. 106 | 107 | """ 108 | now = time.time() 109 | actor_became_ready = False 110 | 111 | # Wrap in list so we can alter the `training_state.pending_actors` dict 112 | for rank in list(training_state.pending_actors.keys()): 113 | actor, task = training_state.pending_actors[rank] 114 | if task.is_ready(): 115 | # Promote to proper actor 116 | training_state.actors[rank] = actor 117 | del training_state.pending_actors[rank] 118 | actor_became_ready = True 119 | 120 | if actor_became_ready: 121 | if not training_state.pending_actors: 122 | # No other actors are pending, so let's restart right away. 123 | training_state.restart_training_at = now - 1.0 124 | 125 | # If an actor became ready but other actors are pending, we wait 126 | # for n seconds before restarting, as chances are that they become 127 | # ready as well (e.g. if a large node came up). 128 | grace_period = ENV.ELASTIC_RESTART_GRACE_PERIOD_S 129 | if training_state.restart_training_at is None: 130 | logger.debug( 131 | f"A RayXGBoostActor became ready for training. Waiting " 132 | f"{grace_period} seconds before triggering training restart." 
133 | ) 134 | training_state.restart_training_at = now + grace_period 135 | 136 | if training_state.restart_training_at is not None: 137 | if now > training_state.restart_training_at: 138 | training_state.restart_training_at = None 139 | raise RayXGBoostActorAvailable( 140 | "A new RayXGBoostActor became available for training. " 141 | "Triggering restart." 142 | ) 143 | 144 | 145 | def _get_actor_alive_status( 146 | actors: List[ActorHandle], callback: Callable[[ActorHandle], None] 147 | ): 148 | """Loop through all actors. Invoke a callback on dead actors.""" 149 | obj_to_rank = {} 150 | 151 | alive = 0 152 | dead = 0 153 | 154 | for rank, actor in enumerate(actors): 155 | if actor is None: 156 | dead += 1 157 | continue 158 | obj = actor.pid.remote() 159 | obj_to_rank[obj] = rank 160 | 161 | not_ready = list(obj_to_rank.keys()) 162 | while not_ready: 163 | ready, not_ready = ray.wait(not_ready, timeout=0) 164 | 165 | for obj in ready: 166 | try: 167 | pid = ray.get(obj) 168 | rank = obj_to_rank[obj] 169 | logger.debug(f"Actor {actors[rank]} with PID {pid} is alive.") 170 | alive += 1 171 | except Exception: 172 | rank = obj_to_rank[obj] 173 | logger.debug(f"Actor {actors[rank]} is _not_ alive.") 174 | dead += 1 175 | callback(actors[rank]) 176 | logger.info(f"Actor status: {alive} alive, {dead} dead " f"({alive+dead} total)") 177 | 178 | return alive, dead 179 | -------------------------------------------------------------------------------- /xgboost_ray/examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ray-project/xgboost_ray/e9049256575e5bdd956b369cf86e94a298d11048/xgboost_ray/examples/__init__.py -------------------------------------------------------------------------------- /xgboost_ray/examples/create_test_data.py: -------------------------------------------------------------------------------- 1 | from xgboost_ray.tests.utils import create_parquet 2 | 3 | 4 | def main(): 5 | create_parquet( 6 | "example.parquet", 7 | num_rows=1_000_000, 8 | num_partitions=100, 9 | num_features=8, 10 | num_classes=2, 11 | ) 12 | 13 | 14 | if __name__ == "__main__": 15 | main() 16 | -------------------------------------------------------------------------------- /xgboost_ray/examples/higgs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | from xgboost_ray import RayDMatrix, RayParams, train 5 | 6 | FILENAME_CSV = "HIGGS.csv.gz" 7 | 8 | 9 | def download_higgs(target_file): 10 | url = ( 11 | "https://archive.ics.uci.edu/ml/machine-learning-databases/" 12 | "00280/HIGGS.csv.gz" 13 | ) 14 | 15 | try: 16 | import urllib.request 17 | except ImportError as e: 18 | raise ValueError( 19 | f"Automatic downloading of the HIGGS dataset requires `urllib`." 20 | f"\nFIX THIS by running `pip install urllib` or manually " 21 | f"downloading the dataset from {url}." 22 | ) from e 23 | 24 | print(f"Downloading HIGGS dataset to {target_file}") 25 | urllib.request.urlretrieve(url, target_file) 26 | return os.path.exists(target_file) 27 | 28 | 29 | def main(): 30 | # Example adapted from this blog post: 31 | # https://medium.com/rapids-ai/a-new-official-dask-api-for-xgboost-e8b10f3d1eb7 32 | # This uses the HIGGS dataset. Download here: 33 | # https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz 34 | 35 | if not os.path.exists(FILENAME_CSV): 36 | assert download_higgs(FILENAME_CSV), "Downloading of HIGGS dataset failed." 
37 | print("HIGGS dataset downloaded.") 38 | else: 39 | print("HIGGS dataset found locally.") 40 | 41 | colnames = ["label"] + ["feature-%02d" % i for i in range(1, 29)] 42 | 43 | dtrain = RayDMatrix(os.path.abspath(FILENAME_CSV), label="label", names=colnames) 44 | 45 | config = { 46 | "tree_method": "hist", 47 | "eval_metric": ["logloss", "error"], 48 | } 49 | 50 | evals_result = {} 51 | 52 | start = time.time() 53 | bst = train( 54 | config, 55 | dtrain, 56 | evals_result=evals_result, 57 | ray_params=RayParams(max_actor_restarts=1, num_actors=1), 58 | num_boost_round=100, 59 | evals=[(dtrain, "train")], 60 | ) 61 | taken = time.time() - start 62 | print(f"TRAIN TIME TAKEN: {taken:.2f} seconds") 63 | 64 | bst.save_model("higgs.xgb") 65 | print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) 66 | 67 | 68 | if __name__ == "__main__": 69 | import ray 70 | 71 | ray.init() 72 | 73 | start = time.time() 74 | main() 75 | taken = time.time() - start 76 | print(f"TOTAL TIME TAKEN: {taken:.2f} seconds") 77 | -------------------------------------------------------------------------------- /xgboost_ray/examples/higgs_parquet.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | import pandas as pd 5 | import pyarrow as pa 6 | import pyarrow.parquet as pq 7 | from higgs import download_higgs 8 | 9 | from xgboost_ray import RayDMatrix, RayParams, train 10 | 11 | FILENAME_CSV = "HIGGS.csv.gz" 12 | FILENAME_PARQUET = "HIGGS.parquet" 13 | 14 | 15 | def csv_to_parquet(in_file, out_file, chunksize=100_000, **csv_kwargs): 16 | if os.path.exists(out_file): 17 | return False 18 | 19 | print(f"Converting CSV {in_file} to PARQUET {out_file}") 20 | csv_stream = pd.read_csv( 21 | in_file, sep=",", chunksize=chunksize, low_memory=False, **csv_kwargs 22 | ) 23 | 24 | parquet_schema = None 25 | parquet_writer = None 26 | for i, chunk in enumerate(csv_stream): 27 | print("Chunk", i) 28 | if not parquet_schema: 29 | # Guess the schema of the CSV file from the first chunk 30 | parquet_schema = pa.Table.from_pandas(df=chunk).schema 31 | # Open a Parquet file for writing 32 | parquet_writer = pq.ParquetWriter( 33 | out_file, parquet_schema, compression="snappy" 34 | ) 35 | # Write CSV chunk to the parquet file 36 | table = pa.Table.from_pandas(chunk, schema=parquet_schema) 37 | parquet_writer.write_table(table) 38 | 39 | parquet_writer.close() 40 | return True 41 | 42 | 43 | def main(): 44 | # Example adapted from this blog post: 45 | # https://medium.com/rapids-ai/a-new-official-dask-api-for-xgboost-e8b10f3d1eb7 46 | # This uses the HIGGS dataset. 
Download here: 47 | # https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz 48 | 49 | if not os.path.exists(FILENAME_PARQUET): 50 | if not os.path.exists(FILENAME_CSV): 51 | download_higgs(FILENAME_CSV) 52 | print("Downloaded HIGGS csv dataset") 53 | print("Converting HIGGS csv dataset to parquet") 54 | csv_to_parquet( 55 | FILENAME_CSV, 56 | FILENAME_PARQUET, 57 | names=[ 58 | "label", 59 | "feature-01", 60 | "feature-02", 61 | "feature-03", 62 | "feature-04", 63 | "feature-05", 64 | "feature-06", 65 | "feature-07", 66 | "feature-08", 67 | "feature-09", 68 | "feature-10", 69 | "feature-11", 70 | "feature-12", 71 | "feature-13", 72 | "feature-14", 73 | "feature-15", 74 | "feature-16", 75 | "feature-17", 76 | "feature-18", 77 | "feature-19", 78 | "feature-20", 79 | "feature-21", 80 | "feature-22", 81 | "feature-23", 82 | "feature-24", 83 | "feature-25", 84 | "feature-26", 85 | "feature-27", 86 | "feature-28", 87 | ], 88 | ) 89 | 90 | colnames = ["label"] + ["feature-%02d" % i for i in range(1, 29)] 91 | 92 | # Here we load the Parquet file 93 | dtrain = RayDMatrix( 94 | os.path.abspath(FILENAME_PARQUET), label="label", columns=colnames 95 | ) 96 | 97 | config = { 98 | "tree_method": "hist", 99 | "eval_metric": ["logloss", "error"], 100 | } 101 | 102 | evals_result = {} 103 | 104 | start = time.time() 105 | bst = train( 106 | config, 107 | dtrain, 108 | evals_result=evals_result, 109 | ray_params=RayParams(max_actor_restarts=1, num_actors=1), 110 | num_boost_round=100, 111 | evals=[(dtrain, "train")], 112 | ) 113 | taken = time.time() - start 114 | print(f"TRAIN TIME TAKEN: {taken:.2f} seconds") 115 | 116 | bst.save_model("higgs.xgb") 117 | print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) 118 | 119 | 120 | if __name__ == "__main__": 121 | import ray 122 | 123 | ray.init() 124 | 125 | start = time.time() 126 | main() 127 | taken = time.time() - start 128 | print(f"TOTAL TIME TAKEN: {taken:.2f} seconds") 129 | -------------------------------------------------------------------------------- /xgboost_ray/examples/readme.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa E501 2 | 3 | 4 | def readme_simple(): 5 | from sklearn.datasets import load_breast_cancer 6 | 7 | from xgboost_ray import RayDMatrix, RayParams, train 8 | 9 | train_x, train_y = load_breast_cancer(return_X_y=True) 10 | train_set = RayDMatrix(train_x, train_y) 11 | 12 | evals_result = {} 13 | bst = train( 14 | { 15 | "objective": "binary:logistic", 16 | "eval_metric": ["logloss", "error"], 17 | }, 18 | train_set, 19 | evals_result=evals_result, 20 | evals=[(train_set, "train")], 21 | verbose_eval=False, 22 | ray_params=RayParams(num_actors=2, cpus_per_actor=1), 23 | ) 24 | 25 | bst.save_model("model.xgb") 26 | print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) 27 | 28 | 29 | def readme_predict(): 30 | import xgboost as xgb 31 | from sklearn.datasets import load_breast_cancer 32 | 33 | from xgboost_ray import RayDMatrix, RayParams, predict 34 | 35 | data, labels = load_breast_cancer(return_X_y=True) 36 | 37 | dpred = RayDMatrix(data, labels) 38 | 39 | bst = xgb.Booster(model_file="model.xgb") 40 | pred_ray = predict(bst, dpred, ray_params=RayParams(num_actors=2)) 41 | 42 | print(pred_ray) 43 | 44 | 45 | def readme_tune(): 46 | from sklearn.datasets import load_breast_cancer 47 | 48 | from xgboost_ray import RayDMatrix, RayParams, train 49 | 50 | num_actors = 4 51 | num_cpus_per_actor = 1 52 | 53 
| ray_params = RayParams(num_actors=num_actors, cpus_per_actor=num_cpus_per_actor) 54 | 55 | def train_model(config): 56 | train_x, train_y = load_breast_cancer(return_X_y=True) 57 | train_set = RayDMatrix(train_x, train_y) 58 | 59 | evals_result = {} 60 | bst = train( 61 | params=config, 62 | dtrain=train_set, 63 | evals_result=evals_result, 64 | evals=[(train_set, "train")], 65 | verbose_eval=False, 66 | ray_params=ray_params, 67 | ) 68 | bst.save_model("model.xgb") 69 | 70 | from ray import tune 71 | 72 | # Specify the hyperparameter search space. 73 | config = { 74 | "tree_method": "approx", 75 | "objective": "binary:logistic", 76 | "eval_metric": ["logloss", "error"], 77 | "eta": tune.loguniform(1e-4, 1e-1), 78 | "subsample": tune.uniform(0.5, 1.0), 79 | "max_depth": tune.randint(1, 9), 80 | } 81 | 82 | # Make sure to use the `get_tune_resources` method to set the `resources_per_trial`. 83 | analysis = tune.run( 84 | train_model, 85 | config=config, 86 | metric="train-error", 87 | mode="min", 88 | num_samples=4, 89 | resources_per_trial=ray_params.get_tune_resources(), 90 | ) 91 | print("Best hyperparameters", analysis.best_config) 92 | 93 | 94 | if __name__ == "__main__": 95 | import ray 96 | 97 | ray.init(num_cpus=5) 98 | 99 | print("Readme: Simple example") 100 | readme_simple() 101 | readme_predict() 102 | try: 103 | print("Readme: Ray Tune example") 104 | readme_tune() 105 | except ImportError: 106 | print("Ray Tune not installed.") 107 | -------------------------------------------------------------------------------- /xgboost_ray/examples/readme_sklearn_api.py: -------------------------------------------------------------------------------- 1 | def readme_sklearn_api(): 2 | from sklearn.datasets import load_breast_cancer 3 | from sklearn.model_selection import train_test_split 4 | 5 | from xgboost_ray import RayParams, RayXGBClassifier 6 | 7 | seed = 42 8 | 9 | X, y = load_breast_cancer(return_X_y=True) 10 | X_train, X_test, y_train, y_test = train_test_split( 11 | X, y, train_size=0.25, random_state=42 12 | ) 13 | 14 | clf = RayXGBClassifier( 15 | n_jobs=4, random_state=seed  # In XGBoost-Ray, n_jobs sets the number of actors 16 | ) 17 | 18 | # The scikit-learn API will automatically convert the data 19 | # to RayDMatrix format as needed. 20 | # You can also pass X as a RayDMatrix, in which case 21 | # y will be ignored.
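    # For instance (illustrative sketch; assumes `RayDMatrix` is imported
    # from xgboost_ray and that `None` labels are acceptable here):
    #
    #     dtrain = RayDMatrix(X_train, y_train)
    #     clf.fit(dtrain, None)  # labels are taken from the RayDMatrix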
22 | 23 | clf.fit(X_train, y_train) 24 | 25 | pred_ray = clf.predict(X_test) 26 | print(pred_ray) 27 | 28 | pred_proba_ray = clf.predict_proba(X_test) 29 | print(pred_proba_ray) 30 | 31 | # It is also possible to pass a RayParams object 32 | # to fit/predict/predict_proba methods - will override 33 | # n_jobs set during initialization 34 | 35 | clf.fit(X_train, y_train, ray_params=RayParams(num_actors=2)) 36 | 37 | pred_ray = clf.predict(X_test, ray_params=RayParams(num_actors=2)) 38 | print(pred_ray) 39 | 40 | 41 | if __name__ == "__main__": 42 | import ray 43 | 44 | ray.init(num_cpus=5) 45 | 46 | print("Readme: scikit-learn API example") 47 | readme_sklearn_api() 48 | -------------------------------------------------------------------------------- /xgboost_ray/examples/simple.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import ray 4 | from sklearn import datasets 5 | from sklearn.model_selection import train_test_split 6 | 7 | from xgboost_ray import RayDMatrix, RayParams, train 8 | 9 | 10 | def main(cpus_per_actor, num_actors): 11 | # Load dataset 12 | data, labels = datasets.load_breast_cancer(return_X_y=True) 13 | # Split into train and test set 14 | train_x, test_x, train_y, test_y = train_test_split(data, labels, test_size=0.25) 15 | 16 | train_set = RayDMatrix(train_x, train_y) 17 | test_set = RayDMatrix(test_x, test_y) 18 | 19 | evals_result = {} 20 | 21 | # Set XGBoost config. 22 | xgboost_params = { 23 | "tree_method": "approx", 24 | "objective": "binary:logistic", 25 | "eval_metric": ["logloss", "error"], 26 | } 27 | 28 | # Train the classifier 29 | bst = train( 30 | params=xgboost_params, 31 | dtrain=train_set, 32 | evals=[(test_set, "eval")], 33 | evals_result=evals_result, 34 | ray_params=RayParams( 35 | max_actor_restarts=0, 36 | gpus_per_actor=0, 37 | cpus_per_actor=cpus_per_actor, 38 | num_actors=num_actors, 39 | ), 40 | verbose_eval=False, 41 | num_boost_round=10, 42 | ) 43 | 44 | model_path = "simple.xgb" 45 | bst.save_model(model_path) 46 | print("Final validation error: {:.4f}".format(evals_result["eval"]["error"][-1])) 47 | 48 | 49 | if __name__ == "__main__": 50 | parser = argparse.ArgumentParser() 51 | parser.add_argument( 52 | "--address", required=False, type=str, help="the address to use for Ray" 53 | ) 54 | parser.add_argument( 55 | "--server-address", 56 | required=False, 57 | type=str, 58 | help="Address of the remote server if using Ray Client.", 59 | ) 60 | parser.add_argument( 61 | "--cpus-per-actor", 62 | type=int, 63 | default=1, 64 | help="Sets number of CPUs per xgboost training worker.", 65 | ) 66 | parser.add_argument( 67 | "--num-actors", 68 | type=int, 69 | default=4, 70 | help="Sets number of xgboost workers to use.", 71 | ) 72 | parser.add_argument("--smoke-test", action="store_true", default=False, help="gpu") 73 | 74 | args, _ = parser.parse_known_args() 75 | 76 | if args.smoke_test: 77 | ray.init(num_cpus=args.num_actors) 78 | elif args.server_address: 79 | ray.util.connect(args.server_address) 80 | else: 81 | ray.init(address=args.address) 82 | 83 | main(args.cpus_per_actor, args.num_actors) 84 | -------------------------------------------------------------------------------- /xgboost_ray/examples/simple_dask.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import ray 6 | 7 | from xgboost_ray import RayDMatrix, RayParams, train 8 | from xgboost_ray.data_sources.dask 
import DASK_INSTALLED 9 | 10 | 11 | def main(cpus_per_actor, num_actors): 12 | if not DASK_INSTALLED: 13 | print("Dask is not installed. Install with `pip install dask`") 14 | return 15 | 16 | # Local import so the installation check comes first 17 | import dask 18 | import dask.dataframe as dd 19 | from ray.util.dask import ray_dask_get 20 | 21 | dask.config.set(scheduler=ray_dask_get) 22 | 23 | # Generate dataset 24 | x = np.repeat(range(8), 16).reshape((32, 4)) 25 | # Even numbers --> 0, odd numbers --> 1 26 | y = np.tile(np.repeat(range(2), 4), 4) 27 | 28 | # Flip some bits to reduce max accuracy 29 | bits_to_flip = np.random.choice(32, size=6, replace=False) 30 | y[bits_to_flip] = 1 - y[bits_to_flip] 31 | 32 | data = pd.DataFrame(x) 33 | data["label"] = y 34 | 35 | # Split into 4 partitions 36 | dask_df = dd.from_pandas(data, npartitions=4) 37 | 38 | train_set = RayDMatrix(dask_df, "label") 39 | 40 | evals_result = {} 41 | # Set XGBoost config. 42 | xgboost_params = { 43 | "tree_method": "approx", 44 | "objective": "binary:logistic", 45 | "eval_metric": ["logloss", "error"], 46 | } 47 | 48 | # Train the classifier 49 | bst = train( 50 | params=xgboost_params, 51 | dtrain=train_set, 52 | evals=[(train_set, "train")], 53 | evals_result=evals_result, 54 | ray_params=RayParams( 55 | max_actor_restarts=0, 56 | gpus_per_actor=0, 57 | cpus_per_actor=cpus_per_actor, 58 | num_actors=num_actors, 59 | ), 60 | verbose_eval=False, 61 | num_boost_round=10, 62 | ) 63 | 64 | model_path = "dask.xgb" 65 | bst.save_model(model_path) 66 | print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) 67 | 68 | 69 | if __name__ == "__main__": 70 | parser = argparse.ArgumentParser() 71 | parser.add_argument( 72 | "--address", required=False, type=str, help="the address to use for Ray" 73 | ) 74 | parser.add_argument( 75 | "--server-address", 76 | required=False, 77 | type=str, 78 | help="Address of the remote server if using Ray Client.", 79 | ) 80 | parser.add_argument( 81 | "--cpus-per-actor", 82 | type=int, 83 | default=1, 84 | help="Sets number of CPUs per xgboost training worker.", 85 | ) 86 | parser.add_argument( 87 | "--num-actors", 88 | type=int, 89 | default=4, 90 | help="Sets number of xgboost workers to use.", 91 | ) 92 | parser.add_argument("--smoke-test", action="store_true", default=False, help="gpu") 93 | 94 | args, _ = parser.parse_known_args() 95 | 96 | if args.smoke_test: 97 | ray.init(num_cpus=args.num_actors + 1) 98 | elif args.server_address: 99 | ray.util.connect(args.server_address) 100 | else: 101 | ray.init(address=args.address) 102 | 103 | main(args.cpus_per_actor, args.num_actors) 104 | -------------------------------------------------------------------------------- /xgboost_ray/examples/simple_modin.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import ray 6 | 7 | from xgboost_ray import RayDMatrix, RayParams, train 8 | from xgboost_ray.data_sources.modin import MODIN_INSTALLED 9 | 10 | 11 | def main(cpus_per_actor, num_actors): 12 | if not MODIN_INSTALLED: 13 | print( 14 | "Modin is not installed or installed in a version that is not " 15 | "compatible with xgboost_ray (< 0.9.0)." 
16 | ) 17 | return 18 | 19 | # Import modin after initializing Ray 20 | from modin.distributed.dataframe.pandas import from_partitions 21 | 22 | # Generate dataset 23 | x = np.repeat(range(8), 16).reshape((32, 4)) 24 | # Even numbers --> 0, odd numbers --> 1 25 | y = np.tile(np.repeat(range(2), 4), 4) 26 | 27 | # Flip some bits to reduce max accuracy 28 | bits_to_flip = np.random.choice(32, size=6, replace=False) 29 | y[bits_to_flip] = 1 - y[bits_to_flip] 30 | 31 | data = pd.DataFrame(x) 32 | data["label"] = y 33 | 34 | # Split into 4 partitions 35 | partitions = [ray.put(part) for part in np.split(data, 4)] 36 | 37 | # Create modin df here 38 | modin_df = from_partitions(partitions, axis=0) 39 | 40 | train_set = RayDMatrix(modin_df, "label") 41 | 42 | evals_result = {} 43 | # Set XGBoost config. 44 | xgboost_params = { 45 | "tree_method": "approx", 46 | "objective": "binary:logistic", 47 | "eval_metric": ["logloss", "error"], 48 | } 49 | 50 | # Train the classifier 51 | bst = train( 52 | params=xgboost_params, 53 | dtrain=train_set, 54 | evals=[(train_set, "train")], 55 | evals_result=evals_result, 56 | ray_params=RayParams( 57 | max_actor_restarts=0, 58 | gpus_per_actor=0, 59 | cpus_per_actor=cpus_per_actor, 60 | num_actors=num_actors, 61 | ), 62 | verbose_eval=False, 63 | num_boost_round=10, 64 | ) 65 | 66 | model_path = "modin.xgb" 67 | bst.save_model(model_path) 68 | print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) 69 | 70 | 71 | if __name__ == "__main__": 72 | parser = argparse.ArgumentParser() 73 | parser.add_argument( 74 | "--address", required=False, type=str, help="the address to use for Ray" 75 | ) 76 | parser.add_argument( 77 | "--server-address", 78 | required=False, 79 | type=str, 80 | help="Address of the remote server if using Ray Client.", 81 | ) 82 | parser.add_argument( 83 | "--cpus-per-actor", 84 | type=int, 85 | default=1, 86 | help="Sets number of CPUs per xgboost training worker.", 87 | ) 88 | parser.add_argument( 89 | "--num-actors", 90 | type=int, 91 | default=4, 92 | help="Sets number of xgboost workers to use.", 93 | ) 94 | parser.add_argument("--smoke-test", action="store_true", default=False, help="gpu") 95 | 96 | args, _ = parser.parse_known_args() 97 | 98 | if args.smoke_test: 99 | ray.init(num_cpus=args.num_actors + 1) 100 | elif args.server_address: 101 | ray.util.connect(args.server_address) 102 | else: 103 | ray.init(address=args.address) 104 | 105 | main(args.cpus_per_actor, args.num_actors) 106 | -------------------------------------------------------------------------------- /xgboost_ray/examples/simple_objectstore.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import ray 6 | 7 | from xgboost_ray import RayDMatrix, RayParams, train 8 | 9 | 10 | def main(cpus_per_actor, num_actors): 11 | # Generate dataset 12 | x = np.repeat(range(8), 16).reshape((32, 4)) 13 | # Even numbers --> 0, odd numbers --> 1 14 | y = np.tile(np.repeat(range(2), 4), 4) 15 | 16 | # Flip some bits to reduce max accuracy 17 | bits_to_flip = np.random.choice(32, size=6, replace=False) 18 | y[bits_to_flip] = 1 - y[bits_to_flip] 19 | 20 | data = pd.DataFrame(x) 21 | data["label"] = y 22 | 23 | # Split into 4 partitions 24 | partitions = [ray.put(part) for part in np.split(data, 4)] 25 | 26 | train_set = RayDMatrix(partitions, "label") 27 | 28 | evals_result = {} 29 | # Set XGBoost config. 
30 | xgboost_params = { 31 | "tree_method": "approx", 32 | "objective": "binary:logistic", 33 | "eval_metric": ["logloss", "error"], 34 | } 35 | 36 | # Train the classifier 37 | bst = train( 38 | params=xgboost_params, 39 | dtrain=train_set, 40 | evals=[(train_set, "train")], 41 | evals_result=evals_result, 42 | ray_params=RayParams( 43 | max_actor_restarts=0, 44 | gpus_per_actor=0, 45 | cpus_per_actor=cpus_per_actor, 46 | num_actors=num_actors, 47 | ), 48 | verbose_eval=False, 49 | num_boost_round=10, 50 | ) 51 | 52 | model_path = "objectstore.xgb" 53 | bst.save_model(model_path) 54 | print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) 55 | 56 | 57 | if __name__ == "__main__": 58 | parser = argparse.ArgumentParser() 59 | parser.add_argument( 60 | "--address", required=False, type=str, help="the address to use for Ray" 61 | ) 62 | parser.add_argument( 63 | "--server-address", 64 | required=False, 65 | type=str, 66 | help="Address of the remote server if using Ray Client.", 67 | ) 68 | parser.add_argument( 69 | "--cpus-per-actor", 70 | type=int, 71 | default=1, 72 | help="Sets number of CPUs per xgboost training worker.", 73 | ) 74 | parser.add_argument( 75 | "--num-actors", 76 | type=int, 77 | default=4, 78 | help="Sets number of xgboost workers to use.", 79 | ) 80 | parser.add_argument("--smoke-test", action="store_true", default=False, help="Finish quickly for testing") 81 | 82 | args, _ = parser.parse_known_args() 83 | 84 | if args.smoke_test: 85 | ray.init(num_cpus=args.num_actors + 1) 86 | elif args.server_address: 87 | ray.util.connect(args.server_address) 88 | else: 89 | ray.init(address=args.address) 90 | 91 | main(args.cpus_per_actor, args.num_actors) 92 | -------------------------------------------------------------------------------- /xgboost_ray/examples/simple_partitioned.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import numpy as np 4 | import ray 5 | from sklearn import datasets 6 | from sklearn.model_selection import train_test_split 7 | 8 | from xgboost_ray import RayDMatrix, RayParams, train 9 | 10 | nc = 31 11 | 12 | 13 | @ray.remote 14 | class AnActor: 15 | """We mimic a distributed DF by having several actors create 16 | data which form the global DF. 17 | """ 18 | 19 | @ray.method(num_returns=2) 20 | def genData(self, rank, nranks, nrows): 21 | """Generate global dataset and cut out local piece. 22 | In real life each actor would of course directly create local data.
23 | """ 24 | # Load dataset 25 | data, labels = datasets.load_breast_cancer(return_X_y=True) 26 | # Split into train and test set 27 | train_x, _, train_y, _ = train_test_split(data, labels, test_size=0.25) 28 | train_y = train_y.reshape((train_y.shape[0], 1)) 29 | train = np.hstack([train_x, train_y]) 30 | assert nrows <= train.shape[0] 31 | assert nc == train.shape[1] 32 | sz = nrows // nranks 33 | return train[sz * rank : sz * (rank + 1)], ray.util.get_node_ip_address() 34 | 35 | 36 | class Parted: 37 | """Class exposing __partitioned__""" 38 | 39 | def __init__(self, parted): 40 | self.__partitioned__ = parted 41 | 42 | 43 | def main(cpus_per_actor, num_actors): 44 | nr = 424 45 | actors = [AnActor.remote() for _ in range(num_actors)] 46 | parts = [actors[i].genData.remote(i, num_actors, nr) for i in range(num_actors)] 47 | rowsperpart = nr // num_actors 48 | nr = rowsperpart * num_actors 49 | parted = Parted( 50 | { 51 | "shape": (nr, nc), 52 | "partition_tiling": (num_actors, 1), 53 | "get": lambda x: ray.get(x), 54 | "partitions": { 55 | (i, 0): { 56 | "start": (i * rowsperpart, 0), 57 | "shape": (rowsperpart, nc), 58 | "data": parts[i][0], 59 | "location": [ray.get(parts[i][1])], 60 | } 61 | for i in range(num_actors) 62 | }, 63 | } 64 | ) 65 | 66 | yl = nc - 1 67 | # Let's create DMatrix from our __partitioned__ structure 68 | train_set = RayDMatrix(parted, f"f{yl}") 69 | 70 | evals_result = {} 71 | # Set XGBoost config. 72 | xgboost_params = { 73 | "tree_method": "approx", 74 | "objective": "binary:logistic", 75 | "eval_metric": ["logloss", "error"], 76 | } 77 | 78 | # Train the classifier 79 | bst = train( 80 | params=xgboost_params, 81 | dtrain=train_set, 82 | evals=[(train_set, "train")], 83 | evals_result=evals_result, 84 | ray_params=RayParams( 85 | max_actor_restarts=0, 86 | gpus_per_actor=0, 87 | cpus_per_actor=cpus_per_actor, 88 | num_actors=num_actors, 89 | ), 90 | verbose_eval=False, 91 | num_boost_round=10, 92 | ) 93 | 94 | model_path = "partitioned.xgb" 95 | bst.save_model(model_path) 96 | print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) 97 | 98 | 99 | if __name__ == "__main__": 100 | parser = argparse.ArgumentParser() 101 | parser.add_argument( 102 | "--address", required=False, type=str, help="the address to use for Ray" 103 | ) 104 | parser.add_argument( 105 | "--server-address", 106 | required=False, 107 | type=str, 108 | help="Address of the remote server if using Ray Client.", 109 | ) 110 | parser.add_argument( 111 | "--cpus-per-actor", 112 | type=int, 113 | default=1, 114 | help="Sets number of CPUs per xgboost training worker.", 115 | ) 116 | parser.add_argument( 117 | "--num-actors", 118 | type=int, 119 | default=4, 120 | help="Sets number of xgboost workers to use.", 121 | ) 122 | parser.add_argument("--smoke-test", action="store_true", default=False, help="gpu") 123 | 124 | args, _ = parser.parse_known_args() 125 | 126 | if not ray.is_initialized(): 127 | if args.smoke_test: 128 | ray.init(num_cpus=args.num_actors + 1) 129 | elif args.server_address: 130 | ray.util.connect(args.server_address) 131 | else: 132 | ray.init(address=args.address) 133 | 134 | main(args.cpus_per_actor, args.num_actors) 135 | -------------------------------------------------------------------------------- /xgboost_ray/examples/simple_predict.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import xgboost as xgb 5 | from sklearn import datasets 6 | 7 | from xgboost_ray 
import RayDMatrix, RayParams, predict 8 | 9 | 10 | def main(): 11 | if not os.path.exists("simple.xgb"): 12 | raise ValueError( 13 | "Model file not found: `simple.xgb`" 14 | "\nFIX THIS by running `python simple.py` first to " 15 | "train the model." 16 | ) 17 | 18 | # Load dataset 19 | data, labels = datasets.load_breast_cancer(return_X_y=True) 20 | 21 | dmat_xgb = xgb.DMatrix(data, labels) 22 | dmat_ray = RayDMatrix(data, labels) 23 | 24 | bst = xgb.Booster(model_file="simple.xgb") 25 | 26 | pred_xgb = bst.predict(dmat_xgb) 27 | pred_ray = predict(bst, dmat_ray, ray_params=RayParams(num_actors=2)) 28 | 29 | np.testing.assert_array_equal(pred_xgb, pred_ray) 30 | print(pred_ray) 31 | 32 | 33 | if __name__ == "__main__": 34 | main() 35 | -------------------------------------------------------------------------------- /xgboost_ray/examples/simple_ray_dataset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import ray 6 | from xgboost import DMatrix 7 | 8 | from xgboost_ray import RayDMatrix, RayParams, train 9 | 10 | 11 | def main(cpus_per_actor, num_actors): 12 | np.random.seed(1234) 13 | # Generate dataset 14 | x = np.repeat(range(8), 16).reshape((32, 4)) 15 | # Even numbers --> 0, odd numbers --> 1 16 | y = np.tile(np.repeat(range(2), 4), 4) 17 | 18 | # Flip some bits to reduce max accuracy 19 | bits_to_flip = np.random.choice(32, size=6, replace=False) 20 | y[bits_to_flip] = 1 - y[bits_to_flip] 21 | 22 | data = pd.DataFrame(x) 23 | # Ray Datasets require all column names to be strings 24 | data.columns = [str(c) for c in data.columns] 25 | data["label"] = y 26 | 27 | ray_ds = ray.data.from_pandas(data) 28 | train_set = RayDMatrix(ray_ds, "label") 29 | 30 | evals_result = {} 31 | # Set XGBoost config.
32 | xgboost_params = { 33 | "tree_method": "approx", 34 | "objective": "binary:logistic", 35 | "eval_metric": ["logloss", "error"], 36 | } 37 | 38 | # Train the classifier 39 | bst = train( 40 | params=xgboost_params, 41 | dtrain=train_set, 42 | evals=[(train_set, "train")], 43 | evals_result=evals_result, 44 | ray_params=RayParams( 45 | max_actor_restarts=0, 46 | gpus_per_actor=0, 47 | cpus_per_actor=cpus_per_actor, 48 | num_actors=num_actors, 49 | ), 50 | verbose_eval=False, 51 | num_boost_round=10, 52 | ) 53 | 54 | model_path = "ray_datasets.xgb" 55 | bst.save_model(model_path) 56 | print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) 57 | 58 | # Distributed prediction 59 | scored = ray_ds.drop_columns(["label"]).map_batches( 60 | lambda batch: {"pred": bst.predict(DMatrix(batch))}, batch_format="pandas" 61 | ) 62 | print(scored.to_pandas()) 63 | 64 | 65 | if __name__ == "__main__": 66 | parser = argparse.ArgumentParser() 67 | parser.add_argument( 68 | "--address", required=False, type=str, help="the address to use for Ray" 69 | ) 70 | parser.add_argument( 71 | "--server-address", 72 | required=False, 73 | type=str, 74 | help="Address of the remote server if using Ray Client.", 75 | ) 76 | parser.add_argument( 77 | "--cpus-per-actor", 78 | type=int, 79 | default=1, 80 | help="Sets number of CPUs per xgboost training worker.", 81 | ) 82 | parser.add_argument( 83 | "--num-actors", 84 | type=int, 85 | default=4, 86 | help="Sets number of xgboost workers to use.", 87 | ) 88 | parser.add_argument("--smoke-test", action="store_true", default=False, help="gpu") 89 | 90 | args, _ = parser.parse_known_args() 91 | 92 | if args.smoke_test: 93 | ray.init(num_cpus=args.num_actors + 1) 94 | elif args.server_address: 95 | ray.util.connect(args.server_address) 96 | else: 97 | ray.init(address=args.address) 98 | 99 | main(args.cpus_per_actor, args.num_actors) 100 | -------------------------------------------------------------------------------- /xgboost_ray/examples/simple_tune.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import ray 5 | from ray import tune 6 | from sklearn import datasets 7 | from sklearn.model_selection import train_test_split 8 | 9 | import xgboost_ray 10 | from xgboost_ray import RayDMatrix, RayParams, train 11 | 12 | 13 | def train_breast_cancer(config, ray_params): 14 | # Load dataset 15 | data, labels = datasets.load_breast_cancer(return_X_y=True) 16 | # Split into train and test set 17 | train_x, test_x, train_y, test_y = train_test_split(data, labels, test_size=0.25) 18 | 19 | train_set = RayDMatrix(train_x, train_y) 20 | test_set = RayDMatrix(test_x, test_y) 21 | 22 | evals_result = {} 23 | 24 | bst = train( 25 | params=config, 26 | dtrain=train_set, 27 | evals=[(test_set, "eval")], 28 | evals_result=evals_result, 29 | ray_params=ray_params, 30 | verbose_eval=False, 31 | num_boost_round=10, 32 | ) 33 | 34 | model_path = "tuned.xgb" 35 | bst.save_model(model_path) 36 | print("Final validation error: {:.4f}".format(evals_result["eval"]["error"][-1])) 37 | 38 | 39 | def main(cpus_per_actor, num_actors, num_samples): 40 | # Set XGBoost config. 
41 | config = { 42 | "tree_method": "approx", 43 | "objective": "binary:logistic", 44 | "eval_metric": ["logloss", "error"], 45 | "eta": tune.loguniform(1e-4, 1e-1), 46 | "subsample": tune.uniform(0.5, 1.0), 47 | "max_depth": tune.randint(1, 9), 48 | } 49 | 50 | ray_params = RayParams( 51 | max_actor_restarts=1, 52 | gpus_per_actor=0, 53 | cpus_per_actor=cpus_per_actor, 54 | num_actors=num_actors, 55 | ) 56 | 57 | analysis = tune.run( 58 | tune.with_parameters(train_breast_cancer, ray_params=ray_params), 59 | # Use the `get_tune_resources` helper function to set the resources. 60 | resources_per_trial=ray_params.get_tune_resources(), 61 | config=config, 62 | num_samples=num_samples, 63 | metric="eval-error", 64 | mode="min", 65 | ) 66 | 67 | # Load the best model checkpoint. 68 | best_bst = xgboost_ray.tune.load_model( 69 | os.path.join(analysis.best_trial.local_path, "tuned.xgb") 70 | ) 71 | 72 | best_bst.save_model("best_model.xgb") 73 | 74 | accuracy = 1.0 - analysis.best_result["eval-error"] 75 | print(f"Best model parameters: {analysis.best_config}") 76 | print(f"Best model total accuracy: {accuracy:.4f}") 77 | 78 | 79 | if __name__ == "__main__": 80 | parser = argparse.ArgumentParser() 81 | parser.add_argument( 82 | "--address", required=False, type=str, help="the address to use for Ray" 83 | ) 84 | parser.add_argument( 85 | "--server-address", 86 | required=False, 87 | type=str, 88 | help="Address of the remote server if using Ray Client.", 89 | ) 90 | parser.add_argument( 91 | "--cpus-per-actor", 92 | type=int, 93 | default=1, 94 | help="Sets number of CPUs per XGBoost training worker.", 95 | ) 96 | parser.add_argument( 97 | "--num-actors", 98 | type=int, 99 | default=1, 100 | help="Sets number of XGBoost workers to use.", 101 | ) 102 | parser.add_argument( 103 | "--num-samples", type=int, default=4, help="Number of samples to use for Tune." 104 | ) 105 | parser.add_argument("--smoke-test", action="store_true", default=False) 106 | 107 | args, _ = parser.parse_known_args() 108 | 109 | if args.smoke_test: 110 | ray.init(num_cpus=args.num_actors * args.num_samples) 111 | elif args.server_address: 112 | ray.util.connect(args.server_address) 113 | else: 114 | ray.init(address=args.address) 115 | 116 | main(args.cpus_per_actor, args.num_actors, args.num_samples) 117 | -------------------------------------------------------------------------------- /xgboost_ray/examples/train_on_test_data.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import shutil 4 | import time 5 | 6 | from xgboost_ray import RayDMatrix, RayParams, train 7 | from xgboost_ray.tests.utils import create_parquet_in_tempdir 8 | 9 | #### 10 | # Run `create_test_data.py` first to create a large fake data set. 11 | # Alternatively, run with `--smoke-test` to create an ephemeral small fake 12 | # data set. 
13 | #### 14 | 15 | 16 | def main(fname, num_actors=2): 17 | dtrain = RayDMatrix(os.path.abspath(fname), label="labels", ignore=["partition"]) 18 | 19 | config = { 20 | "tree_method": "hist", 21 | "eval_metric": ["logloss", "error"], 22 | } 23 | 24 | evals_result = {} 25 | 26 | start = time.time() 27 | bst = train( 28 | config, 29 | dtrain, 30 | evals_result=evals_result, 31 | ray_params=RayParams(max_actor_restarts=1, num_actors=num_actors), 32 | num_boost_round=10, 33 | evals=[(dtrain, "train")], 34 | ) 35 | taken = time.time() - start 36 | print(f"TRAIN TIME TAKEN: {taken:.2f} seconds") 37 | 38 | bst.save_model("test_data.xgb") 39 | print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) 40 | 41 | 42 | if __name__ == "__main__": 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument( 45 | "--smoke-test", 46 | action="store_true", 47 | default=False, 48 | help="Finish quickly for testing", 49 | ) 50 | args = parser.parse_args() 51 | 52 | temp_dir, path = None, None 53 | if args.smoke_test: 54 | temp_dir, path = create_parquet_in_tempdir( 55 | "smoketest.parquet", 56 | num_rows=1_000, 57 | num_features=4, 58 | num_classes=2, 59 | num_partitions=2, 60 | ) 61 | else: 62 | path = os.path.join(os.path.dirname(__file__), "parted.parquet") 63 | 64 | import ray 65 | 66 | ray.init() 67 | 68 | start = time.time() 69 | main(path) 70 | taken = time.time() - start 71 | print(f"TOTAL TIME TAKEN: {taken:.2f} seconds") 72 | 73 | if args.smoke_test: 74 | shutil.rmtree(temp_dir) 75 | -------------------------------------------------------------------------------- /xgboost_ray/examples/train_with_ml_dataset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import shutil 4 | import time 5 | 6 | from ray.util.data import read_parquet 7 | 8 | from xgboost_ray import RayDMatrix, RayParams, train 9 | from xgboost_ray.tests.utils import create_parquet_in_tempdir 10 | 11 | #### 12 | # Run `create_test_data.py` first to create a large fake data set. 13 | # Alternatively, run with `--smoke-test` to create an ephemeral small fake 14 | # data set. 
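# Unlike `train_on_test_data.py`, this example first wraps the Parquet data
# in a Ray MLDataset via `read_parquet(..., num_shards=num_actors)`, so each
# training actor is fed exactly one shard.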
15 | #### 16 | 17 | 18 | def main(fname, num_actors=2): 19 | ml_dataset = read_parquet(fname, num_shards=num_actors) 20 | 21 | dtrain = RayDMatrix(ml_dataset, label="labels", ignore=["partition"]) 22 | 23 | config = { 24 | "tree_method": "hist", 25 | "eval_metric": ["logloss", "error"], 26 | } 27 | 28 | evals_result = {} 29 | 30 | start = time.time() 31 | bst = train( 32 | config, 33 | dtrain, 34 | evals_result=evals_result, 35 | ray_params=RayParams(max_actor_restarts=1, num_actors=num_actors), 36 | num_boost_round=10, 37 | evals=[(dtrain, "train")], 38 | ) 39 | taken = time.time() - start 40 | print(f"TRAIN TIME TAKEN: {taken:.2f} seconds") 41 | 42 | bst.save_model("test_data.xgb") 43 | print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) 44 | 45 | 46 | if __name__ == "__main__": 47 | parser = argparse.ArgumentParser() 48 | parser.add_argument( 49 | "--smoke-test", 50 | action="store_true", 51 | default=False, 52 | help="Finish quickly for testing", 53 | ) 54 | args = parser.parse_args() 55 | 56 | temp_dir, path = None, None 57 | if args.smoke_test: 58 | temp_dir, path = create_parquet_in_tempdir( 59 | "smoketest.parquet", 60 | num_rows=1_000, 61 | num_features=4, 62 | num_classes=2, 63 | num_partitions=2, 64 | ) 65 | else: 66 | path = os.path.join(os.path.dirname(__file__), "parted.parquet") 67 | 68 | import ray 69 | 70 | ray.init() 71 | 72 | start = time.time() 73 | main(path) 74 | taken = time.time() - start 75 | print(f"TOTAL TIME TAKEN: {taken:.2f} seconds") 76 | 77 | if args.smoke_test: 78 | shutil.rmtree(temp_dir) 79 | -------------------------------------------------------------------------------- /xgboost_ray/session.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from ray.util.annotations import DeveloperAPI, PublicAPI 4 | from ray.util.queue import Queue 5 | 6 | 7 | @DeveloperAPI 8 | class RayXGBoostSession: 9 | def __init__(self, rank: int, queue: Optional[Queue]): 10 | self._rank = rank 11 | self._queue = queue 12 | 13 | def get_actor_rank(self): 14 | return self._rank 15 | 16 | def set_queue(self, queue): 17 | self._queue = queue 18 | 19 | def put_queue(self, item): 20 | if self._queue is None: 21 | raise ValueError( 22 | "Trying to put something into session queue, but queue " 23 | "was not initialized. This is probably a bug, please raise " 24 | "an issue at https://github.com/ray-project/xgboost_ray" 25 | ) 26 | self._queue.put((self._rank, item)) 27 | 28 | 29 | _session = None 30 | 31 | 32 | @DeveloperAPI 33 | def init_session(*args, **kwargs): 34 | global _session 35 | if _session: 36 | raise ValueError( 37 | "Trying to initialize RayXGBoostSession twice." 38 | "\nFIX THIS by not calling `init_session()` manually." 39 | ) 40 | _session = RayXGBoostSession(*args, **kwargs) 41 | 42 | 43 | @DeveloperAPI 44 | def get_session() -> RayXGBoostSession: 45 | global _session 46 | if not _session or not isinstance(_session, RayXGBoostSession): 47 | raise ValueError( 48 | "Trying to access RayXGBoostSession from outside an XGBoost run." 49 | "\nFIX THIS by calling functions in `session.py` like " 50 | "`get_actor_rank()` only from within an XGBoost actor session."
51 | ) 52 | return _session 53 | 54 | 55 | @DeveloperAPI 56 | def set_session_queue(queue: Queue): 57 | session = get_session() 58 | session.set_queue(queue) 59 | 60 | 61 | @PublicAPI 62 | def get_actor_rank() -> int: 63 | session = get_session() 64 | return session.get_actor_rank() 65 | 66 | 67 | @PublicAPI 68 | def get_rabit_rank() -> int: 69 | import xgboost as xgb 70 | 71 | try: 72 | # From xgboost>=1.7.0, rabit is replaced by a collective communicator 73 | return xgb.collective.get_rank() 74 | except (ImportError, AttributeError): 75 | return xgb.rabit.get_rank() 76 | 77 | 78 | @PublicAPI 79 | def put_queue(*args, **kwargs): 80 | session = get_session() 81 | session.put_queue(*args, **kwargs) 82 | -------------------------------------------------------------------------------- /xgboost_ray/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ray-project/xgboost_ray/e9049256575e5bdd956b369cf86e94a298d11048/xgboost_ray/tests/__init__.py -------------------------------------------------------------------------------- /xgboost_ray/tests/conftest.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | from functools import partial 3 | 4 | import pytest 5 | import ray 6 | 7 | try: 8 | # Ray 1.3+ 9 | from ray._private.cluster_utils import Cluster 10 | except ImportError: 11 | from ray.cluster_utils import Cluster 12 | 13 | 14 | def get_default_fixture_system_config(): 15 | system_config = { 16 | "object_timeout_milliseconds": 200, 17 | "health_check_initial_delay_ms": 0, 18 | "health_check_failure_threshold": 10, 19 | "object_store_full_delay_ms": 100, 20 | } 21 | return system_config 22 | 23 | 24 | def get_default_fixture_ray_kwargs(): 25 | system_config = get_default_fixture_system_config() 26 | ray_kwargs = { 27 | "num_cpus": 1, 28 | "object_store_memory": 150 * 1024 * 1024, 29 | "dashboard_port": None, 30 | "namespace": "default_test_namespace", 31 | "_system_config": system_config, 32 | } 33 | return ray_kwargs 34 | 35 | 36 | @contextmanager 37 | def _ray_start_cluster(**kwargs): 38 | init_kwargs = get_default_fixture_ray_kwargs() 39 | num_nodes = 0 40 | do_init = False 41 | # num_nodes & do_init are not arguments for ray.init, so delete them. 42 | if "num_nodes" in kwargs: 43 | num_nodes = kwargs["num_nodes"] 44 | del kwargs["num_nodes"] 45 | if "do_init" in kwargs: 46 | do_init = kwargs["do_init"] 47 | del kwargs["do_init"] 48 | elif num_nodes > 0: 49 | do_init = True 50 | init_kwargs.update(kwargs) 51 | cluster = Cluster() 52 | remote_nodes = [] 53 | for i in range(num_nodes): 54 | if i > 0 and "_system_config" in init_kwargs: 55 | del init_kwargs["_system_config"] 56 | remote_nodes.append(cluster.add_node(**init_kwargs)) 57 | # We assume the driver will connect to the head (first node), 58 | # so ray.init will be invoked if do_init is true 59 | if len(remote_nodes) == 1 and do_init: 60 | ray.init(address=cluster.address) 61 | yield cluster 62 | # The code after the yield will run as teardown code. 63 | ray.shutdown() 64 | cluster.shutdown() 65 | 66 | 67 | # This fixture will start a cluster with empty nodes.
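# A minimal usage sketch, mirroring `test_colocation.py` below: decorate a
# unittest class with `@pytest.mark.usefixtures("ray_start_cluster")`, then
# open the cluster inside the test itself:
#
#     with self.ray_start_cluster() as cluster:
#         cluster.add_node(num_cpus=2)
#         cluster.wait_for_nodes()
#         ray.init(address=cluster.address)
#
# Teardown (`ray.shutdown()` and `cluster.shutdown()`) runs automatically
# after the `yield` in `_ray_start_cluster` above.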
68 | @pytest.fixture(scope="function") 69 | def ray_start_cluster(request): 70 | param = getattr(request, "param", {}) 71 | request.cls.ray_start_cluster = partial(_ray_start_cluster, **param) 72 | -------------------------------------------------------------------------------- /xgboost_ray/tests/env_info.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # shellcheck disable=SC2005 3 | 4 | echo "Test environment information" 5 | echo "----------------------------" 6 | echo "Python version: $(python --version 2>/dev/null || echo 'Python not installed')" 7 | echo "Ray version: $(ray --version 2>/dev/null || echo 'Ray not installed')" 8 | echo "Installed pip packages:" 9 | echo "$(python -m pip freeze 2>/dev/null || echo 'Pip not installed')" 10 | echo "----------------------------" 11 | -------------------------------------------------------------------------------- /xgboost_ray/tests/fault_tolerance.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from collections import defaultdict 4 | from typing import Dict, Set, Tuple 5 | 6 | import ray 7 | from ray.actor import ActorHandle 8 | 9 | from xgboost_ray.callback import DistributedCallback 10 | from xgboost_ray.compat import TrainingCallback 11 | from xgboost_ray.session import get_actor_rank 12 | 13 | 14 | @ray.remote(num_cpus=0) 15 | class FaultToleranceManager: 16 | def __init__(self, start_boost_round: int = 0): 17 | self.global_boost_round = start_boost_round 18 | 19 | # Dict from boost_round -> actor ranks to die 20 | self.scheduled_kill: Dict[int, Set[int]] = defaultdict(set) 21 | 22 | # Dict from actor rank -> starts/ends of boost rounds to sleep 23 | self.delayed_return: Dict[int, Set[Tuple[int, int]]] = defaultdict(set) 24 | 25 | # List of tuples (global_boost_round, actor_boost_round) to log 26 | # actor iterations 27 | self.training_logs = defaultdict(list) 28 | 29 | def schedule_kill(self, rank: int, boost_round: int): 30 | """Kill an actor when reaching this global boost round""" 31 | self.scheduled_kill[boost_round].add(rank) 32 | 33 | def delay_return(self, rank: int, start_boost_round: int, end_boost_round: int): 34 | """Do not allow an actor to finish data loading between these rounds""" 35 | self.delayed_return[rank].add((start_boost_round, end_boost_round)) 36 | 37 | def inc_boost_round(self, rank: int): 38 | """Increase global boosting round""" 39 | if rank == 0: 40 | self.global_boost_round += 1 41 | 42 | def log_iteration(self, rank: int, boost_round: int): 43 | """Log iteration""" 44 | self.training_logs[rank].append((self.global_boost_round, boost_round)) 45 | 46 | def should_die(self, rank: int): 47 | """Returns True if the actor should terminate the training job now.""" 48 | die = False 49 | for round in range(self.global_boost_round + 1): 50 | # Loop through all rounds until now to deal with race conditions 51 | if rank in self.scheduled_kill[round]: 52 | self.scheduled_kill[round].remove(rank) 53 | die = True 54 | return die 55 | 56 | def should_sleep(self, rank: int): 57 | """Returns True if the actor should not finish data loading, yet.""" 58 | if self.delayed_return[rank]: 59 | for start, end in self.delayed_return[rank]: 60 | if start <= self.global_boost_round < end: 61 | return True 62 | return False 63 | 64 | def get_logs(self): 65 | return self.training_logs 66 | 67 | 68 | class DelayedLoadingCallback(DistributedCallback): 69 | """Used to control when actors return to training""" 70 
| 71 | def __init__(self, ft_manager: ActorHandle, reload_data=True, sleep_time=0.5): 72 | self.ft_manager = ft_manager 73 | self.reload_data = reload_data 74 | self.sleep_time = sleep_time 75 | 76 | def after_data_loading(self, actor, data, *args, **kwargs): 77 | print(f"Rank {actor.rank} - after load") 78 | while ray.get(self.ft_manager.should_sleep.remote(actor.rank)): 79 | time.sleep(self.sleep_time) 80 | print(f"Rank {actor.rank} - returning now") 81 | 82 | 83 | class DieCallback(TrainingCallback): 84 | """Used to control when actors should die during training. 85 | 86 | Also can add delay to each boosting round. 87 | """ 88 | 89 | def __init__(self, ft_manager: ActorHandle, training_delay: float = 0): 90 | self.ft_manager = ft_manager 91 | self.training_delay = training_delay 92 | super(DieCallback, self).__init__() 93 | 94 | def before_iteration(self, model, epoch, evals_log): 95 | if ray.get(self.ft_manager.should_die.remote(get_actor_rank())): 96 | pid = os.getpid() 97 | print(f"Killing process: {pid}") 98 | print(f"Rank {get_actor_rank()} will now die.") 99 | time.sleep(1) 100 | os.kill(pid, 9) 101 | time.sleep(10) # Don't continue training, just die 102 | 103 | def after_iteration(self, model, epoch, evals_log): 104 | # ray.get to make sure this is up to date in the next iteration 105 | ray.get(self.ft_manager.log_iteration.remote(get_actor_rank(), epoch)) 106 | if self.training_delay > 0: 107 | time.sleep(self.training_delay) 108 | if get_actor_rank() == 0: 109 | ray.get(self.ft_manager.inc_boost_round.remote(get_actor_rank())) 110 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/benchmark_cpu_gpu.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import os 4 | import shutil 5 | import time 6 | 7 | import ray 8 | 9 | from xgboost_ray import ( 10 | RayDeviceQuantileDMatrix, 11 | RayDMatrix, 12 | RayFileType, 13 | RayParams, 14 | train, 15 | ) 16 | from xgboost_ray.tests.utils import create_parquet_in_tempdir 17 | 18 | if "OMP_NUM_THREADS" in os.environ: 19 | del os.environ["OMP_NUM_THREADS"] 20 | 21 | 22 | def train_ray( 23 | path, 24 | num_workers, 25 | num_boost_rounds, 26 | num_files=0, 27 | regression=False, 28 | use_gpu=False, 29 | smoke_test=False, 30 | ray_params=None, 31 | xgboost_params=None, 32 | **kwargs, 33 | ): 34 | if num_files: 35 | files = sorted(glob.glob(f"{path}/**/*.parquet")) 36 | while num_files > len(files): 37 | files = files + files 38 | path = files[0:num_files] 39 | 40 | use_device_matrix = False 41 | if use_gpu: 42 | try: 43 | import cupy # noqa: F401 44 | 45 | use_device_matrix = True 46 | except ImportError: 47 | use_device_matrix = False 48 | 49 | if use_device_matrix: 50 | dtrain = RayDeviceQuantileDMatrix( 51 | path, 52 | num_actors=num_workers, 53 | label="labels", 54 | ignore=["partition"], 55 | filetype=RayFileType.PARQUET, 56 | ) 57 | else: 58 | dtrain = RayDMatrix( 59 | path, 60 | num_actors=num_workers, 61 | label="labels", 62 | ignore=["partition"], 63 | filetype=RayFileType.PARQUET, 64 | ) 65 | 66 | config = xgboost_params or {"tree_method": "hist" if not use_gpu else "gpu_hist"} 67 | if not regression: 68 | # Classification 69 | config.update( 70 | { 71 | "objective": "binary:logistic", 72 | "eval_metric": ["logloss", "error"], 73 | } 74 | ) 75 | else: 76 | # Regression 77 | config.update( 78 | { 79 | "objective": "reg:squarederror", 80 | "eval_metric": ["logloss", "rmse"], 81 | } 82 | ) 83 | 84 | start = 
time.time() 85 | evals_result = {} 86 | bst = train( 87 | config, 88 | dtrain, 89 | evals_result=evals_result, 90 | num_boost_round=num_boost_rounds, 91 | ray_params=ray_params 92 | or RayParams( 93 | max_actor_restarts=2, 94 | num_actors=num_workers, 95 | cpus_per_actor=4 if not smoke_test else 1, 96 | gpus_per_actor=0 if not use_gpu else 1, 97 | ), 98 | evals=[(dtrain, "train")], 99 | **kwargs, 100 | ) 101 | taken = time.time() - start 102 | print(f"TRAIN TIME TAKEN: {taken:.2f} seconds") 103 | 104 | bst.save_model("benchmark_{}.xgb".format("cpu" if not use_gpu else "gpu")) 105 | print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) 106 | return bst, taken 107 | 108 | 109 | if __name__ == "__main__": 110 | parser = argparse.ArgumentParser(description="Process some integers.") 111 | 112 | parser.add_argument("num_workers", type=int, help="num workers") 113 | parser.add_argument("num_rounds", type=int, help="num boost rounds") 114 | parser.add_argument("num_files", type=int, help="num files") 115 | 116 | parser.add_argument( 117 | "--file", default="/data/parted.parquet", type=str, help="data file" 118 | ) 119 | 120 | parser.add_argument( 121 | "--regression", action="store_true", default=False, help="regression" 122 | ) 123 | 124 | parser.add_argument("--gpu", action="store_true", default=False, help="gpu") 125 | 126 | parser.add_argument( 127 | "--smoke-test", action="store_true", default=False, help="smoke test" 128 | ) 129 | 130 | args = parser.parse_args() 131 | 132 | num_workers = args.num_workers 133 | num_boost_rounds = args.num_rounds 134 | num_files = args.num_files 135 | use_gpu = args.gpu 136 | 137 | temp_dir = None 138 | if args.smoke_test: 139 | temp_dir, path = create_parquet_in_tempdir( 140 | filename="smoketest.parquet", 141 | num_rows=args.num_workers * 500, 142 | num_features=4, 143 | num_classes=2, 144 | num_partitions=args.num_workers * 10, 145 | ) 146 | use_gpu = False 147 | else: 148 | path = args.file 149 | if not os.path.exists(path): 150 | raise ValueError( 151 | f"Benchmarking data not found: {path}." 152 | f"\nFIX THIS by running `python create_test_data.py` first." 
153 | ) 154 | 155 | init_start = time.time() 156 | if args.smoke_test: 157 | ray.init(num_cpus=num_workers) 158 | else: 159 | ray.init(address="auto") 160 | init_taken = time.time() - init_start 161 | 162 | full_start = time.time() 163 | bst, train_taken = train_ray( 164 | path=path, 165 | num_workers=num_workers, 166 | num_boost_rounds=num_boost_rounds, 167 | num_files=num_files, 168 | regression=args.regression, 169 | use_gpu=use_gpu, 170 | smoke_test=args.smoke_test, 171 | ) 172 | full_taken = time.time() - full_start 173 | print(f"TOTAL TIME TAKEN: {full_taken:.2f} seconds " f"({init_taken:.2f} for init)") 174 | 175 | if args.smoke_test: 176 | shutil.rmtree(temp_dir, ignore_errors=True) 177 | else: 178 | with open("res.csv", "at") as fp: 179 | fp.writelines( 180 | [ 181 | ",".join( 182 | [ 183 | str(e) 184 | for e in [ 185 | num_workers, 186 | num_files, 187 | int(use_gpu), 188 | num_boost_rounds, 189 | init_taken, 190 | full_taken, 191 | train_taken, 192 | ] 193 | ] 194 | ) 195 | + "\n" 196 | ] 197 | ) 198 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/cluster_cpu.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: xgboost_ray_release_tests_cpu_{{env["NUM_WORKERS"] | default(0)}} 2 | 3 | max_workers: {{env["NUM_WORKERS"] | default(0)}} 4 | upscaling_speed: 9999 5 | 6 | idle_timeout_minutes: 15 7 | 8 | docker: 9 | image: anyscale/ray:nightly 10 | container_name: ray_container 11 | pull_before_run: true 12 | run_options: 13 | - --privileged 14 | 15 | provider: 16 | type: aws 17 | region: us-west-2 18 | availability_zone: us-west-2a 19 | cache_stopped_nodes: false 20 | 21 | available_node_types: 22 | cpu_4_ondemand: 23 | node_config: 24 | InstanceType: m5.xlarge 25 | resources: {"CPU": 4} 26 | min_workers: {{env["NUM_WORKERS"] | default(0)}} 27 | max_workers: {{env["NUM_WORKERS"] | default(0)}} 28 | 29 | auth: 30 | ssh_user: ubuntu 31 | 32 | head_node_type: cpu_4_ondemand 33 | worker_default_node_type: cpu_4_ondemand 34 | 35 | file_mounts_sync_continuously: false 36 | 37 | setup_commands: 38 | - pip install -U {{env["RAY_WHEEL"] | default("ray")}} 39 | - pip install dask pytest 40 | - pip install -U {{env["XGBOOST_RAY_PACKAGE"] | default("xgboost_ray")}} 41 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/cluster_ft.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: xgboost_ray_release_tests_ft_cluster 2 | 3 | max_workers: 9 4 | 5 | upscaling_speed: 32 6 | 7 | idle_timeout_minutes: 15 8 | 9 | docker: 10 | image: anyscale/ray:nightly 11 | container_name: ray_container 12 | pull_before_run: true 13 | 14 | provider: 15 | type: aws 16 | region: us-west-2 17 | availability_zone: us-west-2a 18 | cache_stopped_nodes: false 19 | 20 | available_node_types: 21 | cpu_16_ondemand: 22 | node_config: 23 | InstanceType: m5.4xlarge 24 | resources: {"CPU": 16} 25 | min_workers: 9 26 | max_workers: 9 27 | 28 | file_mounts: { 29 | "/release_tests": "./" 30 | } 31 | 32 | 33 | auth: 34 | ssh_user: ubuntu 35 | 36 | head_node_type: cpu_16_ondemand 37 | worker_default_node_type: cpu_16_ondemand 38 | 39 | setup_commands: 40 | - pip install -U awscli fsspec petastorm s3fs botocore 41 | - pip install -U {{env["RAY_WHEEL"] | default("ray")}} 42 | - export XGBOOST_RAY_PACKAGE="{{env["XGBOOST_RAY_PACKAGE"] | default("xgboost_ray")}}" && /bin/bash ~/xgboost_tests/setup_xgboost.sh 43 | 44 | 
file_mounts_sync_continuously: false 45 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/cluster_gpu.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: xgboost_ray_release_tests_gpu_{{env["NUM_WORKERS"] | default(0)}} 2 | 3 | max_workers: {{env["NUM_WORKERS"] | default(0)}} 4 | upscaling_speed: 9999 5 | 6 | idle_timeout_minutes: 15 7 | 8 | docker: 9 | image: anyscale/ray:nightly-gpu 10 | container_name: ray_container 11 | pull_before_run: true 12 | run_options: 13 | - --privileged 14 | 15 | provider: 16 | type: aws 17 | region: us-west-2 18 | availability_zone: us-west-2a 19 | cache_stopped_nodes: false 20 | 21 | available_node_types: 22 | gpu_4_ondemand: 23 | node_config: 24 | InstanceType: p2.xlarge 25 | resources: {"CPU": 4, "GPU": 1} 26 | min_workers: {{env["NUM_WORKERS"] | default(0)}} 27 | max_workers: {{env["NUM_WORKERS"] | default(0)}} 28 | 29 | auth: 30 | ssh_user: ubuntu 31 | 32 | head_node_type: gpu_4_ondemand 33 | worker_default_node_type: gpu_4_ondemand 34 | 35 | file_mounts: { 36 | "~/xgboost_tests": "." 37 | } 38 | 39 | file_mounts_sync_continuously: false 40 | 41 | setup_commands: 42 | - pip install -U pyarrow cupy-cuda101 43 | - pip install -U {{env["RAY_WHEEL"] | default("ray")}} 44 | - export XGBOOST_RAY_PACKAGE="{{env["XGBOOST_RAY_PACKAGE"] | default("xgboost_ray")}}" && /bin/bash ~/xgboost_tests/setup_xgboost.sh 45 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/create_learnable_data.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.datasets import make_classification, make_regression 7 | 8 | if __name__ == "__main__": 9 | if "OMP_NUM_THREADS" in os.environ: 10 | del os.environ["OMP_NUM_THREADS"] 11 | 12 | parser = argparse.ArgumentParser(description="Create fake data.") 13 | parser.add_argument("filename", type=str, default="/data/parted.parquet/") 14 | parser.add_argument( 15 | "-r", "--num-rows", required=False, type=int, default=1e8, help="num rows" 16 | ) 17 | parser.add_argument( 18 | "-p", 19 | "--num-partitions", 20 | required=False, 21 | type=int, 22 | default=100, 23 | help="num partitions", 24 | ) 25 | parser.add_argument( 26 | "-c", 27 | "--num-cols", 28 | required=False, 29 | type=int, 30 | default=4, 31 | help="num columns (features)", 32 | ) 33 | parser.add_argument( 34 | "-C", "--num-classes", required=False, type=int, default=2, help="num classes" 35 | ) 36 | parser.add_argument( 37 | "-s", "--seed", required=False, type=int, default=1234, help="random seed" 38 | ) 39 | parser.add_argument( 40 | "-T", 41 | "--target", 42 | required=False, 43 | type=float, 44 | default=0.8, 45 | help="target accuracy", 46 | ) 47 | 48 | args = parser.parse_args() 49 | 50 | seed = int(args.seed) 51 | np.random.seed(seed) 52 | 53 | num_rows = int(args.num_rows) 54 | num_cols = int(args.num_cols) 55 | num_classes = int(args.num_classes) 56 | target = float(args.target) 57 | 58 | if num_classes > 0: 59 | x, y = make_classification( 60 | n_samples=num_rows, 61 | n_features=num_cols, 62 | n_informative=num_cols // 2, 63 | n_redundant=num_cols // 10, 64 | n_repeated=0, 65 | n_classes=num_classes, 66 | n_clusters_per_class=2, 67 | flip_y=1 - target, 68 | random_state=seed, 69 | ) 70 | else: 71 | x, y = make_regression( 72 | n_samples=num_rows, 73 | n_features=num_cols, 
74 | n_informative=num_cols // 2, 75 | n_targets=1, 76 | noise=0.1, 77 | random_state=seed, 78 | ) 79 | 80 | filename = args.filename 81 | num_partitions = args.num_partitions 82 | 83 | data = pd.DataFrame(x, columns=[f"feature_{i}" for i in range(num_cols)]) 84 | 85 | rows_per_partition = len(data) // num_partitions # np.repeat requires integer repeats 86 | 87 | partition_arr = np.repeat(np.arange(num_partitions), repeats=rows_per_partition) 88 | if len(partition_arr) < len(data): 89 | # If this was not evenly divided, append 90 | missing = len(data) - len(partition_arr) 91 | partition_arr = np.append(partition_arr, np.arange(missing)) 92 | 93 | partition = pd.Series(partition_arr, copy=False, dtype=np.int32) 94 | 95 | data["labels"] = y 96 | data["partition"] = partition 97 | 98 | os.makedirs(filename, 0o755, exist_ok=True) 99 | 100 | # Write partition-wise to avoid OOM errors 101 | for i in range(num_partitions): 102 | part = data[partition_arr == i] 103 | part.to_parquet( 104 | filename, 105 | partition_cols=["partition"], 106 | engine="pyarrow", 107 | partition_filename_cb=lambda key: f"part_{key[0]}.parquet", 108 | ) 109 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/create_test_data.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import numpy as np 5 | 6 | from xgboost_ray.tests.utils import create_parquet 7 | 8 | if __name__ == "__main__": 9 | if "OMP_NUM_THREADS" in os.environ: 10 | del os.environ["OMP_NUM_THREADS"] 11 | 12 | parser = argparse.ArgumentParser(description="Create fake data.") 13 | parser.add_argument( 14 | "filename", type=str, default="/data/parted.parquet/", help="output data path" 15 | ) 16 | parser.add_argument( 17 | "-r", "--num-rows", required=False, type=int, default=1e8, help="num rows" 18 | ) 19 | parser.add_argument( 20 | "-p", 21 | "--num-partitions", 22 | required=False, 23 | type=int, 24 | default=100, 25 | help="num partitions", 26 | ) 27 | parser.add_argument( 28 | "-c", 29 | "--num-cols", 30 | required=False, 31 | type=int, 32 | default=4, 33 | help="num columns (features)", 34 | ) 35 | parser.add_argument( 36 | "-C", "--num-classes", required=False, type=int, default=2, help="num classes" 37 | ) 38 | parser.add_argument( 39 | "-s", "--seed", required=False, type=int, default=1234, help="random seed" 40 | ) 41 | 42 | args = parser.parse_args() 43 | 44 | np.random.seed(args.seed) 45 | create_parquet( 46 | args.filename, 47 | num_rows=int(args.num_rows), 48 | num_partitions=int(args.num_partitions), 49 | num_features=int(args.num_cols), 50 | num_classes=int(args.num_classes), 51 | ) 52 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/custom_objective_metric.py: -------------------------------------------------------------------------------- 1 | import ray 2 | 3 | from xgboost_ray.tests.test_xgboost_api import XGBoostAPITest 4 | 5 | 6 | class XGBoostDistributedAPITest(XGBoostAPITest): 7 | def _init_ray(self): 8 | if not ray.is_initialized(): 9 | ray.init(address="auto") 10 | 11 | 12 | if __name__ == "__main__": 13 | import sys 14 | 15 | import pytest 16 | 17 | sys.exit(pytest.main(["-v", f"{__file__}::XGBoostDistributedAPITest"])) 18 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/run_e2e_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ !
-f "./.anyscale.yaml" ]; then 4 | echo "Anyscale project not initialized. Please run 'anyscale init'" 5 | exit 1 6 | fi 7 | 8 | NOW=$(date +%s) 9 | export SESSION_NAME="xgboost_ray_ci_gpu_${NOW}" 10 | export NUM_WORKERS=3 11 | export XGBOOST_RAY_PACKAGE="git+https://github.com/ray-project/xgboost_ray.git@${GITHUB_SHA:-master}#egg=xgboost_ray" 12 | export NO_TMUX=1 13 | 14 | ./start_gpu_cluster.sh 15 | ./submit_cpu_gpu_benchmark.sh 4 100 100 --gpu --file /data/classification.parquet 16 | anyscale down "${SESSION_NAME}" 17 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/setup_xgboost.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pip install pytest 4 | # Uninstall any existing xgboost_ray repositories 5 | pip uninstall -y xgboost_ray || true 6 | 7 | # Install xgboost package 8 | pip install -U "${XGBOOST_RAY_PACKAGE:-xgboost_ray}" 9 | 10 | # Create test dataset 11 | sudo mkdir -p /data || true 12 | sudo chown ray:1000 /data || true 13 | rm -rf /data/classification.parquet || true 14 | cp -R /tmp/ray_tmp_mount/xgboost_tests ~/xgboost_tests || echo "Copy failed" 15 | python ~/xgboost_tests/create_test_data.py /data/classification.parquet --seed 1234 --num-rows 1000000 --num-cols 40 --num-partitions 100 --num-classes 2 16 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/start_cpu_cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ ! -f "./.anyscale.yaml" ]; then 4 | echo "Anyscale project not initialized. Please run 'anyscale init'" 5 | exit 1 6 | fi 7 | 8 | export XGBOOST_RAY_PACKAGE="${XGBOOST_RAY_PACKAGE:-xgboost_ray}" 9 | export NUM_WORKERS="${NUM_WORKERS:-3}" 10 | 11 | SESSION_NAME=${SESSION_NAME:-xgboost_ray_release_cpu_$(date +%s)} 12 | 13 | echo "Starting GPU cluster with ${NUM_WORKERS} worker nodes (plus the head node)" 14 | echo "This will install xgboost_ray using the following package: ${XGBOOST_RAY_PACKAGE}" 15 | 16 | CMD="anyscale up --cloud-name anyscale_default_cloud --config cluster_cpu.yaml ${SESSION_NAME}" 17 | 18 | echo "Running: ${CMD}" 19 | ${CMD} 20 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/start_ft_cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ ! -f "./.anyscale.yaml" ]; then 4 | echo "Anyscale project not initialized. Please run 'anyscale init'" 5 | exit 1 6 | fi 7 | 8 | export XGBOOST_RAY_PACKAGE="${XGBOOST_RAY_PACKAGE:-xgboost_ray}" 9 | 10 | SESSION_NAME=${SESSION_NAME:-xgboost_ray_release_ft_$(date +%s)} 11 | 12 | echo "Starting FT cluster" 13 | echo "This will install xgboost_ray using the following package: ${XGBOOST_RAY_PACKAGE}" 14 | 15 | CMD="anyscale up --cloud-name anyscale_default_cloud --config cluster_ft.yaml ${SESSION_NAME}" 16 | 17 | echo "Running: ${CMD}" 18 | ${CMD} 19 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/start_gpu_cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ ! -f "./.anyscale.yaml" ]; then 4 | echo "Anyscale project not initialized. 
Please run 'anyscale init'" 5 | exit 1 6 | fi 7 | 8 | export XGBOOST_RAY_PACKAGE="${XGBOOST_RAY_PACKAGE:-xgboost_ray}" 9 | export NUM_WORKERS="${NUM_WORKERS:-3}" 10 | 11 | SESSION_NAME=${SESSION_NAME:-xgboost_ray_release_gpu_$(date +%s)} 12 | 13 | echo "Starting GPU cluster with ${NUM_WORKERS} worker nodes (plus the head node)" 14 | echo "This will install xgboost_ray using the following package: ${XGBOOST_RAY_PACKAGE}" 15 | 16 | CMD="anyscale up --cloud-name anyscale_default_cloud --config cluster_gpu.yaml ${SESSION_NAME}" 17 | 18 | echo "Running: ${CMD}" 19 | ${CMD} 20 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/submit_cpu_gpu_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ ! -f "./.anyscale.yaml" ]; then 4 | echo "Anyscale project not initialized. Please run 'anyscale init'" 5 | exit 1 6 | fi 7 | 8 | ANYSCALE_CMD="python ~/xgboost_tests/benchmark_cpu_gpu.py $*" 9 | 10 | SESSION_STR="" 11 | if [ -n "${SESSION_NAME}" ]; then 12 | SESSION_STR="--session-name ${SESSION_NAME}" 13 | fi 14 | 15 | TMUX="--tmux" 16 | if [ "${NO_TMUX}" = "1" ]; then 17 | TMUX="" 18 | fi 19 | 20 | CMD="anyscale exec ${TMUX} ${SESSION_STR} -- ${ANYSCALE_CMD}" 21 | 22 | echo "Running: ${CMD}" 23 | ${CMD} 24 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/submit_ft_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ ! -f "./.anyscale.yaml" ]; then 4 | echo "Anyscale project not initialized. Please run 'anyscale init'" 5 | exit 1 6 | fi 7 | 8 | ANYSCALE_CMD="python ~/xgboost_tests/benchmark_ft.py $*" 9 | 10 | SESSION_STR="" 11 | if [ -n "${SESSION_NAME}" ]; then 12 | SESSION_STR="--session-name ${SESSION_NAME}" 13 | fi 14 | 15 | TMUX="--tmux" 16 | if [ "${NO_TMUX}" = "1" ]; then 17 | TMUX="" 18 | fi 19 | 20 | CMD="anyscale exec ${TMUX} ${SESSION_STR} -- ${ANYSCALE_CMD}" 21 | 22 | echo "Running: ${CMD}" 23 | ${CMD} 24 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/tune_cluster.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: xgboost_ray_release_tests_tune 2 | min_workers: 4 3 | max_workers: 4 4 | initial_workers: 4 5 | autoscaling_mode: default 6 | docker: 7 | image: "rayproject/ray:latest" 8 | container_name: ray_container 9 | pull_before_run: false 10 | run_options: 11 | - --privileged 12 | target_utilization_fraction: 0.8 13 | idle_timeout_minutes: 5 14 | provider: 15 | type: aws 16 | region: us-west-2 17 | availability_zone: us-west-2a 18 | cache_stopped_nodes: true 19 | auth: 20 | ssh_user: ubuntu 21 | head_node: 22 | InstanceType: m5.xlarge 23 | ImageId: ami-05ac7a76b4c679a79 24 | worker_nodes: 25 | InstanceType: m5.xlarge 26 | ImageId: ami-05ac7a76b4c679a79 27 | InstanceMarketOptions: 28 | MarketType: spot 29 | 30 | file_mounts: { 31 | "/release_tests": "./" 32 | } 33 | cluster_synced_files: [] 34 | file_mounts_sync_continuously: true 35 | initialization_commands: [] 36 | setup_commands: 37 | - pip install -U ray 38 | - pip install -U git+https://github.com/ray-project/xgboost_ray#egg=xgboost-ray 39 | - pip install -U git+https://github.com/amogkam/xgboost_ray.git@colocation#egg=xgboost-ray 40 | - mkdir -p /data 41 | - rm -rf /data/tune_test.parquet || true 42 | - python /release_tests/create_test_data.py 
/data/tune_test.parquet --seed 1234 --num-rows 2000 --num-cols 4 --num-partitions 40 --num-classes 2 43 | head_setup_commands: [] 44 | worker_setup_commands: [] 45 | head_start_ray_commands: 46 | - ray stop 47 | - "ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"actor_cpus\": 0}'" 48 | worker_start_ray_commands: 49 | - ray stop 50 | - "ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"actor_cpus\": 4}'" 51 | metadata: 52 | anyscale: 53 | working_dir: "/release_tests" 54 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/tune_placement.py: -------------------------------------------------------------------------------- 1 | """ 2 | NOTE: This example is currently broken (very outdated) and not run in CI. 3 | 4 | Test Ray Tune trial placement across cluster nodes. 5 | 6 | Example: Run this script on a cluster with 4 worker nodes à 4 CPUs. 7 | 8 | ray up -y tune_cluster.yaml 9 | 10 | ray attach tune_cluster.yaml 11 | 12 | python /release_tests/tune_placement.py 4 4 10 10 --fake-data 13 | 14 | This starts 4 trials à 4 actors training 10 boost rounds on 10 data 15 | partitions per actor. This will use fake data created before training. 16 | 17 | This test will then confirm that actors of the same trial are PACKed 18 | on the same nodes. In practice we check that each node IP address only 19 | hosts actors of the same Ray Tune trial. 20 | """ 21 | 22 | import argparse 23 | import json 24 | import os 25 | import shutil 26 | import tempfile 27 | import time 28 | from collections import defaultdict 29 | 30 | import ray 31 | import ray.train 32 | from benchmark_cpu_gpu import train_ray 33 | from ray import tune 34 | from ray.tune.integration.docker import DockerSyncer 35 | from ray.tune.session import get_trial_id 36 | from ray.util import get_node_ip_address 37 | 38 | from xgboost_ray import RayParams 39 | from xgboost_ray.compat import TrainingCallback 40 | from xgboost_ray.session import put_queue 41 | from xgboost_ray.tests.utils import create_parquet 42 | 43 | if "OMP_NUM_THREADS" in os.environ: 44 | del os.environ["OMP_NUM_THREADS"] 45 | 46 | 47 | class PlacementCallback(TrainingCallback): 48 | """This callback collects the Ray Tune trial ID and node IP""" 49 | 50 | def before_training(self, model): 51 | ip_address = get_node_ip_address() 52 | put_queue(ip_address) 53 | return model 54 | 55 | def after_iteration(self, model, epoch, evals_log): 56 | if epoch == 1: 57 | time.sleep(2) 58 | elif epoch == 2: 59 | time.sleep(8) 60 | 61 | 62 | def tune_test( 63 | path, 64 | num_trials, 65 | num_workers, 66 | num_boost_rounds, 67 | num_files=0, 68 | regression=False, 69 | use_gpu=False, 70 | fake_data=False, 71 | smoke_test=False, 72 | ): 73 | ray_params = RayParams( 74 | elastic_training=False, 75 | max_actor_restarts=0, 76 | num_actors=num_workers, 77 | cpus_per_actor=1, 78 | gpus_per_actor=0 if not use_gpu else 1, 79 | ) 80 | 81 | def local_train(config): 82 | temp_dir = None 83 | if fake_data or smoke_test: 84 | temp_dir = "/tmp/release_test_data" 85 | if os.path.exists(temp_dir): 86 | shutil.rmtree(temp_dir) 87 | 88 | os.makedirs(temp_dir, 0o755) 89 | local_path = os.path.join(temp_dir, "smoketest.parquet") 90 | 91 | create_parquet( 92 | filename=local_path, 93 | num_rows=args.num_workers * 500, 94 | num_features=4, 95 | num_classes=2, 96 | num_partitions=args.num_workers * 10, 97 | ) 98 | else: 99 | if
not os.path.exists(path): 100 | raise ValueError( 101 | f"Benchmarking data not found: {path}." 102 | f"\nFIX THIS by running `python create_test_data.py` " 103 | f"on all nodes first." 104 | ) 105 | local_path = path 106 | 107 | xgboost_params = { 108 | "tree_method": "hist" if not use_gpu else "gpu_hist", 109 | } 110 | 111 | xgboost_params.update( 112 | { 113 | "objective": "binary:logistic", 114 | "eval_metric": ["logloss", "error"], 115 | } 116 | ) 117 | 118 | xgboost_params.update(config) 119 | 120 | additional_results = {} 121 | 122 | bst, time_taken = train_ray( 123 | path=local_path, 124 | num_workers=num_workers, 125 | num_boost_rounds=num_boost_rounds, 126 | num_files=num_files, 127 | regression=regression, 128 | use_gpu=use_gpu, 129 | smoke_test=smoke_test, 130 | ray_params=ray_params, 131 | xgboost_params=xgboost_params, 132 | # kwargs 133 | additional_results=additional_results, 134 | callbacks=[PlacementCallback()], 135 | ) 136 | 137 | bst.save_model("tuned.xgb") 138 | 139 | trial_ips = [] 140 | for rank, ips in enumerate(additional_results["callback_returns"]): 141 | for ip in ips: 142 | trial_ips.append(ip) 143 | 144 | tune_trial = get_trial_id() 145 | with tempfile.TemporaryDirectory() as temp_checkpoint_dir: 146 | with open( 147 | os.path.join(temp_checkpoint_dir, "callback_returns.json"), "wt" 148 | ) as f: 149 | json.dump({tune_trial: trial_ips}, f) 150 | ray.train.report( 151 | {}, checkpoint=ray.train.Checkpoint.from_directory(temp_checkpoint_dir) 152 | ) 153 | 154 | if temp_dir: 155 | shutil.rmtree(temp_dir) 156 | 157 | search_space = { 158 | "eta": tune.loguniform(1e-4, 1e-1), 159 | "subsample": tune.uniform(0.5, 1.0), 160 | "max_depth": tune.randint(1, 9), 161 | } 162 | 163 | analysis = tune.run( 164 | local_train, 165 | config=search_space, 166 | num_samples=num_trials, 167 | sync_config=tune.SyncConfig(sync_to_driver=DockerSyncer), 168 | resources_per_trial=ray_params.get_tune_resources(), 169 | ) 170 | 171 | # In our PACK scheduling, we expect that each IP hosts only workers 172 | # for one Ray Tune trial. 
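    # (Each key of `ip_to_trials` is a node IP; its value collects the trial
    # IDs of all actors that ran there. With PACK placement, every such list
    # should contain a single distinct trial ID, which the loop below checks.)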
173 | ip_to_trials = defaultdict(list) 174 | for trial in analysis.trials: 175 | trial = trial 176 | with open( 177 | os.path.join(trial.checkpoint.value, "callback_returns.json"), "rt" 178 | ) as f: 179 | trial_to_ips = json.load(f) 180 | for tune_trial, ips in trial_to_ips.items(): 181 | for node_ip in ips: 182 | ip_to_trials[node_ip].append(tune_trial) 183 | 184 | fail = False 185 | for ip, trial_ids in ip_to_trials.items(): 186 | print(f"For IP {ip} got trial IDs {trial_ids}") 187 | fail = fail or any(trial_id != trial_ids[0] for trial_id in trial_ids) 188 | 189 | if fail: 190 | raise ValueError("Different trial IDs found on same node.") 191 | else: 192 | print("Success.") 193 | 194 | 195 | if __name__ == "__main__": 196 | parser = argparse.ArgumentParser(description="Test Ray Tune placement " "strategy") 197 | 198 | parser.add_argument("num_trials", type=int, help="num trials") 199 | parser.add_argument("num_workers", type=int, help="num workers (per trial)") 200 | parser.add_argument("num_rounds", type=int, help="num boost rounds") 201 | parser.add_argument("num_files", type=int, help="num files (per trial)") 202 | 203 | parser.add_argument( 204 | "--file", default="/data/parted.parquet", type=str, help="data file" 205 | ) 206 | 207 | parser.add_argument( 208 | "--regression", action="store_true", default=False, help="regression" 209 | ) 210 | 211 | parser.add_argument("--gpu", action="store_true", default=False, help="gpu") 212 | 213 | parser.add_argument( 214 | "--fake-data", action="store_true", default=False, help="fake data" 215 | ) 216 | 217 | parser.add_argument( 218 | "--smoke-test", action="store_true", default=False, help="smoke test" 219 | ) 220 | 221 | args = parser.parse_args() 222 | 223 | num_trials = args.num_trials 224 | num_workers = args.num_workers 225 | num_boost_rounds = args.num_rounds 226 | num_files = args.num_files 227 | use_gpu = args.gpu 228 | 229 | if args.smoke_test: 230 | use_gpu = False 231 | 232 | init_start = time.time() 233 | if args.smoke_test: 234 | ray.init(num_cpus=num_workers) 235 | else: 236 | ray.init(address="auto") 237 | 238 | full_start = time.time() 239 | tune_test( 240 | path=args.file, 241 | num_trials=num_trials, 242 | num_workers=num_workers, 243 | num_boost_rounds=num_boost_rounds, 244 | num_files=num_files, 245 | regression=args.regression, 246 | use_gpu=use_gpu, 247 | fake_data=args.fake_data, 248 | smoke_test=args.smoke_test, 249 | ) 250 | full_taken = time.time() - full_start 251 | print(f"TOTAL TIME TAKEN: {full_taken:.2f} seconds ") 252 | -------------------------------------------------------------------------------- /xgboost_ray/tests/test_client.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | import ray 5 | from ray.util.client.ray_client_helpers import ray_start_client_server 6 | 7 | from xgboost_ray.data_sources.ray_dataset import RAY_DATASET_AVAILABLE 8 | 9 | 10 | @pytest.fixture 11 | def start_client_server_4_cpus(): 12 | ray.init(num_cpus=4) 13 | with ray_start_client_server() as client: 14 | yield client 15 | 16 | 17 | @pytest.fixture 18 | def start_client_server_5_cpus(): 19 | ray.init(num_cpus=5) 20 | with ray_start_client_server() as client: 21 | yield client 22 | 23 | 24 | @pytest.fixture 25 | def start_client_server_5_cpus_modin(monkeypatch): 26 | monkeypatch.setenv("__MODIN_AUTOIMPORT_PANDAS__", "1") 27 | ray.init(num_cpus=5, runtime_env={"env_vars": {"__MODIN_AUTOIMPORT_PANDAS__": "1"}}) 28 | with ray_start_client_server() as client: 29 
| yield client 30 | 31 | 32 | def test_simple_train(start_client_server_4_cpus): 33 | assert ray.util.client.ray.is_connected() 34 | from xgboost_ray.examples.simple import main 35 | 36 | main(num_actors=4, cpus_per_actor=1) 37 | 38 | 39 | @pytest.mark.skipif(os.environ.get("TUNE", "0") != "1", reason="Skipping Tune tests") 40 | def test_simple_tune(start_client_server_4_cpus): 41 | assert ray.util.client.ray.is_connected() 42 | from xgboost_ray.examples.simple_tune import main 43 | 44 | main(cpus_per_actor=1, num_actors=1, num_samples=4) 45 | 46 | 47 | def test_simple_dask(start_client_server_5_cpus): 48 | assert ray.util.client.ray.is_connected() 49 | from xgboost_ray.examples.simple_dask import main 50 | 51 | main(cpus_per_actor=1, num_actors=4) 52 | 53 | 54 | def test_simple_modin(start_client_server_5_cpus_modin): 55 | assert ray.util.client.ray.is_connected() 56 | from xgboost_ray.examples.simple_modin import main 57 | 58 | main(cpus_per_actor=1, num_actors=4) 59 | 60 | 61 | def test_client_actor_cpus(start_client_server_5_cpus): 62 | assert ray.util.client.ray.is_connected() 63 | from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy 64 | 65 | @ray.remote 66 | class DummyTrainActor: 67 | def test(self): 68 | import xgboost_ray 69 | 70 | return xgboost_ray.main._ray_get_actor_cpus() 71 | 72 | actor = DummyTrainActor.options(num_cpus=2).remote() 73 | assert ray.get(actor.test.remote()) == 2 74 | 75 | pg = ray.util.placement_group([{"CPU": 2}]) 76 | ray.get(pg.ready()) 77 | actor2 = DummyTrainActor.options( 78 | num_cpus=2, 79 | scheduling_strategy=PlacementGroupSchedulingStrategy(placement_group=pg), 80 | ).remote() 81 | assert ray.get(actor2.test.remote()) == 2 82 | 83 | 84 | @pytest.mark.skipif( 85 | not RAY_DATASET_AVAILABLE, 86 | reason="Ray datasets are not available in this version of Ray", 87 | ) 88 | def test_simple_ray_dataset(start_client_server_5_cpus): 89 | assert ray.util.client.ray.is_connected() 90 | from xgboost_ray.examples.simple_ray_dataset import main 91 | 92 | main(cpus_per_actor=1, num_actors=4) 93 | 94 | 95 | if __name__ == "__main__": 96 | import sys 97 | 98 | import pytest # noqa: F811 99 | 100 | sys.exit(pytest.main(["-v", __file__])) 101 | -------------------------------------------------------------------------------- /xgboost_ray/tests/test_colocation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import tempfile 4 | import unittest 5 | from unittest.mock import patch 6 | 7 | import numpy as np 8 | import pytest 9 | import ray 10 | from ray.util.queue import _QueueActor 11 | 12 | from xgboost_ray import RayDMatrix, RayParams, train 13 | from xgboost_ray.main import _train 14 | from xgboost_ray.util import _EventActor 15 | 16 | 17 | class _MockQueueActor(_QueueActor): 18 | def get_node_id(self): 19 | return ray.get_runtime_context().get_node_id() 20 | 21 | 22 | class _MockEventActor(_EventActor): 23 | def get_node_id(self): 24 | return ray.get_runtime_context().get_node_id() 25 | 26 | 27 | @pytest.mark.usefixtures("ray_start_cluster") 28 | class TestColocation(unittest.TestCase): 29 | def setUp(self) -> None: 30 | repeat = 8 # Repeat data a couple of times for stability 31 | self.x = np.array( 32 | [ 33 | [1, 0, 0, 0], # Feature 0 -> Label 0 34 | [0, 1, 0, 0], # Feature 1 -> Label 1 35 | [0, 0, 1, 1], # Feature 2+3 -> Label 0 36 | [0, 0, 1, 0], # Feature 2+!3 -> Label 1 37 | ] 38 | * repeat 39 | ) 40 | self.y = np.array([0, 1, 0, 1] * repeat) 41 | 42 | self.params
= { 43 | "booster": "gbtree", 44 | "tree_method": "hist", 45 | "nthread": 1, 46 | "max_depth": 2, 47 | "objective": "binary:logistic", 48 | "seed": 1000, 49 | } 50 | 51 | self.kwargs = {} 52 | 53 | self.tmpdir = str(tempfile.mkdtemp()) 54 | 55 | self.die_lock_file = "/tmp/died_worker.lock" 56 | if os.path.exists(self.die_lock_file): 57 | os.remove(self.die_lock_file) 58 | 59 | def tearDown(self) -> None: 60 | if os.path.exists(self.tmpdir): 61 | shutil.rmtree(self.tmpdir) 62 | ray.shutdown() 63 | 64 | @patch("ray.util.queue._QueueActor", _MockQueueActor) 65 | @patch("xgboost_ray.util._EventActor", _MockEventActor) 66 | def test_communication_colocation(self): 67 | """Checks that Queue and Event actors are colocated with the driver.""" 68 | os.environ["RXGB_COMMUNICATION_SOFT_PLACEMENT"] = "0" 69 | 70 | with self.ray_start_cluster() as cluster: 71 | cluster.add_node(num_cpus=3) 72 | cluster.add_node(num_cpus=3) 73 | cluster.wait_for_nodes() 74 | ray.init(address=cluster.address) 75 | 76 | local_node = ray.get_runtime_context().get_node_id() 77 | 78 | # Note that these will have the same IP in the test cluster 79 | assert len(ray.nodes()) == 2 80 | assert local_node in [node["NodeID"] for node in ray.nodes()] 81 | 82 | def _mock_train(*args, _training_state, **kwargs): 83 | assert ( 84 | ray.get(_training_state.queue.actor.get_node_id.remote()) 85 | == local_node 86 | ) 87 | assert ( 88 | ray.get(_training_state.stop_event.actor.get_node_id.remote()) 89 | == local_node 90 | ) 91 | return _train(*args, _training_state=_training_state, **kwargs) 92 | 93 | with patch("xgboost_ray.main._train") as mocked: 94 | mocked.side_effect = _mock_train 95 | train( 96 | self.params, 97 | RayDMatrix(self.x, self.y), 98 | num_boost_round=2, 99 | ray_params=RayParams(max_actor_restarts=1, num_actors=6), 100 | ) 101 | 102 | os.environ.pop("RXGB_COMMUNICATION_SOFT_PLACEMENT", None) 103 | 104 | def test_no_tune_spread(self): 105 | """Tests whether workers are spread when not using Tune.""" 106 | with self.ray_start_cluster() as cluster: 107 | cluster.add_node(num_cpus=2) 108 | cluster.add_node(num_cpus=2) 109 | cluster.wait_for_nodes() 110 | ray.init(address=cluster.address) 111 | 112 | ray_params = RayParams(max_actor_restarts=1, num_actors=2, cpus_per_actor=2) 113 | 114 | def _mock_train(*args, _training_state, **kwargs): 115 | try: 116 | results = _train(*args, _training_state=_training_state, **kwargs) 117 | return results 118 | except Exception: 119 | raise 120 | finally: 121 | assert len(_training_state.actors) == 2 122 | if not any(a is None for a in _training_state.actors): 123 | actor_infos = ray.state.actors() 124 | actor_nodes = [] 125 | for a in _training_state.actors: 126 | actor_info = actor_infos.get(a._actor_id.hex()) 127 | actor_node = actor_info["Address"]["NodeID"] 128 | actor_nodes.append(actor_node) 129 | assert actor_nodes[0] != actor_nodes[1] 130 | 131 | with patch("xgboost_ray.main._train", _mock_train): 132 | train( 133 | self.params, 134 | RayDMatrix(self.x, self.y), 135 | num_boost_round=4, 136 | ray_params=ray_params, 137 | ) 138 | 139 | def test_tune_pack(self): 140 | """Tests whether workers are packed when using Tune.""" 141 | try: 142 | from ray import tune 143 | except ImportError: 144 | self.skipTest("Tune is not installed.") 145 | return 146 | with self.ray_start_cluster() as cluster: 147 | num_actors = 2 148 | cluster.add_node(num_cpus=3) 149 | cluster.add_node(num_cpus=3) 150 | ray.init(address=cluster.address) 151 | 152 | ray_params = RayParams( 153 | max_actor_restarts=1, 
num_actors=num_actors, cpus_per_actor=1 154 | ) 155 | 156 | def _mock_train(*args, _training_state, **kwargs): 157 | try: 158 | results = _train(*args, _training_state=_training_state, **kwargs) 159 | return results 160 | except Exception: 161 | raise 162 | finally: 163 | assert len(_training_state.actors) == num_actors 164 | if not any(a is None for a in _training_state.actors): 165 | actor_infos = ray.state.actors() 166 | actor_nodes = [] 167 | for a in _training_state.actors: 168 | actor_info = actor_infos.get(a._actor_id.hex()) 169 | actor_node = actor_info["Address"]["NodeID"] 170 | actor_nodes.append(actor_node) 171 | assert actor_nodes[0] == actor_nodes[1] 172 | 173 | def train_func(params, x, y, ray_params): 174 | def inner_func(config): 175 | with patch("xgboost_ray.main._train", _mock_train): 176 | train( 177 | params, 178 | RayDMatrix(x, y), 179 | num_boost_round=4, 180 | ray_params=ray_params, 181 | ) 182 | 183 | return inner_func 184 | 185 | tune.run( 186 | train_func(self.params, self.x, self.y, ray_params), 187 | resources_per_trial=ray_params.get_tune_resources(), 188 | num_samples=1, 189 | ) 190 | 191 | def test_timeout(self): 192 | """Checks that an error occurs when placement group setup times out.""" 193 | os.environ["RXGB_PLACEMENT_GROUP_TIMEOUT_S"] = "5" 194 | 195 | with self.ray_start_cluster() as cluster: 196 | ray.init(address=cluster.address) 197 | 198 | with self.assertRaises(TimeoutError): 199 | train( 200 | self.params, 201 | RayDMatrix(self.x, self.y), 202 | num_boost_round=2, 203 | ray_params=RayParams( 204 | max_actor_restarts=1, 205 | num_actors=2, 206 | resources_per_actor={"invalid": 1}, 207 | ), 208 | ) 209 | 210 | 211 | if __name__ == "__main__": 212 | import sys 213 | 214 | import pytest # noqa: F811 215 | 216 | sys.exit(pytest.main(["-v", __file__])) 217 | -------------------------------------------------------------------------------- /xgboost_ray/tests/test_sklearn_matrix.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | import ray 5 | import xgboost as xgb 6 | from packaging.version import Version 7 | from sklearn.model_selection import train_test_split 8 | 9 | from xgboost_ray.main import XGBOOST_VERSION, RayDMatrix 10 | from xgboost_ray.sklearn import RayXGBClassifier, RayXGBRegressor 11 | 12 | has_label_encoder = XGBOOST_VERSION >= Version("1.0.0") and XGBOOST_VERSION < Version( 13 | "1.6.0" 14 | ) 15 | 16 | 17 | class XGBoostRaySklearnMatrixTest(unittest.TestCase): 18 | def setUp(self): 19 | self.seed = 1994 20 | self.rng = np.random.RandomState(self.seed) 21 | self.params = {"n_estimators": 10} 22 | 23 | def tearDown(self) -> None: 24 | if ray.is_initialized(): 25 | ray.shutdown() 26 | 27 | def _init_ray(self): 28 | if not ray.is_initialized(): 29 | ray.init(num_cpus=4) 30 | 31 | @unittest.skipIf( 32 | has_label_encoder, f"not supported in xgb version {xgb.__version__}" 33 | ) 34 | def testClassifierNoLabelEncoder(self, n_class=2): 35 | self._init_ray() 36 | 37 | from sklearn.datasets import load_digits 38 | 39 | digits = load_digits(n_class=n_class) 40 | y = digits["target"] 41 | X = digits["data"] 42 | 43 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5) 44 | 45 | train_matrix = RayDMatrix(X_train, y_train) 46 | test_matrix = RayDMatrix(X_test, y_test) 47 | 48 | with self.assertRaisesRegex(Exception, "num_class"): 49 | RayXGBClassifier(**self.params).fit(train_matrix, None) 50 | 51 | with self.assertRaisesRegex(Exception, r"must be 
\(RayDMatrix, str\)"): 52 | RayXGBClassifier(**self.params).fit( 53 | train_matrix, None, eval_set=[(X_test, y_test)] 54 | ) 55 | 56 | with self.assertRaisesRegex(Exception, r"must be \(array_like, array_like\)"): 57 | RayXGBClassifier(**self.params).fit( 58 | X_train, y_train, eval_set=[(test_matrix, "eval")] 59 | ) 60 | 61 | RayXGBClassifier(num_class=n_class, **self.params).fit(train_matrix, None) 62 | 63 | clf = RayXGBClassifier(num_class=n_class, **self.params).fit( 64 | train_matrix, None, eval_set=[(test_matrix, "eval")] 65 | ) 66 | 67 | clf.predict(test_matrix) 68 | clf.predict_proba(test_matrix) 69 | 70 | @unittest.skipIf( 71 | has_label_encoder, f"not supported in xgb version {xgb.__version__}" 72 | ) 73 | def testClassifierMulticlassNoLabelEncoder(self): 74 | self.testClassifierNoLabelEncoder(n_class=3) 75 | 76 | def testRegressor(self): 77 | self._init_ray() 78 | 79 | from sklearn.datasets import fetch_california_housing 80 | 81 | ds = fetch_california_housing() 82 | y = ds["target"] 83 | X = ds["data"] 84 | 85 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5) 86 | 87 | train_matrix = RayDMatrix(X_train, y_train) 88 | test_matrix = RayDMatrix(X_test, y_test) 89 | 90 | with self.assertRaisesRegex(Exception, r"must be \(RayDMatrix, str\)"): 91 | RayXGBRegressor(**self.params).fit( 92 | train_matrix, None, eval_set=[(X_test, y_test)] 93 | ) 94 | 95 | with self.assertRaisesRegex(Exception, r"must be \(array_like, array_like\)"): 96 | RayXGBRegressor(**self.params).fit( 97 | X_train, y_train, eval_set=[(test_matrix, "eval")] 98 | ) 99 | 100 | RayXGBRegressor(**self.params).fit(train_matrix, None) 101 | 102 | reg = RayXGBRegressor(**self.params).fit( 103 | train_matrix, None, eval_set=[(test_matrix, "eval")] 104 | ) 105 | 106 | reg.predict(test_matrix) 107 | -------------------------------------------------------------------------------- /xgboost_ray/tests/test_tune.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import tempfile 4 | import unittest 5 | from unittest.mock import MagicMock, patch 6 | 7 | import numpy as np 8 | import ray 9 | from ray import tune 10 | from ray.tune import TuneError 11 | from ray.tune.integration.xgboost import ( 12 | TuneReportCheckpointCallback as OrigTuneReportCheckpointCallback, 13 | ) 14 | 15 | from xgboost_ray import RayDMatrix, RayParams, train 16 | from xgboost_ray.tune import TuneReportCheckpointCallback, _try_add_tune_callback 17 | 18 | 19 | class XGBoostRayTuneTest(unittest.TestCase): 20 | def setUp(self): 21 | ray.init(num_cpus=4) 22 | repeat = 8 # Repeat data a couple of times for stability 23 | x = np.array( 24 | [ 25 | [1, 0, 0, 0], # Feature 0 -> Label 0 26 | [0, 1, 0, 0], # Feature 1 -> Label 1 27 | [0, 0, 1, 1], # Feature 2+3 -> Label 2 28 | [0, 0, 1, 0], # Feature 2+!3 -> Label 3 29 | ] 30 | * repeat 31 | ) 32 | y = np.array([0, 1, 2, 3] * repeat) 33 | 34 | self.params = { 35 | "xgb": { 36 | "booster": "gbtree", 37 | "nthread": 1, 38 | "max_depth": 2, 39 | "objective": "multi:softmax", 40 | "num_class": 4, 41 | "eval_metric": ["mlogloss", "merror"], 42 | }, 43 | "num_boost_round": tune.choice([1, 3]), 44 | } 45 | 46 | def train_func( 47 | ray_params, callbacks=None, check_for_spread_strategy=False, **kwargs 48 | ): 49 | def _inner_train(config): 50 | if check_for_spread_strategy: 51 | assert ( 52 | ray.train.get_context().get_trial_resources().strategy 53 | == "SPREAD" 54 | ) 55 | train_set = RayDMatrix(x, y) 56 | train( 57 | 
config["xgb"], 58 | dtrain=train_set, 59 | ray_params=ray_params, 60 | num_boost_round=config["num_boost_round"], 61 | evals=[(train_set, "train")], 62 | callbacks=callbacks, 63 | **kwargs 64 | ) 65 | 66 | return _inner_train 67 | 68 | self.train_func = train_func 69 | self.experiment_dir = tempfile.mkdtemp() 70 | 71 | def tearDown(self): 72 | ray.shutdown() 73 | shutil.rmtree(self.experiment_dir) 74 | 75 | # noinspection PyTypeChecker 76 | @patch.dict(os.environ, {"TUNE_RESULT_DELIM": "/"}) 77 | def testNumIters(self): 78 | """Test that the number of reported tune results is correct""" 79 | ray_params = RayParams(cpus_per_actor=1, num_actors=2) 80 | params = self.params.copy() 81 | params["num_boost_round"] = tune.grid_search([1, 3]) 82 | 83 | # TODO(justinvyu): Remove this once the xgboost integration 84 | # has been updated on the Ray side. 85 | try: 86 | callback = TuneReportCheckpointCallback( 87 | frequency=1, checkpoint_at_end=False 88 | ) 89 | except TypeError: 90 | callback = TuneReportCheckpointCallback(frequency=1) 91 | 92 | analysis = tune.run( 93 | self.train_func(ray_params, callbacks=[callback]), 94 | config=params, 95 | resources_per_trial=ray_params.get_tune_resources(), 96 | num_samples=1, 97 | ) 98 | 99 | self.assertSequenceEqual( 100 | list(analysis.results_df["training_iteration"]), 101 | list(analysis.results_df["config/num_boost_round"]), 102 | ) 103 | 104 | def testNumItersClient(self): 105 | """Test Ray client mode.""" 106 | from packaging.version import Version 107 | if Version(ray.__version__) <= Version("1.2.0"): 108 | self.skipTest("Ray client mocks do not work in Ray <= 1.2.0") 109 | from ray.util.client.ray_client_helpers import ray_start_client_server 110 | 111 | self.assertFalse(ray.util.client.ray.is_connected()) 112 | with ray_start_client_server(): 113 | self.assertTrue(ray.util.client.ray.is_connected()) 114 | self.testNumIters() 115 | 116 | def testPlacementOptions(self): 117 | ray_params = RayParams( 118 | cpus_per_actor=1, num_actors=1, placement_options={"strategy": "SPREAD"} 119 | ) 120 | tune.run( 121 | self.train_func(ray_params, check_for_spread_strategy=True), 122 | config=self.params, 123 | resources_per_trial=ray_params.get_tune_resources(), 124 | num_samples=1, 125 | ) 126 | 127 | def testElasticFails(self): 128 | """Test if an error is thrown when using Tune with elastic training.""" 129 | ray_params = RayParams(cpus_per_actor=1, num_actors=1, elastic_training=True) 130 | with self.assertRaises(TuneError): 131 | tune.run( 132 | self.train_func(ray_params), 133 | config=self.params, 134 | resources_per_trial=ray_params.get_tune_resources(), 135 | num_samples=1, 136 | ) 137 | 138 | def testReplaceTuneCheckpoints(self): 139 | """Test if ray.tune.integration.xgboost callbacks are replaced""" 140 | # Report and checkpointing callback 141 | in_cp = [OrigTuneReportCheckpointCallback(metrics="met")] 142 | in_dict = {"callbacks": in_cp} 143 | 144 | with patch("ray.train.get_context") as mocked: 145 | mocked.return_value = MagicMock(return_value=True) 146 | _try_add_tune_callback(in_dict) 147 | 148 | replaced = in_dict["callbacks"][0] 149 | self.assertTrue(isinstance(replaced, TuneReportCheckpointCallback)) 150 | 151 | self.assertSequenceEqual(replaced._metrics, ["met"]) 152 | 153 | def testEndToEndCheckpointing(self): 154 | ray_params = RayParams(cpus_per_actor=1, num_actors=2) 155 | analysis = tune.run( 156 | self.train_func( 157 | ray_params, callbacks=[TuneReportCheckpointCallback(frequency=1)] 158 | ), 159 | config=self.params, 160 | resources_per_trial=ray_params.get_tune_resources(), 161 | 
num_samples=1, 162 | metric="train-mlogloss", 163 | mode="min", 164 | log_to_file=True, 165 | local_dir=self.experiment_dir, 166 | ) 167 | 168 | self.assertTrue(os.path.exists(analysis.best_checkpoint.path)) 169 | 170 | def testEndToEndCheckpointingOrigTune(self): 171 | ray_params = RayParams(cpus_per_actor=1, num_actors=2) 172 | analysis = tune.run( 173 | self.train_func( 174 | ray_params, callbacks=[OrigTuneReportCheckpointCallback(frequency=1)] 175 | ), 176 | config=self.params, 177 | resources_per_trial=ray_params.get_tune_resources(), 178 | num_samples=1, 179 | metric="train-mlogloss", 180 | mode="min", 181 | local_dir=self.experiment_dir, 182 | ) 183 | 184 | self.assertTrue(os.path.exists(analysis.best_checkpoint.path)) 185 | 186 | 187 | if __name__ == "__main__": 188 | import sys 189 | 190 | import pytest 191 | 192 | sys.exit(pytest.main(["-v", __file__])) 193 | -------------------------------------------------------------------------------- /xgboost_ray/tests/test_xgboost_api.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from typing import Tuple 3 | 4 | import numpy as np 5 | import ray 6 | import xgboost as xgb 7 | 8 | from xgboost_ray import RayDMatrix, RayParams, train 9 | from xgboost_ray.compat import TrainingCallback 10 | 11 | # From XGBoost documentation: 12 | # https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html 13 | from xgboost_ray.session import get_actor_rank, put_queue 14 | 15 | 16 | def gradient(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray: 17 | y = dtrain.get_label() 18 | return (np.log1p(predt) - np.log1p(y)) / (predt + 1) 19 | 20 | 21 | def hessian(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray: 22 | y = dtrain.get_label() 23 | return (-np.log1p(predt) + np.log1p(y) + 1) / np.power(predt + 1, 2) 24 | 25 | 26 | def squared_log( 27 | predt: np.ndarray, dtrain: xgb.DMatrix 28 | ) -> Tuple[np.ndarray, np.ndarray]: 29 | predt[predt < -1] = -1 + 1e-6 30 | grad = gradient(predt, dtrain) 31 | hess = hessian(predt, dtrain) 32 | return grad, hess 33 | 34 | 35 | def rmsle(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]: 36 | y = dtrain.get_label() 37 | predt[predt < -1] = -1 + 1e-6 38 | elements = np.power(np.log1p(y) - np.log1p(predt), 2) 39 | return "PyRMSLE", float(np.sqrt(np.sum(elements) / len(y))) 40 | 41 | 42 | class XGBoostAPITest(unittest.TestCase): 43 | """This test suite validates core XGBoost API functionality.""" 44 | 45 | def setUp(self): 46 | repeat = 8 # Repeat data a couple of times for stability 47 | self.x = np.array( 48 | [ 49 | [1, 0, 0, 0], # Feature 0 -> Label 0 50 | [0, 1, 0, 0], # Feature 1 -> Label 1 51 | [0, 0, 1, 1], # Feature 2+3 -> Label 0 52 | [0, 0, 1, 0], # Feature 2+!3 -> Label 1 53 | ] 54 | * repeat 55 | ) 56 | self.y = np.array([0, 1, 0, 1] * repeat) 57 | 58 | self.params = { 59 | "booster": "gbtree", 60 | "tree_method": "hist", 61 | "nthread": 1, 62 | "max_depth": 2, 63 | "objective": "binary:logistic", 64 | "seed": 1000, 65 | } 66 | 67 | self.kwargs = {} 68 | 69 | def tearDown(self) -> None: 70 | if ray.is_initialized(): 71 | ray.shutdown() 72 | 73 | def _init_ray(self): 74 | if not ray.is_initialized(): 75 | ray.init(num_cpus=4) 76 | 77 | def testCustomObjectiveFunction(self): 78 | """Ensure that custom objective functions work. 
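The custom objective here is squared_log (defined at the top of this file, following the XGBoost custom-objective tutorial linked above).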
79 | 80 | Runs a custom objective function with pure XGBoost and 81 | XGBoost on Ray and compares the prediction outputs.""" 82 | self._init_ray() 83 | 84 | params = self.params.copy() 85 | params.pop("objective", None) 86 | 87 | bst_xgb = xgb.train(params, xgb.DMatrix(self.x, self.y), obj=squared_log) 88 | 89 | bst_ray = train( 90 | params, 91 | RayDMatrix(self.x, self.y), 92 | ray_params=RayParams(num_actors=2), 93 | obj=squared_log, 94 | **self.kwargs, 95 | ) 96 | 97 | x_mat = xgb.DMatrix(self.x) 98 | pred_y_xgb = np.round(bst_xgb.predict(x_mat)) 99 | pred_y_ray = np.round(bst_ray.predict(x_mat)) 100 | 101 | self.assertSequenceEqual(list(pred_y_xgb), list(pred_y_ray)) 102 | self.assertSequenceEqual(list(self.y), list(pred_y_ray)) 103 | 104 | def testCustomMetricFunction(self): 105 | """Ensure that custom metric functions work. 106 | 107 | Runs a custom metric function with pure XGBoost and 108 | XGBoost on Ray and compares predictions and evaluation results.""" 109 | self._init_ray() 110 | 111 | params = self.params.copy() 112 | params.pop("objective", None) 113 | params["disable_default_eval_metric"] = 1 114 | 115 | dtrain_xgb = xgb.DMatrix(self.x, self.y) 116 | evals_result_xgb = {} 117 | bst_xgb = xgb.train( 118 | params, 119 | dtrain_xgb, 120 | obj=squared_log, 121 | feval=rmsle, 122 | evals=[(dtrain_xgb, "dtrain")], 123 | evals_result=evals_result_xgb, 124 | ) 125 | 126 | dtrain_ray = RayDMatrix(self.x, self.y) 127 | evals_result_ray = {} 128 | bst_ray = train( 129 | params, 130 | dtrain_ray, 131 | ray_params=RayParams(num_actors=2), 132 | obj=squared_log, 133 | feval=rmsle, 134 | evals=[(dtrain_ray, "dtrain")], 135 | evals_result=evals_result_ray, 136 | **self.kwargs, 137 | ) 138 | 139 | x_mat = xgb.DMatrix(self.x) 140 | pred_y_xgb = np.round(bst_xgb.predict(x_mat)) 141 | pred_y_ray = np.round(bst_ray.predict(x_mat)) 142 | 143 | self.assertSequenceEqual(list(pred_y_xgb), list(pred_y_ray)) 144 | self.assertSequenceEqual(list(self.y), list(pred_y_ray)) 145 | 146 | self.assertTrue( 147 | np.allclose( 148 | evals_result_xgb["dtrain"]["PyRMSLE"], 149 | evals_result_ray["dtrain"]["PyRMSLE"], 150 | atol=0.1, 151 | ) 152 | ) 153 | 154 | def testCallbacks(self): 155 | class _Callback(TrainingCallback): 156 | def after_iteration(self, model, epoch, evals_log): 157 | print(f"My rank: {get_actor_rank()}") 158 | put_queue(("rank", get_actor_rank())) 159 | 160 | callback = _Callback() 161 | 162 | additional_results = {} 163 | train( 164 | self.params, 165 | RayDMatrix(self.x, self.y), 166 | ray_params=RayParams(num_actors=2), 167 | callbacks=[callback], 168 | additional_results=additional_results, 169 | **self.kwargs, 170 | ) 171 | 172 | self.assertEqual(len(additional_results["callback_returns"]), 2) 173 | self.assertTrue( 174 | all(rank == 0 for (_, rank) in additional_results["callback_returns"][0]) 175 | ) 176 | self.assertTrue( 177 | all(rank == 1 for (_, rank) in additional_results["callback_returns"][1]) 178 | ) 179 | 180 | 181 | if __name__ == "__main__": 182 | import sys 183 | 184 | import pytest 185 | 186 | sys.exit(pytest.main(["-v", __file__])) 187 | -------------------------------------------------------------------------------- /xgboost_ray/tests/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import tempfile 4 | import time 5 | from typing import Dict, List, Optional, Tuple, Union 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import xgboost as xgb 10 | 11 | from xgboost_ray.compat import 
TrainingCallback 12 | from xgboost_ray.session import get_actor_rank, put_queue 13 | 14 | 15 | def get_num_trees(bst: xgb.Booster): 16 | import json 17 | 18 | data = [json.loads(d) for d in bst.get_dump(dump_format="json")] 19 | return len(data) // 4 20 | 21 | 22 | def create_data(num_rows: int, num_cols: int, dtype: np.dtype = np.float32): 23 | 24 | return pd.DataFrame( 25 | np.random.uniform(0.0, 10.0, size=(num_rows, num_cols)), 26 | columns=[f"feature_{i}" for i in range(num_cols)], 27 | dtype=dtype, 28 | ) 29 | 30 | 31 | def create_labels( 32 | num_rows: int, num_classes: int = 2, dtype: Optional[np.dtype] = None 33 | ): 34 | if num_classes == 0: 35 | # Create regression label 36 | dtype = dtype or np.float32 37 | return pd.Series( 38 | np.random.uniform(0, 1, size=num_rows), dtype=dtype, name="label" 39 | ) 40 | 41 | dtype = dtype or np.int32 42 | return pd.Series( 43 | np.random.randint(0, num_classes, size=num_rows), dtype=dtype, name="label" 44 | ) 45 | 46 | 47 | def create_parquet( 48 | filename: str, 49 | num_rows: int, 50 | num_features: int, 51 | num_classes: int = 2, 52 | num_partitions: int = 1, 53 | ): 54 | 55 | partition_rows = num_rows // num_partitions 56 | for partition in range(num_partitions): 57 | print(f"Creating partition {partition}") 58 | data = create_data(partition_rows, num_features) 59 | labels = create_labels(partition_rows, num_classes) 60 | partition = pd.Series(np.full(partition_rows, partition), dtype=np.int32) 61 | 62 | data["labels"] = labels 63 | data["partition"] = partition 64 | 65 | os.makedirs(filename, 0o755, exist_ok=True) 66 | data.to_parquet( 67 | filename, 68 | partition_cols=["partition"], 69 | engine="pyarrow", 70 | partition_filename_cb=lambda key: f"part_{key[0]}.parquet", 71 | ) 72 | 73 | 74 | def create_parquet_in_tempdir( 75 | filename: str, 76 | num_rows: int, 77 | num_features: int, 78 | num_classes: int = 2, 79 | num_partitions: int = 1, 80 | ) -> Tuple[str, str]: 81 | temp_dir = tempfile.mkdtemp() 82 | path = os.path.join(temp_dir, filename) 83 | create_parquet( 84 | path, 85 | num_rows=num_rows, 86 | num_features=num_features, 87 | num_classes=num_classes, 88 | num_partitions=num_partitions, 89 | ) 90 | return temp_dir, path 91 | 92 | 93 | def flatten_obj(obj: Union[List, Dict], keys=None, base=None): 94 | keys = keys or [] 95 | base = base if base is not None else {} # Keep same object if empty dict 96 | if isinstance(obj, list): 97 | for i, o in enumerate(obj): 98 | flatten_obj(o, keys + [str(i)], base) 99 | elif isinstance(obj, dict): 100 | for k, o in obj.items(): 101 | flatten_obj(o, keys + [str(k)], base) 102 | else: 103 | base["/".join(keys)] = obj 104 | return base 105 | 106 | 107 | def tree_obj(bst: xgb.Booster): 108 | return [json.loads(j) for j in bst.get_dump(dump_format="json")] 109 | 110 | 111 | def _kill_callback(die_lock_file: str, actor_rank: int = 0, fail_iteration: int = 6): 112 | """Returns a callback to kill an actor process. 113 | 114 | Args: 115 | die_lock_file: A file lock used to prevent race conditions 116 | when killing the actor. 117 | actor_rank: The rank of the actor to kill. 118 | fail_iteration: The iteration after which the actor is killed. 
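Example (illustrative; the lock-file path is arbitrary): passing _kill_callback("/tmp/xgb_die.lock", actor_rank=0, fail_iteration=6) to train() SIGKILLs the rank-0 actor after iteration 6; the lock file ensures the kill happens at most once across restarts.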
119 | 120 | """ 121 | 122 | class _KillCallback(TrainingCallback): 123 | def after_iteration(self, model, epoch, evals_log): 124 | if get_actor_rank() == actor_rank: 125 | put_queue((epoch, time.time())) 126 | if ( 127 | get_actor_rank() == actor_rank 128 | and epoch == fail_iteration 129 | and not os.path.exists(die_lock_file) 130 | ): 131 | 132 | # Get PID 133 | pid = os.getpid() 134 | print(f"Killing process: {pid}") 135 | with open(die_lock_file, "wt") as fp: 136 | fp.write("") 137 | 138 | time.sleep(2) 139 | print(f"Testing: Rank {get_actor_rank()} will now die.") 140 | os.kill(pid, 9) 141 | 142 | return _KillCallback() 143 | 144 | 145 | def _fail_callback(die_lock_file: str, actor_rank: int = 0, fail_iteration: int = 6): 146 | """Returns a callback to cause an Xgboost actor to fail training. 147 | 148 | Args: 149 | die_lock_file: A file lock used to prevent race conditions 150 | when causing the actor to fail. 151 | actor_rank: The rank of the actor to fail. 152 | fail_iteration: The iteration after which the training for 153 | the specified actor fails. 154 | 155 | """ 156 | 157 | class _FailCallback(TrainingCallback): 158 | def after_iteration(self, model, epoch, evals_log): 159 | 160 | if get_actor_rank() == actor_rank: 161 | put_queue((epoch, time.time())) 162 | if ( 163 | get_actor_rank() == actor_rank 164 | and epoch == fail_iteration 165 | and not os.path.exists(die_lock_file) 166 | ): 167 | 168 | with open(die_lock_file, "wt") as fp: 169 | fp.write("") 170 | time.sleep(2) 171 | import sys 172 | 173 | print(f"Testing: Rank {get_actor_rank()} will now fail.") 174 | sys.exit(1) 175 | 176 | return _FailCallback() 177 | 178 | 179 | def _checkpoint_callback(frequency: int = 1, before_iteration_=False): 180 | """Returns a callback to checkpoint a model. 181 | 182 | Args: 183 | frequency: The interval at which checkpointing occurs. If 184 | frequency is set to n, checkpointing occurs every n epochs. 185 | before_iteration_: If True, checkpoint before the iteration 186 | begins. Else, checkpoint after the iteration ends. 187 | 188 | """ 189 | 190 | class _CheckpointCallback(TrainingCallback): 191 | def after_iteration(self, model, epoch, evals_log): 192 | if epoch % frequency == 0: 193 | put_queue(model.save_raw()) 194 | 195 | if before_iteration_: 196 | 197 | def _before_iteration(self, model, epoch, evals_log): 198 | self.after_iteration(model, epoch, evals_log) 199 | 200 | _CheckpointCallback.before_iteration = _before_iteration 201 | 202 | return _CheckpointCallback() 203 | 204 | 205 | def _sleep_callback(sleep_iteration: int = 6, sleep_seconds: int = 5): 206 | """Returns a callback to sleep after an iteration. 207 | 208 | This artificially inflates training time. 209 | 210 | Args: 211 | sleep_iteration: The iteration after which the actor should 212 | sleep. 213 | sleep_seconds: Time in seconds the actor should sleep. 214 | 215 | """ 216 | 217 | class _SleepCallback(TrainingCallback): 218 | def after_iteration(self, model, epoch, evals_log): 219 | if epoch == sleep_iteration: 220 | print( 221 | f"Testing: Rank {get_actor_rank()} will now sleep " 222 | f"for {sleep_seconds} seconds." 
223 | ) 224 | time.sleep(sleep_seconds) 225 | 226 | return _SleepCallback() 227 | -------------------------------------------------------------------------------- /xgboost_ray/tune.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Dict, Optional 3 | 4 | import ray 5 | from ray.util.annotations import PublicAPI 6 | 7 | from xgboost_ray.session import get_rabit_rank, put_queue 8 | from xgboost_ray.util import force_on_current_node 9 | from xgboost_ray.xgb import xgboost as xgb 10 | 11 | try: 12 | from ray import train, tune # noqa: F401 13 | except (ImportError, ModuleNotFoundError) as e: 14 | raise RuntimeError( 15 | "Ray Train and Ray Tune are required dependencies of xgboost_ray. " 16 | 'Please install with: `pip install "ray[train]"`' 17 | ) from e 18 | 19 | import ray.train 20 | from ray.tune.integration.xgboost import TuneReportCallback as OrigTuneReportCallback 21 | from ray.tune.integration.xgboost import ( 22 | TuneReportCheckpointCallback as OrigTuneReportCheckpointCallback, 23 | ) 24 | 25 | 26 | class TuneReportCheckpointCallback(OrigTuneReportCheckpointCallback): 27 | def after_iteration(self, model, epoch: int, evals_log: Dict): 28 | # NOTE: We need to update `evals_log` here (even though the super method 29 | # already does it) because the actual callback method gets run 30 | # in a different process, so *this* instance of the callback will not have 31 | # access to the `evals_log` dict in `after_training`. 32 | self._evals_log = evals_log 33 | 34 | if get_rabit_rank() == 0: 35 | put_queue( 36 | lambda: super(TuneReportCheckpointCallback, self).after_iteration( 37 | model=model, epoch=epoch, evals_log=evals_log 38 | ) 39 | ) 40 | 41 | def after_training(self, model): 42 | if get_rabit_rank() == 0: 43 | put_queue( 44 | lambda: super(TuneReportCheckpointCallback, self).after_training( 45 | model=model 46 | ) 47 | ) 48 | return model 49 | 50 | 51 | class TuneReportCallback(OrigTuneReportCallback): 52 | def __new__(cls: type, *args, **kwargs): 53 | # TODO(justinvyu): [code_removal] Remove in Ray 2.11. 54 | raise DeprecationWarning( 55 | "`TuneReportCallback` is deprecated. " 56 | "Use `xgboost_ray.tune.TuneReportCheckpointCallback` instead." 57 | ) 58 | 59 | 60 | def _try_add_tune_callback(kwargs: Dict): 61 | ray_train_context_initialized = ( 62 | ray.train.get_context().get_trial_resources() is not None 63 | ) 64 | if ray_train_context_initialized: 65 | callbacks = kwargs.get("callbacks", []) or [] 66 | new_callbacks = [] 67 | has_tune_callback = False 68 | 69 | REPLACE_MSG = ( 70 | "Replaced `{orig}` with `{target}`. If you want to " 71 | "avoid this warning, pass `{target}` as a callback " 72 | "directly in your calls to `xgboost_ray.train()`." 73 | ) 74 | 75 | for cb in callbacks: 76 | if isinstance(cb, TuneReportCheckpointCallback): 77 | has_tune_callback = True 78 | new_callbacks.append(cb) 79 | elif isinstance(cb, OrigTuneReportCheckpointCallback): 80 | orig_metrics = cb._metrics 81 | orig_frequency = cb._frequency 82 | 83 | replace_cb = TuneReportCheckpointCallback( 84 | metrics=orig_metrics, frequency=orig_frequency 85 | ) 86 | new_callbacks.append(replace_cb) 87 | logging.warning( 88 | REPLACE_MSG.format( 89 | orig="ray.tune.integration.xgboost." 
90 | "TuneReportCheckpointCallback", 91 | target="xgboost_ray.tune.TuneReportCheckpointCallback", 92 | ) 93 | ) 94 | has_tune_callback = True 95 | else: 96 | new_callbacks.append(cb) 97 | 98 | if not has_tune_callback: 99 | new_callbacks.append(TuneReportCheckpointCallback(frequency=0)) 100 | 101 | kwargs["callbacks"] = new_callbacks 102 | return True 103 | else: 104 | return False 105 | 106 | 107 | def _get_tune_resources( 108 | num_actors: int, 109 | cpus_per_actor: int, 110 | gpus_per_actor: int, 111 | resources_per_actor: Optional[Dict], 112 | placement_options: Optional[Dict], 113 | ): 114 | """Returns object to use for ``resources_per_trial`` with Ray Tune.""" 115 | from ray.tune import PlacementGroupFactory 116 | 117 | head_bundle = {} 118 | child_bundle = {"CPU": cpus_per_actor, "GPU": gpus_per_actor} 119 | child_bundle_extra = {} if resources_per_actor is None else resources_per_actor 120 | child_bundles = [{**child_bundle, **child_bundle_extra} for _ in range(num_actors)] 121 | bundles = [head_bundle] + child_bundles 122 | placement_options = placement_options or {} 123 | placement_options.setdefault("strategy", "PACK") 124 | placement_group_factory = PlacementGroupFactory(bundles, **placement_options) 125 | 126 | return placement_group_factory 127 | 128 | 129 | @PublicAPI(stability="beta") 130 | def load_model(model_path): 131 | """Loads the model stored in the provided model_path. 132 | 133 | If using Ray Client, this will automatically handle loading the path on 134 | the server by using a Ray task. 135 | 136 | Returns: 137 | xgb.Booster object of the model stored in the provided model_path 138 | 139 | """ 140 | 141 | def load_model_fn(model_path): 142 | best_bst = xgb.Booster() 143 | best_bst.load_model(model_path) 144 | return best_bst 145 | 146 | # Load the model checkpoint. 147 | if ray.util.client.ray.is_connected(): 148 | # If using Ray Client, the best model is saved on the server. 149 | # So we have to wrap the model loading in a ray task. 
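# force_on_current_node (defined in xgboost_ray/util.py below) pins this task to the client server's node by requesting a sliver of that node's "node:<ip>" custom resource, since the model path is only valid on that machine.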
remote_load = ray.remote(load_model_fn) 151 | remote_load = force_on_current_node(remote_load) 152 | bst = ray.get(remote_load.remote(model_path)) 153 | else: 154 | bst = load_model_fn(model_path) 155 | 156 | return bst 157 | -------------------------------------------------------------------------------- /xgboost_ray/util.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import Dict, List, Optional 3 | 4 | import ray 5 | from ray.util.annotations import DeveloperAPI 6 | 7 | 8 | @DeveloperAPI 9 | class Unavailable: 10 | """No object should be an instance of this class.""" 11 | 12 | def __init__(self): 13 | raise RuntimeError("This class should never be instantiated.") 14 | 15 | 16 | class _EventActor: 17 | def __init__(self): 18 | self._event = asyncio.Event() 19 | 20 | def set(self): 21 | self._event.set() 22 | 23 | def clear(self): 24 | self._event.clear() 25 | 26 | def is_set(self): 27 | return self._event.is_set() 28 | 29 | 30 | @DeveloperAPI 31 | class Event: 32 | def __init__(self, actor_options: Optional[Dict] = None): 33 | actor_options = {} if not actor_options else actor_options 34 | self.actor = ray.remote(_EventActor).options(**actor_options).remote() 35 | 36 | def set(self): 37 | self.actor.set.remote() 38 | 39 | def clear(self): 40 | self.actor.clear.remote() 41 | 42 | def is_set(self): 43 | return ray.get(self.actor.is_set.remote()) 44 | 45 | def shutdown(self): 46 | if self.actor: 47 | ray.kill(self.actor) 48 | self.actor = None 49 | 50 | 51 | @DeveloperAPI 52 | class MultiActorTask: 53 | """Utility class to hold multiple futures. 54 | 55 | The `is_ready()` method will return True once all futures are ready. 56 | 57 | Args: 58 | pending_futures: List of object references (futures) 59 | that should be tracked. 60 | """ 61 | 62 | def __init__(self, pending_futures: Optional[List[ray.ObjectRef]] = None): 63 | self._pending_futures = pending_futures or [] 64 | self._ready_futures = [] 65 | 66 | def is_ready(self): 67 | if not self._pending_futures: 68 | return True 69 | 70 | ready = True 71 | while ready: 72 | ready, not_ready = ray.wait(self._pending_futures, timeout=0) 73 | if ready: 74 | for obj in ready: 75 | self._pending_futures.remove(obj) 76 | self._ready_futures.append(obj) 77 | 78 | return not bool(self._pending_futures) 79 | 80 | 81 | @DeveloperAPI 82 | def get_current_node_resource_key() -> str: 83 | """Get the Ray resource key for the current node. 84 | It can be used for actor placement. 85 | If using Ray Client, this will return the resource key for the node that 86 | is running the client server. 87 | """ 88 | current_node_id = ray.get_runtime_context().get_node_id() 89 | for node in ray.nodes(): 90 | if node["NodeID"] == current_node_id: 91 | # Found the node. 92 | for key in node["Resources"].keys(): 93 | if key.startswith("node:"): 94 | return key 95 | else: 96 | raise ValueError("Cannot find the node dictionary for the current node.") 97 | 98 | 99 | @DeveloperAPI 100 | def force_on_current_node(task_or_actor): 101 | """Given a task or actor, place it on the current node. 102 | 103 | If the task or actor that is passed in already has custom resource 104 | requirements, then they will be overridden. 105 | 106 | If using Ray Client, the current node is the client server node. 
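Example (illustrative sketch, mirroring `load_model` in xgboost_ray/tune.py; `fn` stands in for any plain function):

    remote_fn = force_on_current_node(ray.remote(fn))
    result = ray.get(remote_fn.remote())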
107 | """ 108 | node_resource_key = get_current_node_resource_key() 109 | options = {"resources": {node_resource_key: 0.01}} 110 | return task_or_actor.options(**options) 111 | -------------------------------------------------------------------------------- /xgboost_ray/xgb.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | if TYPE_CHECKING: 4 | import xgboost 5 | else: 6 | try: 7 | import xgboost 8 | except ImportError: 9 | xgboost = None 10 | 11 | __all__ = ["xgboost"] 12 | --------------------------------------------------------------------------------