├── .flake8 ├── .github └── workflows │ ├── gpu.yaml │ └── test.yaml ├── .gitignore ├── LICENSE ├── README.md ├── examples ├── format.sh ├── requirements ├── lint-requirements.txt └── test-requirements.txt ├── run_ci_examples.sh ├── run_ci_tests.sh ├── setup.py └── xgboost_ray ├── __init__.py ├── callback.py ├── compat ├── __init__.py └── tracker.py ├── data_sources ├── __init__.py ├── _distributed.py ├── csv.py ├── dask.py ├── data_source.py ├── modin.py ├── numpy.py ├── object_store.py ├── pandas.py ├── parquet.py ├── partitioned.py ├── petastorm.py └── ray_dataset.py ├── elastic.py ├── examples ├── __init__.py ├── create_test_data.py ├── higgs.py ├── higgs_parquet.py ├── readme.py ├── readme_sklearn_api.py ├── simple.py ├── simple_dask.py ├── simple_modin.py ├── simple_objectstore.py ├── simple_partitioned.py ├── simple_predict.py ├── simple_ray_dataset.py ├── simple_tune.py ├── train_on_test_data.py └── train_with_ml_dataset.py ├── main.py ├── matrix.py ├── session.py ├── sklearn.py ├── tests ├── __init__.py ├── conftest.py ├── env_info.sh ├── fault_tolerance.py ├── release │ ├── benchmark_cpu_gpu.py │ ├── benchmark_ft.py │ ├── cluster_cpu.yaml │ ├── cluster_ft.yaml │ ├── cluster_gpu.yaml │ ├── create_learnable_data.py │ ├── create_test_data.py │ ├── custom_objective_metric.py │ ├── run_e2e_gpu.sh │ ├── setup_xgboost.sh │ ├── start_cpu_cluster.sh │ ├── start_ft_cluster.sh │ ├── start_gpu_cluster.sh │ ├── submit_cpu_gpu_benchmark.sh │ ├── submit_ft_benchmark.sh │ ├── tune_cluster.yaml │ └── tune_placement.py ├── test_client.py ├── test_colocation.py ├── test_data_source.py ├── test_end_to_end.py ├── test_fault_tolerance.py ├── test_matrix.py ├── test_sklearn.py ├── test_sklearn_matrix.py ├── test_tune.py ├── test_xgboost_api.py └── utils.py ├── tune.py ├── util.py └── xgb.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | inline-quotes = " 4 | ignore = 5 | C408 6 | C417 7 | E121 8 | E123 9 | E126 10 | E203 11 | E226 12 | E24 13 | E704 14 | W503 15 | W504 16 | W605 17 | I 18 | N 19 | B001 20 | B002 21 | B003 22 | B004 23 | B005 24 | B007 25 | B008 26 | B009 27 | B010 28 | B011 29 | B012 30 | B013 31 | B014 32 | B015 33 | B016 34 | B017 35 | avoid-escape = no 36 | # Error E731 is ignored because of the migration from YAPF to Black. 37 | # See https://github.com/ray-project/ray/issues/21315 for more information. 
38 | per-file-ignores = 39 | rllib/evaluation/worker_set.py:E731 40 | rllib/evaluation/sampler.py:E731 41 | -------------------------------------------------------------------------------- /.github/workflows/gpu.yaml: -------------------------------------------------------------------------------- 1 | name: GPU on manual trigger 2 | 3 | on: 4 | workflow_dispatch 5 | 6 | jobs: 7 | test_gpu: 8 | runs-on: ubuntu-latest 9 | timeout-minutes: 20 10 | steps: 11 | - uses: actions/checkout@v3 12 | - name: Set up Python 3.8 13 | uses: actions/setup-python@v3 14 | with: 15 | python-version: 3.8 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | python -m pip install -U anyscale pyyaml 20 | - name: Print environment info 21 | run: | 22 | ./xgboost_ray/tests/env_info.sh 23 | - name: Set anyscale project 24 | env: 25 | ANYSCALE_PROJECT: ${{ secrets.ANYSCALE_PROJECT }} 26 | run: | 27 | echo "project_id: ${ANYSCALE_PROJECT}" > ./xgboost_ray/tests/release/.anyscale.yaml 28 | - name: Run end to end GPU test 29 | env: 30 | ANYSCALE_CLI_TOKEN: ${{ secrets.ANYSCALE_CLI_TOKEN }} 31 | run: | 32 | pushd ./xgboost_ray/tests/release 33 | ./run_e2e_gpu.sh 34 | popd || true 35 | -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: pytest on push 2 | 3 | on: 4 | push: 5 | pull_request: 6 | schedule: 7 | - cron: "0 5 * * *" 8 | 9 | jobs: 10 | test_lint: 11 | runs-on: ubuntu-latest 12 | timeout-minutes: 3 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Set up Python 3.8 16 | uses: actions/setup-python@v3 17 | with: 18 | python-version: 3.8 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | python -m pip install codecov 23 | if [ -f requirements/lint-requirements.txt ]; then python -m pip install -r requirements/lint-requirements.txt; fi 24 | - name: Print environment info 25 | run: | 26 | ./xgboost_ray/tests/env_info.sh 27 | - name: Run format script 28 | run: | 29 | ls -alp 30 | ./format.sh --all 31 | 32 | test_linux_ray_master: 33 | runs-on: ubuntu-latest 34 | timeout-minutes: 160 35 | strategy: 36 | matrix: 37 | python-version: ["3.8", "3.9", "3.10"] 38 | include: 39 | - python-version: "3.8" 40 | ray-wheel: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl 41 | - python-version: "3.9" 42 | ray-wheel: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp39-cp39-manylinux2014_x86_64.whl 43 | - python-version: "3.10" 44 | ray-wheel: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl 45 | steps: 46 | - uses: actions/checkout@v3 47 | - name: Set up Python ${{ matrix.python-version }} 48 | uses: actions/setup-python@v3 49 | with: 50 | python-version: ${{ matrix.python-version }} 51 | - name: Install dependencies 52 | run: | 53 | python -m pip install --upgrade pip 54 | python -m pip install codecov 55 | python -m pip install -U ${{ matrix.ray-wheel }} 56 | if [ -f requirements/test-requirements.txt ]; then python -m pip install -r requirements/test-requirements.txt; fi 57 | - name: Install package 58 | run: | 59 | python -m pip install -e . 
60 | - name: Print environment info 61 | run: | 62 | ./xgboost_ray/tests/env_info.sh 63 | - name: Run tests 64 | uses: nick-invision/retry@v2 65 | with: 66 | timeout_minutes: 45 67 | max_attempts: 3 68 | command: bash ./run_ci_tests.sh 69 | - name: Run examples 70 | uses: nick-invision/retry@v2 71 | with: 72 | timeout_minutes: 10 73 | max_attempts: 3 74 | command: bash ./run_ci_examples.sh 75 | 76 | test_linux_ray_release: 77 | runs-on: ubuntu-latest 78 | timeout-minutes: 160 79 | strategy: 80 | matrix: 81 | python-version: ["3.8", "3.9", "3.10"] 82 | steps: 83 | - uses: actions/checkout@v3 84 | - name: Set up Python ${{ matrix.python-version }} 85 | uses: actions/setup-python@v3 86 | with: 87 | python-version: ${{ matrix.python-version }} 88 | - name: Install dependencies 89 | run: | 90 | python -m pip install --upgrade pip 91 | python -m pip install codecov 92 | python -m pip install -U ray 93 | if [ -f requirements/test-requirements.txt ]; then python -m pip install -r requirements/test-requirements.txt; fi 94 | - name: Install package 95 | run: | 96 | python -m pip install -e . 97 | - name: Print environment info 98 | run: | 99 | ./xgboost_ray/tests/env_info.sh 100 | - name: Run tests 101 | uses: nick-invision/retry@v2 102 | with: 103 | timeout_minutes: 45 104 | max_attempts: 3 105 | command: bash ./run_ci_tests.sh 106 | - name: Run examples 107 | uses: nick-invision/retry@v2 108 | with: 109 | timeout_minutes: 10 110 | max_attempts: 3 111 | command: bash ./run_ci_examples.sh 112 | 113 | test_linux_compat: 114 | # Test compatibility when some optional libraries are missing 115 | # Test runs on latest ray release 116 | runs-on: ubuntu-latest 117 | timeout-minutes: 160 118 | strategy: 119 | matrix: 120 | python-version: ["3.8", "3.9", "3.10"] 121 | steps: 122 | - uses: actions/checkout@v3 123 | - name: Set up Python ${{ matrix.python-version }} 124 | uses: actions/setup-python@v3 125 | with: 126 | python-version: ${{ matrix.python-version }} 127 | - name: Install dependencies 128 | run: | 129 | python -m pip install --upgrade pip 130 | python -m pip install codecov 131 | python -m pip install -U ray 132 | if [ -f requirements/test-requirements.txt ]; then python -m pip install -r requirements/test-requirements.txt; fi 133 | - name: Uninstall unavailable dependencies 134 | # Disables modin and Ray Tune (via tabulate) 135 | run: | 136 | python -m pip uninstall -y modin 137 | python -m pip uninstall -y tabulate 138 | - name: Install package 139 | run: | 140 | python -m pip install -e . 141 | - name: Print environment info 142 | run: | 143 | ./xgboost_ray/tests/env_info.sh 144 | - name: Run tests 145 | uses: nick-invision/retry@v2 146 | with: 147 | timeout_minutes: 45 148 | max_attempts: 3 149 | command: bash ./run_ci_tests.sh --no-tune 150 | - name: Run examples 151 | uses: nick-invision/retry@v2 152 | with: 153 | timeout_minutes: 10 154 | max_attempts: 3 155 | command: bash ./run_ci_examples.sh --no-tune 156 | 157 | test_linux_cutting_edge: 158 | # Tests on cutting edge, i.e. 
latest Ray master, latest XGBoost master 159 | runs-on: ubuntu-latest 160 | timeout-minutes: 160 161 | strategy: 162 | matrix: 163 | # no new versions for xgboost are published for 3.6 164 | python-version: ["3.8", "3.9", "3.10"] 165 | include: 166 | - python-version: "3.8" 167 | ray-wheel: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl 168 | - python-version: "3.9" 169 | ray-wheel: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp39-cp39-manylinux2014_x86_64.whl 170 | - python-version: "3.10" 171 | ray-wheel: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl 172 | steps: 173 | - uses: actions/checkout@v3 174 | - name: Set up Python ${{ matrix.python-version }} 175 | uses: actions/setup-python@v3 176 | with: 177 | python-version: ${{ matrix.python-version }} 178 | - name: Install dependencies 179 | run: | 180 | python -m pip install --upgrade pip 181 | python -m pip install codecov 182 | python -m pip install -U ${{ matrix.ray-wheel }} 183 | if [ -f requirements/test-requirements.txt ]; then python -m pip install -r requirements/test-requirements.txt; fi 184 | - name: Install Ubuntu system dependencies 185 | run: | 186 | sudo apt-get install -y --no-install-recommends ninja-build 187 | - name: Install package 188 | run: | 189 | python -m pip install -e . 190 | - name: Clone XGBoost repo 191 | uses: actions/checkout@v3 192 | with: 193 | repository: dmlc/xgboost 194 | path: xgboost 195 | submodules: true 196 | - name: Install XGBoost from source 197 | shell: bash -l {0} 198 | run: | 199 | pushd ${GITHUB_WORKSPACE}/xgboost/python-package 200 | python --version 201 | python setup.py sdist 202 | pip install -v ./dist/xgboost-*.tar.gz 203 | popd 204 | - name: Print environment info 205 | run: | 206 | ./xgboost_ray/tests/env_info.sh 207 | - name: Run tests 208 | uses: nick-invision/retry@v2 209 | with: 210 | timeout_minutes: 45 211 | max_attempts: 3 212 | command: bash ./run_ci_tests.sh 213 | - name: Run examples 214 | uses: nick-invision/retry@v2 215 | with: 216 | timeout_minutes: 10 217 | max_attempts: 3 218 | command: bash ./run_ci_examples.sh 219 | 220 | test_linux_xgboost_legacy: 221 | # Tests on XGBoost 0.90 and latest Ray release 222 | runs-on: ubuntu-latest 223 | timeout-minutes: 160 224 | strategy: 225 | matrix: 226 | python-version: [3.8] 227 | steps: 228 | - uses: actions/checkout@v3 229 | - name: Set up Python ${{ matrix.python-version }} 230 | uses: actions/setup-python@v3 231 | with: 232 | python-version: ${{ matrix.python-version }} 233 | - name: Install dependencies 234 | run: | 235 | python -m pip install --upgrade pip 236 | python -m pip install codecov 237 | python -m pip install -U ray 238 | if [ -f requirements/test-requirements.txt ]; then python -m pip install -r requirements/test-requirements.txt; fi 239 | - name: Install package 240 | run: | 241 | python -m pip install -e . 
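      # Note (editorial): the xgboost==0.90 pin in the next step is installed
      # *after* the package itself, so it is not overridden by the newer
      # xgboost that `pip install -e .` resolves for the `xgboost>=0.90`
      # requirement.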
242 | - name: Install legacy XGBoost 243 | run: | 244 | python -m pip install xgboost==0.90 245 | - name: Print environment info 246 | run: | 247 | ./xgboost_ray/tests/env_info.sh 248 | - name: Run tests 249 | uses: nick-invision/retry@v2 250 | with: 251 | timeout_minutes: 45 252 | max_attempts: 3 253 | command: bash ./run_ci_tests.sh 254 | - name: Run examples 255 | uses: nick-invision/retry@v2 256 | with: 257 | timeout_minutes: 10 258 | max_attempts: 3 259 | command: bash ./run_ci_examples.sh 260 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python byte code files 2 | *.pyc 3 | python/.eggs 4 | 5 | # Backup files 6 | *.bak 7 | 8 | # Emacs temporary files 9 | *~ 10 | *# 11 | 12 | # Debug symbols 13 | *.pdb 14 | 15 | # Visual Studio files 16 | /packages 17 | *.suo 18 | *.user 19 | *.VC.db 20 | *.VC.opendb 21 | 22 | # Protobuf-generated files 23 | *_pb2.py 24 | *.pb.h 25 | *.pb.cc 26 | 27 | # Ray cluster configuration 28 | scripts/nodes.txt 29 | 30 | # OS X folder attributes 31 | .DS_Store 32 | 33 | # Debug files 34 | *.dSYM/ 35 | *.su 36 | 37 | # Python setup files 38 | *.egg-info 39 | 40 | # Compressed files 41 | *.gz 42 | 43 | # Datasets from examples 44 | **/MNIST_data/ 45 | **/cifar-10-batches-bin/ 46 | 47 | # Generated documentation files 48 | /doc/_build 49 | /doc/source/_static/thumbs 50 | /doc/source/tune/generated_guides/ 51 | 52 | # User-specific stuff: 53 | .idea/ 54 | 55 | # Pytest Cache 56 | **/.pytest_cache 57 | **/.cache 58 | .benchmarks 59 | python-driver-* 60 | 61 | # Vscode 62 | .vscode/ 63 | 64 | *.iml 65 | 66 | # python virtual env 67 | venv 68 | 69 | # pyenv version file 70 | .python-version 71 | 72 | # Vim 73 | .*.swp 74 | *.swp 75 | tags 76 | 77 | # Emacs 78 | .#* 79 | 80 | # tools 81 | tools/prometheus* 82 | 83 | # ray project files 84 | project-id 85 | .mypy_cache/ 86 | 87 | # XGBoost models from examples 88 | *.xgb 89 | 90 | # Downloaded test data 91 | *.csv 92 | *.csv.gz 93 | *.parquet 94 | 95 | # Byte-compiled files 96 | __pycache__/ -------------------------------------------------------------------------------- /examples: -------------------------------------------------------------------------------- 1 | xgboost_ray/examples -------------------------------------------------------------------------------- /format.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Black + Clang formatter (if installed). This script formats all changed files from the last mergebase. 3 | # You are encouraged to run this locally before pushing changes for review. 4 | 5 | # Cause the script to exit if a single command fails 6 | set -euo pipefail 7 | 8 | FLAKE8_VERSION_REQUIRED="3.9.1" 9 | BLACK_VERSION_REQUIRED="22.10.0" 10 | SHELLCHECK_VERSION_REQUIRED="0.7.1" 11 | ISORT_VERSION_REQUIRED="5.10.1" 12 | 13 | check_python_command_exist() { 14 | VERSION="" 15 | case "$1" in 16 | black) 17 | VERSION=$BLACK_VERSION_REQUIRED 18 | ;; 19 | flake8) 20 | VERSION=$FLAKE8_VERSION_REQUIRED 21 | ;; 22 | isort) 23 | VERSION=$ISORT_VERSION_REQUIRED 24 | ;; 25 | *) 26 | echo "$1 is not a required dependency" 27 | exit 1 28 | esac 29 | if ! [ -x "$(command -v "$1")" ]; then 30 | echo "$1 not installed. Install the python package with: pip install $1==$VERSION" 31 | exit 1 32 | fi 33 | } 34 | 35 | check_docstyle() { 36 | echo "Checking docstyle..." 
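  # The grep below flags docstring argument lines of the form
  # "name (Type): ...", i.e. pydoc args that redundantly repeat the type.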
37 | violations=$(git ls-files | grep '.py$' | xargs grep -E '^[ ]+[a-z_]+ ?\([a-zA-Z]+\): ' | grep -v 'str(' | grep -v noqa || true) 38 | if [[ -n "$violations" ]]; then 39 | echo 40 | echo "=== Found Ray docstyle violations ===" 41 | echo "$violations" 42 | echo 43 | echo "Per the Google pydoc style, omit types from pydoc args as they are redundant: https://docs.ray.io/en/latest/ray-contribute/getting-involved.html#code-style " 44 | echo "If this is a false positive, you can add a '# noqa' comment to the line to ignore." 45 | exit 1 46 | fi 47 | return 0 48 | } 49 | 50 | check_python_command_exist black 51 | check_python_command_exist flake8 52 | check_python_command_exist isort 53 | 54 | # this stops git rev-parse from failing if we run this from the .git directory 55 | builtin cd "$(dirname "${BASH_SOURCE:-$0}")" 56 | 57 | ROOT="$(git rev-parse --show-toplevel)" 58 | builtin cd "$ROOT" || exit 1 59 | 60 | # NOTE(edoakes): black version differs based on installation method: 61 | # Option 1) 'black, 21.12b0 (compiled: no)' 62 | # Option 2) 'black, version 21.12b0' 63 | # For newer versions (at least 22.10.0), a second line is printed which must be dropped: 64 | # 65 | # black, 22.10.0 (compiled: yes) 66 | # Python (CPython) 3.9.13 67 | BLACK_VERSION_STR=$(black --version) 68 | if [[ "$BLACK_VERSION_STR" == *"compiled"* ]] 69 | then 70 | BLACK_VERSION=$(echo "$BLACK_VERSION_STR" | head -n 1 | awk '{print $2}') 71 | else 72 | BLACK_VERSION=$(echo "$BLACK_VERSION_STR" | head -n 1 | awk '{print $3}') 73 | fi 74 | FLAKE8_VERSION=$(flake8 --version | head -n 1 | awk '{print $1}') 75 | ISORT_VERSION=$(isort --version | grep VERSION | awk '{print $2}') 76 | 77 | # params: tool name, tool version, required version 78 | tool_version_check() { 79 | if [ "$2" != "$3" ]; then 80 | echo "WARNING: Ray uses $1 $3, You currently are using $2. This might generate different results." 81 | fi 82 | } 83 | 84 | tool_version_check "flake8" "$FLAKE8_VERSION" "$FLAKE8_VERSION_REQUIRED" 85 | tool_version_check "black" "$BLACK_VERSION" "$BLACK_VERSION_REQUIRED" 86 | tool_version_check "isort" "$ISORT_VERSION" "$ISORT_VERSION_REQUIRED" 87 | 88 | if command -v shellcheck >/dev/null; then 89 | SHELLCHECK_VERSION=$(shellcheck --version | awk '/^version:/ {print $2}') 90 | tool_version_check "shellcheck" "$SHELLCHECK_VERSION" "$SHELLCHECK_VERSION_REQUIRED" 91 | else 92 | echo "INFO: Ray uses shellcheck for shell scripts, which is not installed. You may install shellcheck=$SHELLCHECK_VERSION_REQUIRED with your system package manager." 93 | fi 94 | 95 | if command -v clang-format >/dev/null; then 96 | CLANG_FORMAT_VERSION=$(clang-format --version | awk '{print $3}') 97 | tool_version_check "clang-format" "$CLANG_FORMAT_VERSION" "12.0.0" 98 | else 99 | echo "WARNING: clang-format is not installed!" 100 | fi 101 | 102 | if [[ $(flake8 --version) != *"flake8_quotes"* ]]; then 103 | echo "WARNING: Ray uses flake8 with flake8_quotes. Might error without it. Install with: pip install flake8-quotes" 104 | fi 105 | 106 | if [[ $(flake8 --version) != *"flake8-bugbear"* ]]; then 107 | echo "WARNING: Ray uses flake8 with flake8-bugbear. Might error without it. Install with: pip install flake8-bugbear" 108 | fi 109 | 110 | SHELLCHECK_FLAGS=( 111 | --exclude=1090 # "Can't follow non-constant source. Use a directive to specify location." 112 | --exclude=1091 # "Not following {file} due to some error" 113 | --exclude=2207 # "Prefer mapfile or read -a to split command output (or quote to avoid splitting)." 
-- these aren't compatible with macOS's old Bash 114 | ) 115 | 116 | 117 | BLACK_EXCLUDES=( 118 | '--force-exclude' 119 | 'python/ray/cloudpickle/*|'` 120 | `'python/build/*|'` 121 | `'python/ray/core/src/ray/gcs/*|'` 122 | `'python/ray/thirdparty_files/*|'` 123 | `'python/ray/_private/thirdparty/*|'` 124 | `'python/ray/serve/tests/test_config_files/syntax_error\.py' 125 | ) 126 | 127 | GIT_LS_EXCLUDES=( 128 | ':(exclude)python/ray/cloudpickle/' 129 | ':(exclude)python/ray/_private/runtime_env/_clonevirtualenv.py' 130 | ) 131 | 132 | # TODO(barakmich): This should be cleaned up. I've at least excised the copies 133 | # of these arguments to this location, but the long-term answer is to actually 134 | # make a flake8 config file 135 | FLAKE8_PYX_IGNORES="--ignore=C408,E121,E123,E126,E211,E225,E226,E227,E24,E704,E999,W503,W504,W605" 136 | 137 | shellcheck_scripts() { 138 | shellcheck "${SHELLCHECK_FLAGS[@]}" "$@" 139 | } 140 | 141 | # Format specified files 142 | format_files() { 143 | local shell_files=() python_files=() bazel_files=() 144 | 145 | local name 146 | for name in "$@"; do 147 | local base="${name%.*}" 148 | local suffix="${name#"${base}"}" 149 | 150 | local shebang="" 151 | read -r shebang < "${name}" || true 152 | case "${shebang}" in 153 | '#!'*) 154 | shebang="${shebang#/usr/bin/env }" 155 | shebang="${shebang%% *}" 156 | shebang="${shebang##*/}" 157 | ;; 158 | esac 159 | 160 | if [ "${base}" = "WORKSPACE" ] || [ "${base}" = "BUILD" ] || [ "${suffix}" = ".BUILD" ] || [ "${suffix}" = ".bazel" ] || [ "${suffix}" = ".bzl" ]; then 161 | bazel_files+=("${name}") 162 | elif [ -z "${suffix}" ] && [ "${shebang}" != "${shebang#python}" ] || [ "${suffix}" != "${suffix#.py}" ]; then 163 | python_files+=("${name}") 164 | elif [ -z "${suffix}" ] && [ "${shebang}" != "${shebang%sh}" ] || [ "${suffix}" != "${suffix#.sh}" ]; then 165 | shell_files+=("${name}") 166 | else 167 | echo "error: failed to determine file type: ${name}" 1>&2 168 | return 1 169 | fi 170 | done 171 | 172 | if [ 0 -lt "${#python_files[@]}" ]; then 173 | isort "${python_files[@]}" 174 | black "${python_files[@]}" 175 | fi 176 | 177 | if command -v shellcheck >/dev/null; then 178 | if shellcheck --shell=sh --format=diff - < /dev/null; then 179 | if [ 0 -lt "${#shell_files[@]}" ]; then 180 | local difference 181 | difference="$(shellcheck_scripts --format=diff "${shell_files[@]}" || true && printf "-")" 182 | difference="${difference%-}" 183 | printf "%s" "${difference}" | patch -p1 184 | fi 185 | else 186 | echo "error: this version of shellcheck does not support diffs" 187 | fi 188 | fi 189 | } 190 | 191 | format_all_scripts() { 192 | command -v flake8 &> /dev/null; 193 | HAS_FLAKE8=$? 194 | 195 | # Run isort before black to fix imports and let black deal with file format. 196 | echo "$(date)" "isort...." 197 | git ls-files -- '*.py' "${GIT_LS_EXCLUDES[@]}" | xargs -P 10 \ 198 | isort 199 | echo "$(date)" "Black...." 200 | git ls-files -- '*.py' "${GIT_LS_EXCLUDES[@]}" | xargs -P 10 \ 201 | black "${BLACK_EXCLUDES[@]}" 202 | if [ "$HAS_FLAKE8" -eq 0 ]; then 203 | echo "$(date)" "Flake8...." 
204 | git ls-files -- '*.py' "${GIT_LS_EXCLUDES[@]}" | xargs -P 5 \ 205 | flake8 --config=.flake8 206 | fi 207 | 208 | if command -v shellcheck >/dev/null; then 209 | local shell_files non_shell_files 210 | non_shell_files=($(git ls-files -- ':(exclude)*.sh')) 211 | shell_files=($(git ls-files -- '*.sh')) 212 | if [ 0 -lt "${#non_shell_files[@]}" ]; then 213 | shell_files+=($(git --no-pager grep -l -- '^#!\(/usr\)\?/bin/\(env \+\)\?\(ba\)\?sh' "${non_shell_files[@]}" || true)) 214 | fi 215 | if [ 0 -lt "${#shell_files[@]}" ]; then 216 | echo "$(date)" "shellcheck scripts...." 217 | shellcheck_scripts "${shell_files[@]}" 218 | fi 219 | fi 220 | } 221 | 222 | # Format files that differ from main branch. Ignores dirs that are not slated 223 | # for autoformat yet. 224 | format_changed() { 225 | # The `if` guard ensures that the list of filenames is not empty, which 226 | # could cause the formatter to receive 0 positional arguments, making 227 | # Black error. 228 | # 229 | # `diff-filter=ACRM` and $MERGEBASE is to ensure we only format files that 230 | # exist on both branches. 231 | MERGEBASE="$(git merge-base upstream/master HEAD)" 232 | 233 | if ! git diff --diff-filter=ACRM --quiet --exit-code "$MERGEBASE" -- '*.py' &>/dev/null; then 234 | git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.py' | xargs -P 5 \ 235 | isort 236 | fi 237 | 238 | if ! git diff --diff-filter=ACRM --quiet --exit-code "$MERGEBASE" -- '*.py' &>/dev/null; then 239 | git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.py' | xargs -P 5 \ 240 | black "${BLACK_EXCLUDES[@]}" 241 | if which flake8 >/dev/null; then 242 | git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.py' | xargs -P 5 \ 243 | flake8 --config=.flake8 244 | fi 245 | fi 246 | 247 | if ! git diff --diff-filter=ACRM --quiet --exit-code "$MERGEBASE" -- '*.pyx' '*.pxd' '*.pxi' &>/dev/null; then 248 | if which flake8 >/dev/null; then 249 | git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.pyx' '*.pxd' '*.pxi' | xargs -P 5 \ 250 | flake8 --config=.flake8 "$FLAKE8_PYX_IGNORES" 251 | fi 252 | fi 253 | 254 | if which clang-format >/dev/null; then 255 | if ! git diff --diff-filter=ACRM --quiet --exit-code "$MERGEBASE" -- '*.cc' '*.h' &>/dev/null; then 256 | git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.cc' '*.h' | xargs -P 5 \ 257 | clang-format -i 258 | fi 259 | fi 260 | 261 | if command -v shellcheck >/dev/null; then 262 | local shell_files non_shell_files 263 | non_shell_files=($(git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- ':(exclude)*.sh')) 264 | shell_files=($(git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.sh')) 265 | if [ 0 -lt "${#non_shell_files[@]}" ]; then 266 | shell_files+=($(git --no-pager grep -l -- '^#!\(/usr\)\?/bin/\(env \+\)\?\(ba\)\?sh' "${non_shell_files[@]}" || true)) 267 | fi 268 | if [ 0 -lt "${#shell_files[@]}" ]; then 269 | shellcheck_scripts "${shell_files[@]}" 270 | fi 271 | fi 272 | } 273 | 274 | # This flag formats individual files. --files *must* be the first command line 275 | # arg to use this option. 276 | if [ "${1-}" == '--files' ]; then 277 | format_files "${@:2}" 278 | # If `--all` or `--all-scripts` are passed, then any further arguments are ignored. 279 | # Format the entire python directory and other scripts. 280 | elif [ "${1-}" == '--all-scripts' ]; then 281 | format_all_scripts "${@}" 282 | if [ -n "${FORMAT_SH_PRINT_DIFF-}" ]; then git --no-pager diff; fi 283 | # Format all Python, C++, Java and other script files. 
284 | elif [ "${1-}" == '--all' ]; then 285 | format_all_scripts "${@}" 286 | if [ -n "${FORMAT_SH_PRINT_DIFF-}" ]; then git --no-pager diff; fi 287 | else 288 | # Add the upstream remote if it doesn't exist 289 | if ! git remote -v | grep -q upstream; then 290 | git remote add 'upstream' 'https://github.com/ray-project/xgboost_ray.git' 291 | fi 292 | 293 | # Only fetch master since that's the branch we're diffing against. 294 | git fetch upstream master || true 295 | 296 | # Format only the files that changed in last commit. 297 | format_changed 298 | fi 299 | 300 | check_docstyle 301 | 302 | if ! git diff --quiet &>/dev/null; then 303 | echo 'Reformatted changed files. Please review and stage the changes.' 304 | echo 'Files updated:' 305 | echo 306 | 307 | git --no-pager diff --name-only 308 | 309 | exit 1 310 | fi 311 | -------------------------------------------------------------------------------- /requirements/lint-requirements.txt: -------------------------------------------------------------------------------- 1 | flake8==3.9.1 2 | flake8-comprehensions==3.10.1 3 | flake8-quotes==2.0.0 4 | flake8-bugbear==21.9.2 5 | black==22.10.0 6 | isort==5.10.1 7 | importlib-metadata==4.13.0 8 | -------------------------------------------------------------------------------- /requirements/test-requirements.txt: -------------------------------------------------------------------------------- 1 | packaging 2 | petastorm 3 | pytest 4 | pyarrow<15.0.0 5 | ray[tune, data, default] 6 | scikit-learn 7 | # modin==0.23.1.post0 is not compatible with xgboost_ray py38 8 | modin<=0.23.1; python_version == '3.8' 9 | # modin==0.26.0 is not compatible with xgboost_ray py39+ 10 | modin<0.26.0; python_version > '3.8' 11 | dask 12 | 13 | #workaround for now 14 | protobuf<4.0.0 15 | tensorboardX==2.2 16 | -------------------------------------------------------------------------------- /run_ci_examples.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | TUNE=1 5 | 6 | for i in "$@" 7 | do 8 | echo "$i" 9 | case "$i" in 10 | --no-tune) 11 | TUNE=0 12 | ;; 13 | *) 14 | echo "unknown arg, $i" 15 | exit 1 16 | ;; 17 | esac 18 | done 19 | 20 | pushd xgboost_ray/examples/ || exit 1 21 | ray stop || true 22 | echo "================" 23 | echo "Running examples" 24 | echo "================" 25 | echo "running readme.py" && python readme.py 26 | echo "running readme_sklearn_api.py" && python readme_sklearn_api.py 27 | echo "running simple.py" && python simple.py --smoke-test 28 | echo "running simple_predict.py" && python simple_predict.py 29 | echo "running simple_dask.py" && python simple_dask.py --smoke-test 30 | echo "running simple_modin.py" && python simple_modin.py --smoke-test 31 | echo "running simple_objectstore.py" && python simple_objectstore.py --smoke-test 32 | echo "running simple_ray_dataset.py" && python simple_objectstore.py --smoke-test 33 | echo "running simple_partitioned.py" && python simple_partitioned.py --smoke-test 34 | 35 | if [ "$TUNE" = "1" ]; then 36 | echo "running simple_tune.py" && python simple_tune.py --smoke-test 37 | else 38 | echo "skipping tune example" 39 | fi 40 | 41 | echo "running train_on_test_data.py" && python train_on_test_data.py --smoke-test 42 | popd 43 | 44 | pushd xgboost_ray/tests 45 | echo "running examples with Ray Client" 46 | python -m pytest -v --durations=0 -x test_client.py 47 | popd || exit 1 48 | -------------------------------------------------------------------------------- /run_ci_tests.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | TUNE=1 3 | 4 | for i in "$@" 5 | do 6 | echo "$i" 7 | case "$i" in 8 | --no-tune) 9 | TUNE=0 10 | ;; 11 | *) 12 | echo "unknown arg, $i" 13 | exit 1 14 | ;; 15 | esac 16 | done 17 | 18 | pushd xgboost_ray/tests || exit 1 19 | echo "=============" 20 | echo "Running tests" 21 | echo "=============" 22 | END_STATUS=0 23 | if ! python -m pytest -vv -s --log-cli-level=DEBUG --durations=0 -x "test_colocation.py" ; then END_STATUS=1; fi 24 | if ! python -m pytest -v --durations=0 -x "test_matrix.py" ; then END_STATUS=1; fi 25 | if ! python -m pytest -v --durations=0 -x "test_data_source.py" ; then END_STATUS=1; fi 26 | if ! python -m pytest -v --durations=0 -x "test_xgboost_api.py" ; then END_STATUS=1; fi 27 | if ! python -m pytest -v --durations=0 -x "test_fault_tolerance.py" ; then END_STATUS=1; fi 28 | if ! python -m pytest -v --durations=0 -x "test_end_to_end.py" ; then END_STATUS=1; fi 29 | if ! python -m pytest -v --durations=0 -x "test_sklearn.py" ; then END_STATUS=1; fi 30 | if ! python -m pytest -v --durations=0 -x "test_sklearn_matrix.py" ; then END_STATUS=1; fi 31 | 32 | if [ "$TUNE" = "1" ]; then 33 | if ! python -m pytest -v --durations=0 -x "test_tune.py" ; then END_STATUS=1; fi 34 | else 35 | echo "skipping tune tests" 36 | fi 37 | 38 | echo "running smoke test on benchmark_cpu_gpu.py" && if ! python release/benchmark_cpu_gpu.py 2 10 20 --smoke-test; then END_STATUS=1; fi 39 | popd || exit 1 40 | 41 | if [ "$END_STATUS" = "1" ]; then 42 | echo "At least one test has failed, exiting with code 1" 43 | fi 44 | exit "$END_STATUS" -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name="xgboost_ray", 5 | packages=find_packages(where=".", include=["xgboost_ray*"]), 6 | version="0.1.20", 7 | author="Ray Team", 8 | description="A Ray backend for distributed XGBoost", 9 | license="Apache 2.0", 10 | long_description="A distributed backend for XGBoost built on top of " 11 | "the distributed computing framework Ray.", 12 | url="https://github.com/ray-project/xgboost_ray", 13 | install_requires=[ 14 | "ray>=2.7", 15 | "numpy>=1.16", 16 | "pandas", 17 | "wrapt>=1.12.1", 18 | "xgboost>=0.90", 19 | "packaging", 20 | ], 21 | ) 22 | -------------------------------------------------------------------------------- /xgboost_ray/__init__.py: -------------------------------------------------------------------------------- 1 | from xgboost_ray.main import RayParams, predict, train 2 | from xgboost_ray.matrix import ( 3 | Data, 4 | RayDeviceQuantileDMatrix, 5 | RayDMatrix, 6 | RayFileType, 7 | RayShardingMode, 8 | combine_data, 9 | ) 10 | 11 | # workaround for legacy xgboost==0.90 12 | try: 13 | from xgboost_ray.sklearn import ( 14 | RayXGBClassifier, 15 | RayXGBRanker, 16 | RayXGBRegressor, 17 | RayXGBRFClassifier, 18 | RayXGBRFRegressor, 19 | ) 20 | except ImportError: 21 | pass 22 | 23 | __version__ = "0.1.20" 24 | 25 | __all__ = [ 26 | "__version__", 27 | "RayParams", 28 | "RayDMatrix", 29 | "RayDeviceQuantileDMatrix", 30 | "RayFileType", 31 | "RayShardingMode", 32 | "Data", 33 | "combine_data", 34 | "train", 35 | "predict", 36 | "RayXGBClassifier", 37 | "RayXGBRegressor", 38 | "RayXGBRFClassifier", 39 | "RayXGBRFRegressor", 40 | "RayXGBRanker", 41 | ] 42 | 
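The exports above form the package's drop-in replacements for `xgboost.train` and `xgboost.DMatrix`. As orientation, a minimal usage sketch (mirroring the pattern in `xgboost_ray/examples/readme.py`; the dataset and parameter values are illustrative, not prescriptive):

from sklearn.datasets import load_breast_cancer
from xgboost_ray import RayDMatrix, RayParams, train

# Wrap the training data in a RayDMatrix so it can be sharded across actors.
train_x, train_y = load_breast_cancer(return_X_y=True)
train_set = RayDMatrix(train_x, train_y)

evals_result = {}
bst = train(
    {"objective": "binary:logistic", "eval_metric": ["logloss", "error"]},
    train_set,
    evals_result=evals_result,
    evals=[(train_set, "train")],
    verbose_eval=False,
    # Two training actors with one CPU each; scale these to the cluster.
    ray_params=RayParams(num_actors=2, cpus_per_actor=1),
)
bst.save_model("model.xgb")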
-------------------------------------------------------------------------------- /xgboost_ray/callback.py: -------------------------------------------------------------------------------- 1 | import os 2 | from abc import ABC 3 | from typing import TYPE_CHECKING, Any, Dict, Sequence, Union 4 | 5 | import pandas as pd 6 | from ray.util.annotations import DeveloperAPI, PublicAPI 7 | 8 | if TYPE_CHECKING: 9 | from xgboost_ray.main import RayXGBoostActor 10 | from xgboost_ray.matrix import RayDMatrix 11 | 12 | 13 | @PublicAPI(stability="beta") 14 | class DistributedCallback(ABC): 15 | """Distributed callbacks for RayXGBoostActors. 16 | 17 | The hooks of these callbacks are executed on the remote Ray actors 18 | at different points in time. They can be used to set environment 19 | variables or to prepare the training/prediction environment in other 20 | ways. Distributed callback objects are de-serialized on each actor 21 | and are then independent of each other - changing the state of one 22 | callback will not alter the state of the other copies on different actors. 23 | 24 | Callbacks can be passed to xgboost_ray via 25 | :class:`RayParams <xgboost_ray.main.RayParams>` using the 26 | ``distributed_callbacks`` parameter. 27 | """ 28 | 29 | def on_init(self, actor: "RayXGBoostActor", *args, **kwargs): 30 | pass 31 | 32 | def before_data_loading( 33 | self, actor: "RayXGBoostActor", data: "RayDMatrix", *args, **kwargs 34 | ): 35 | pass 36 | 37 | def after_data_loading( 38 | self, actor: "RayXGBoostActor", data: "RayDMatrix", *args, **kwargs 39 | ): 40 | pass 41 | 42 | def before_train(self, actor: "RayXGBoostActor", *args, **kwargs): 43 | pass 44 | 45 | def after_train(self, actor: "RayXGBoostActor", result_dict: Dict, *args, **kwargs): 46 | pass 47 | 48 | def before_predict(self, actor: "RayXGBoostActor", *args, **kwargs): 49 | pass 50 | 51 | def after_predict( 52 | self, 53 | actor: "RayXGBoostActor", 54 | predictions: Union[pd.Series, pd.DataFrame], 55 | *args, 56 | **kwargs 57 | ): 58 | pass 59 | 60 | 61 | @DeveloperAPI 62 | class DistributedCallbackContainer: 63 | def __init__(self, callbacks: Sequence[DistributedCallback]): 64 | self.callbacks = callbacks or [] 65 | 66 | def on_init(self, actor: "RayXGBoostActor", *args, **kwargs): 67 | for callback in self.callbacks: 68 | callback.on_init(actor, *args, **kwargs) 69 | 70 | def before_data_loading( 71 | self, actor: "RayXGBoostActor", data: "RayDMatrix", *args, **kwargs 72 | ): 73 | for callback in self.callbacks: 74 | callback.before_data_loading(actor, data, *args, **kwargs) 75 | 76 | def after_data_loading( 77 | self, actor: "RayXGBoostActor", data: "RayDMatrix", *args, **kwargs 78 | ): 79 | for callback in self.callbacks: 80 | callback.after_data_loading(actor, data, *args, **kwargs) 81 | 82 | def before_train(self, actor: "RayXGBoostActor", *args, **kwargs): 83 | for callback in self.callbacks: 84 | callback.before_train(actor, *args, **kwargs) 85 | 86 | def after_train(self, actor: "RayXGBoostActor", result_dict: Dict, *args, **kwargs): 87 | for callback in self.callbacks: 88 | callback.after_train(actor, result_dict, *args, **kwargs) 89 | 90 | def before_predict(self, actor: "RayXGBoostActor", *args, **kwargs): 91 | for callback in self.callbacks: 92 | callback.before_predict(actor, *args, **kwargs) 93 | 94 | def after_predict( 95 | self, 96 | actor: "RayXGBoostActor", 97 | predictions: Union[pd.Series, pd.DataFrame], 98 | *args, 99 | **kwargs 100 | ): 101 | for callback in self.callbacks: 102 | callback.after_predict(actor, predictions, *args, **kwargs) 103 | 
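# Usage sketch (hypothetical, not part of this module): per the
# DistributedCallback docstring above, instances are passed through
# ``RayParams``, e.g. with the EnvironmentCallback defined below:
#
#   from xgboost_ray import RayParams, train
#   train(
#       params,
#       dtrain,
#       ray_params=RayParams(
#           num_actors=2,
#           distributed_callbacks=[EnvironmentCallback({"MY_ENV_VAR": "1"})],
#       ),
#   )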
104 | 105 | class EnvironmentCallback(DistributedCallback): 106 | def __init__(self, env_dict: Dict[str, Any]): 107 | self.env_dict = env_dict 108 | 109 | def on_init(self, actor, *args, **kwargs): 110 | os.environ.update(self.env_dict) 111 | -------------------------------------------------------------------------------- /xgboost_ray/compat/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | if TYPE_CHECKING: 4 | from xgboost_ray.xgb import xgboost as xgb 5 | 6 | try: 7 | from xgboost.callback import TrainingCallback 8 | 9 | LEGACY_CALLBACK = False 10 | except ImportError: 11 | 12 | class TrainingCallback: 13 | def __init__(self): 14 | if hasattr(self, "before_iteration"): 15 | # XGBoost < 1.0 is looking up __dict__ to see if a 16 | # callback should be called before or after an iteration. 17 | # So here we move this to self._before_iteration and 18 | # overwrite the dict. 19 | self._before_iteration = getattr(self, "before_iteration") 20 | self.__dict__["before_iteration"] = True 21 | 22 | def __call__(self, callback_env: "xgb.core.CallbackEnv"): 23 | if hasattr(self, "_before_iteration"): 24 | self._before_iteration( 25 | model=callback_env.model, 26 | epoch=callback_env.iteration, 27 | evals_log=callback_env.evaluation_result_list, 28 | ) 29 | 30 | if hasattr(self, "after_iteration"): 31 | self.after_iteration( 32 | model=callback_env.model, 33 | epoch=callback_env.iteration, 34 | evals_log=callback_env.evaluation_result_list, 35 | ) 36 | 37 | def before_training(self, model): 38 | pass 39 | 40 | def after_training(self, model): 41 | pass 42 | 43 | LEGACY_CALLBACK = True 44 | 45 | try: 46 | from xgboost import RabitTracker 47 | except ImportError: 48 | from xgboost_ray.compat.tracker import RabitTracker 49 | 50 | __all__ = ["TrainingCallback", "RabitTracker"] 51 | -------------------------------------------------------------------------------- /xgboost_ray/data_sources/__init__.py: -------------------------------------------------------------------------------- 1 | from xgboost_ray.data_sources.csv import CSV 2 | from xgboost_ray.data_sources.dask import Dask 3 | from xgboost_ray.data_sources.data_source import DataSource, RayFileType 4 | from xgboost_ray.data_sources.modin import Modin 5 | from xgboost_ray.data_sources.numpy import Numpy 6 | from xgboost_ray.data_sources.object_store import ObjectStore 7 | from xgboost_ray.data_sources.pandas import Pandas 8 | from xgboost_ray.data_sources.parquet import Parquet 9 | from xgboost_ray.data_sources.partitioned import Partitioned 10 | from xgboost_ray.data_sources.petastorm import Petastorm 11 | from xgboost_ray.data_sources.ray_dataset import RayDataset 12 | 13 | data_sources = [ 14 | Numpy, 15 | Pandas, 16 | Partitioned, 17 | Modin, 18 | Dask, 19 | Petastorm, 20 | CSV, 21 | Parquet, 22 | ObjectStore, 23 | RayDataset, 24 | ] 25 | 26 | __all__ = [ 27 | "DataSource", 28 | "RayFileType", 29 | "Numpy", 30 | "Pandas", 31 | "Modin", 32 | "Dask", 33 | "Petastorm", 34 | "CSV", 35 | "Parquet", 36 | "ObjectStore", 37 | "RayDataset", 38 | "Partitioned", 39 | ] 40 | -------------------------------------------------------------------------------- /xgboost_ray/data_sources/_distributed.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import math 3 | from collections import defaultdict 4 | from typing import Any, Dict, Sequence 5 | 6 | import ray 7 | from ray.actor import ActorHandle 8 | 9 | 10 | def 
get_actor_rank_ips(actors: Sequence[ActorHandle]) -> Dict[int, str]: 11 | """Get a dict mapping from actor ranks to their IPs""" 12 | no_obj = ray.put(None) 13 | # Build a dict mapping actor ranks to their IP addresses 14 | actor_rank_ips: Dict[int, str] = dict( 15 | enumerate( 16 | ray.get( 17 | [actor.ip.remote() if actor is not None else no_obj for actor in actors] 18 | ) 19 | ) 20 | ) 21 | return actor_rank_ips 22 | 23 | 24 | def assign_partitions_to_actors( 25 | ip_to_parts: Dict[int, Any], actor_rank_ips: Dict[int, str] 26 | ) -> Dict[int, Sequence[Any]]: 27 | """Assign partitions from a distributed dataframe to actors. 28 | 29 | This function collects distributed partitions and evenly distributes 30 | them to actors, trying to minimize data transfer by respecting 31 | co-locality. 32 | 33 | This function currently does _not_ take partition sizes into account 34 | for distributing data. It assumes that all partitions have (more or less) 35 | the same length. 36 | 37 | Instead, partitions are evenly distributed. E.g. for 8 partitions and 3 38 | actors, each actor gets assigned 2 or 3 partitions. Which partitions are 39 | assigned depends on the data locality. 40 | 41 | The algorithm is as follows: For any number of data partitions, get the 42 | Ray object references to the shards and the IP addresses where they 43 | currently live. 44 | 45 | Calculate the minimum and maximum amount of partitions per actor. These 46 | numbers should differ by at most 1. Also calculate how many actors will 47 | get more partitions assigned than the other actors. 48 | 49 | First, each actor gets assigned up to ``max_parts_per_actor`` co-located 50 | partitions. Only up to ``num_actors_with_max_parts`` actors get the 51 | maximum number of partitions, the rest try to fill the minimum. 52 | 53 | The rest of the partitions (all of which cannot be assigned to a 54 | co-located actor) are assigned to actors until there are none left. 55 | """ 56 | num_partitions = sum(len(parts) for parts in ip_to_parts.values()) 57 | num_actors = len(actor_rank_ips) 58 | min_parts_per_actor = max(0, math.floor(num_partitions / num_actors)) 59 | max_parts_per_actor = max(1, math.ceil(num_partitions / num_actors)) 60 | num_actors_with_max_parts = num_partitions % num_actors 61 | 62 | # This is our result dict that maps actor objects to a list of partitions 63 | actor_to_partitions = defaultdict(list) 64 | 65 | # First we loop through the actors and assign them partitions from their 66 | # own IPs. Do this until each actor has `min_parts_per_actor` partitions 67 | partition_assigned = True 68 | while partition_assigned: 69 | partition_assigned = False 70 | 71 | # Loop through each actor once, assigning 72 | for rank, actor_ip in actor_rank_ips.items(): 73 | num_parts_left_on_ip = len(ip_to_parts[actor_ip]) 74 | num_actor_parts = len(actor_to_partitions[rank]) 75 | 76 | if num_parts_left_on_ip > 0 and num_actor_parts < max_parts_per_actor: 77 | if num_actor_parts >= min_parts_per_actor: 78 | # Only allow up to `num_actors_with_max_parts actors to 79 | # have the maximum number of partitions assigned. 80 | if num_actors_with_max_parts <= 0: 81 | continue 82 | num_actors_with_max_parts -= 1 83 | actor_to_partitions[rank].append(ip_to_parts[actor_ip].pop(0)) 84 | partition_assigned = True 85 | 86 | # The rest of the partitions, no matter where they are located, could not 87 | # be assigned to co-located actors. Thus, we assign them 88 | # to actors who still need partitions. 
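    # Worked example of the arithmetic above (illustrative): with 8
    # partitions and 3 actors, min_parts_per_actor = floor(8 / 3) = 2,
    # max_parts_per_actor = ceil(8 / 3) = 3, and num_actors_with_max_parts
    # = 8 % 3 = 2, so two actors end up with 3 partitions and one with 2.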
89 | rest_parts = list(itertools.chain(*ip_to_parts.values())) 90 | partition_assigned = True 91 | while len(rest_parts) > 0 and partition_assigned: 92 | partition_assigned = False 93 | for rank in actor_rank_ips: 94 | num_actor_parts = len(actor_to_partitions[rank]) 95 | if num_actor_parts < max_parts_per_actor: 96 | if num_actor_parts >= min_parts_per_actor: 97 | if num_actors_with_max_parts <= 0: 98 | continue 99 | num_actors_with_max_parts -= 1 100 | actor_to_partitions[rank].append(rest_parts.pop(0)) 101 | partition_assigned = True 102 | if len(rest_parts) <= 0: 103 | break 104 | 105 | if len(rest_parts) != 0: 106 | raise RuntimeError( 107 | "There are still partitions left to assign, but no actor " 108 | "has capacity for more. This is probably a bug. Please go " 109 | "to https://github.com/ray-project/xgboost_ray to report it." 110 | ) 111 | 112 | return actor_to_partitions 113 | -------------------------------------------------------------------------------- /xgboost_ray/data_sources/csv.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Iterable, Optional, Sequence, Union 2 | 3 | import pandas as pd 4 | 5 | from xgboost_ray.data_sources.data_source import DataSource, RayFileType 6 | from xgboost_ray.data_sources.pandas import Pandas 7 | 8 | 9 | class CSV(DataSource): 10 | """Read one or many CSV files.""" 11 | 12 | supports_central_loading = True 13 | supports_distributed_loading = True 14 | 15 | @staticmethod 16 | def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: 17 | return filetype == RayFileType.CSV 18 | 19 | @staticmethod 20 | def get_filetype(data: Any) -> Optional[RayFileType]: 21 | if data.endswith(".csv") or data.endswith("csv.gz"): 22 | return RayFileType.CSV 23 | return None 24 | 25 | @staticmethod 26 | def load_data( 27 | data: Union[str, Sequence[str]], 28 | ignore: Optional[Sequence[str]] = None, 29 | indices: Optional[Sequence[int]] = None, 30 | **kwargs 31 | ): 32 | if isinstance(data, Iterable) and not isinstance(data, str): 33 | shards = [] 34 | 35 | for i, shard in enumerate(data): 36 | if indices and i not in indices: 37 | continue 38 | shard_df = pd.read_csv(shard, **kwargs) 39 | shards.append(Pandas.load_data(shard_df, ignore=ignore)) 40 | return pd.concat(shards, copy=False) 41 | else: 42 | local_df = pd.read_csv(data, **kwargs) 43 | return Pandas.load_data(local_df, ignore=ignore) 44 | 45 | @staticmethod 46 | def get_n(data: Any): 47 | return len(list(data)) 48 | -------------------------------------------------------------------------------- /xgboost_ray/data_sources/dask.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from typing import Any, Dict, List, Optional, Sequence, Tuple, Union 3 | 4 | import pandas as pd 5 | import ray 6 | import wrapt 7 | from ray.actor import ActorHandle 8 | 9 | from xgboost_ray.data_sources._distributed import ( 10 | assign_partitions_to_actors, 11 | get_actor_rank_ips, 12 | ) 13 | from xgboost_ray.data_sources.data_source import DataSource, RayFileType 14 | 15 | try: 16 | import dask # noqa: F401 17 | from ray.util.dask import ray_dask_get 18 | 19 | DASK_INSTALLED = True 20 | except ImportError: 21 | DASK_INSTALLED = False 22 | 23 | 24 | def _assert_dask_installed(): 25 | if not DASK_INSTALLED: 26 | raise RuntimeError( 27 | "Tried to use Dask as a data source, but dask is not " 28 | "installed. This function shouldn't have been called. 
" 29 | "\nFIX THIS by installing dask: `pip install dask`. " 30 | "\nPlease also raise an issue on our GitHub: " 31 | "https://github.com/ray-project/xgboost_ray as this part of " 32 | "the code should not have been reached." 33 | ) 34 | 35 | 36 | @wrapt.decorator 37 | def ensure_ray_dask_initialized( 38 | func: Any, instance: Any, args: List[Any], kwargs: Any 39 | ) -> Any: 40 | _assert_dask_installed() 41 | dask.config.set(scheduler=ray_dask_get) 42 | return func(*args, **kwargs) 43 | 44 | 45 | class Dask(DataSource): 46 | """Read from distributed Dask dataframe. 47 | 48 | A `Dask dataframe `_ 49 | is a distributed drop-in replacement for pandas. 50 | 51 | Dask dataframes are stored on multiple actors, making them 52 | suitable for distributed loading. 53 | """ 54 | 55 | supports_central_loading = True 56 | supports_distributed_loading = True 57 | 58 | @staticmethod 59 | def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: 60 | if not DASK_INSTALLED: 61 | return False 62 | from dask.dataframe import DataFrame as DaskDataFrame 63 | from dask.dataframe import Series as DaskSeries 64 | 65 | return isinstance(data, (DaskDataFrame, DaskSeries)) 66 | 67 | @ensure_ray_dask_initialized 68 | @staticmethod 69 | def load_data( 70 | data: Any, # dask.pandas.DataFrame 71 | ignore: Optional[Sequence[str]] = None, 72 | indices: Optional[Union[Sequence[int], Sequence[int]]] = None, 73 | **kwargs 74 | ) -> pd.DataFrame: 75 | _assert_dask_installed() 76 | 77 | import dask.dataframe as dd 78 | 79 | if indices is not None and len(indices) > 0 and isinstance(indices[0], Tuple): 80 | # We got a list of partition IDs belonging to Dask partitions 81 | return dd.concat([data.partitions[i] for (i,) in indices]).compute() 82 | 83 | # Dask does not support iloc() for row selection, so we have to 84 | # compute a local pandas dataframe first 85 | local_df = data.compute() 86 | 87 | if indices: 88 | local_df = local_df.iloc[indices] 89 | 90 | if ignore: 91 | local_df = local_df[local_df.columns.difference(ignore)] 92 | 93 | return local_df 94 | 95 | @ensure_ray_dask_initialized 96 | @staticmethod 97 | def convert_to_series(data: Any) -> pd.Series: 98 | _assert_dask_installed() 99 | from dask.array import Array as DaskArray 100 | from dask.dataframe import DataFrame as DaskDataFrame 101 | from dask.dataframe import Series as DaskSeries 102 | 103 | if isinstance(data, DaskDataFrame): 104 | return pd.Series(data.compute().squeeze()) 105 | elif isinstance(data, DaskSeries): 106 | return data.compute() 107 | elif isinstance(data, DaskArray): 108 | return pd.Series(data.compute()) 109 | 110 | return DataSource.convert_to_series(data) 111 | 112 | @ensure_ray_dask_initialized 113 | @staticmethod 114 | def get_actor_shards( 115 | data: Any, actors: Sequence[ActorHandle] # dask.dataframe.DataFrame 116 | ) -> Tuple[Any, Optional[Dict[int, Any]]]: 117 | _assert_dask_installed() 118 | 119 | actor_rank_ips = get_actor_rank_ips(actors) 120 | 121 | # Get IPs and partitions 122 | ip_to_parts = get_ip_to_parts(data) 123 | 124 | return data, assign_partitions_to_actors(ip_to_parts, actor_rank_ips) 125 | 126 | @ensure_ray_dask_initialized 127 | @staticmethod 128 | def get_n(data: Any): 129 | """ 130 | For naive distributed loading we just return the number of rows 131 | here. 
Loading by shard is achieved via `get_actor_shards()` 132 | """ 133 | return len(data) 134 | 135 | 136 | def get_ip_to_parts(data: Any) -> Dict[int, Sequence[Any]]: 137 | persisted = data.persist(scheduler=ray_dask_get) 138 | name = persisted._name 139 | 140 | node_ids_to_node = {node["NodeID"]: node for node in ray.state.nodes()} 141 | 142 | # This is a hacky way to get the partition node IDs, and it's not 143 | # 100% accurate as the map task could get scheduled on a different node 144 | # (though Ray tries to keep locality). We need to use that until 145 | # ray.state.objects() or something like it is available again. 146 | partition_locations_df = persisted.map_partitions( 147 | lambda df: pd.DataFrame([ray.get_runtime_context().get_node_id()]) 148 | ).compute() 149 | partition_locations = [ 150 | partition_locations_df[0].iloc[i] for i in range(partition_locations_df.size) 151 | ] 152 | 153 | ip_to_parts = defaultdict(list) 154 | for (obj_name, pid), obj_ref in dask.base.collections_to_dsk([persisted]).items(): 155 | assert obj_name == name 156 | 157 | if isinstance(obj_ref, ray.ObjectRef): 158 | node_id = partition_locations[pid] 159 | node = node_ids_to_node.get(node_id, {}) 160 | ip = node.get("NodeManagerAddress", "_no_ip") 161 | else: 162 | ip = "_no_ip" 163 | 164 | # Pass tuples here (integers can be misinterpreted as row numbers) 165 | ip_to_parts[ip].append((pid,)) 166 | 167 | return ip_to_parts 168 | -------------------------------------------------------------------------------- /xgboost_ray/data_sources/data_source.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple, Union 3 | 4 | import pandas as pd 5 | from ray.actor import ActorHandle 6 | from ray.util.annotations import PublicAPI 7 | 8 | if TYPE_CHECKING: 9 | from xgboost_ray.xgb import xgboost as xgb 10 | 11 | 12 | @PublicAPI(stability="beta") 13 | class RayFileType(Enum): 14 | """Enum for different file types (used for overrides).""" 15 | 16 | CSV = 1 17 | PARQUET = 2 18 | PETASTORM = 3 19 | 20 | 21 | @PublicAPI(stability="beta") 22 | class DataSource: 23 | """Abstract class for data sources. 24 | 25 | xgboost_ray supports reading from various sources, such as files 26 | (e.g. CSV, Parquet) or distributed datasets (Modin). 27 | 28 | This abstract class defines an interface to read from these sources. 29 | New data sources can be added by implementing this interface. 30 | 31 | ``DataSource`` classes are not instantiated. Instead, static and 32 | class methods are called directly. 33 | """ 34 | 35 | supports_central_loading = True 36 | supports_distributed_loading = False 37 | needs_partitions = True 38 | 39 | @staticmethod 40 | def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: 41 | """Check if the supplied data matches this data source. 42 | 43 | Args: 44 | data: Dataset. 45 | filetype: RayFileType of the provided 46 | dataset. Some DataSource implementations might require 47 | that this is explicitly set (e.g. if multiple sources can 48 | read CSV files). 49 | 50 | Returns: 51 | Boolean indicating if this data source belongs to/is compatible 52 | with the data. 53 | """ 54 | return False 55 | 56 | @staticmethod 57 | def get_filetype(data: Any) -> Optional[RayFileType]: 58 | """Method to help infer the filetype. 
59 | 60 | Returns None if the supplied data type (usually a filename) 61 | is not covered by this data source, otherwise the filetype 62 | is returned. 63 | 64 | Args: 65 | data: Data set 66 | 67 | Returns: 68 | RayFileType or None. 69 | """ 70 | return None 71 | 72 | @staticmethod 73 | def load_data( 74 | data: Any, 75 | ignore: Optional[Sequence[str]] = None, 76 | indices: Optional[Sequence[Any]] = None, 77 | **kwargs 78 | ) -> pd.DataFrame: 79 | """ 80 | Load data into a pandas dataframe. 81 | 82 | Ignore specific columns, and optionally select specific indices. 83 | 84 | Args: 85 | data: Input data 86 | ignore: Column names to ignore 87 | indices: Indices to select. What an 88 | index indicates depends on the data source. 89 | 90 | Returns: 91 | Pandas DataFrame. 92 | """ 93 | raise NotImplementedError 94 | 95 | @staticmethod 96 | def update_feature_names(matrix: "xgb.DMatrix", feature_names: Optional[List[str]]): 97 | """Optionally update feature names before training/prediction 98 | 99 | Args: 100 | matrix: xgboost DMatrix object. 101 | feature_names: Feature names manually passed to the 102 | ``RayDMatrix`` object. 103 | 104 | """ 105 | pass 106 | 107 | @staticmethod 108 | def convert_to_series(data: Any) -> pd.Series: 109 | """Convert data from the data source type to a pandas series""" 110 | if isinstance(data, pd.DataFrame): 111 | return pd.Series(data.squeeze()) 112 | 113 | if not isinstance(data, pd.Series): 114 | return pd.Series(data) 115 | 116 | return data 117 | 118 | @classmethod 119 | def get_column( 120 | cls, data: pd.DataFrame, column: Any 121 | ) -> Tuple[pd.Series, Optional[Union[str, List]]]: 122 | """Helper method wrapping around convert to series. 123 | 124 | This method should usually not be overwritten. 125 | """ 126 | if isinstance(column, str) or isinstance(column, List): 127 | return data[column], column 128 | elif column is not None: 129 | return cls.convert_to_series(column), None 130 | return column, None 131 | 132 | @staticmethod 133 | def get_n(data: Any): 134 | """Get length of data source partitions for sharding.""" 135 | return len(data) 136 | 137 | @staticmethod 138 | def get_actor_shards( 139 | data: Any, actors: Sequence[ActorHandle] 140 | ) -> Tuple[Any, Optional[Dict[int, Any]]]: 141 | """Get a dict mapping actor ranks to shards. 142 | 143 | Args: 144 | data: Data to shard. 145 | 146 | Returns: 147 | Returns a tuple of which the first element indicates the new 148 | data object that will overwrite the existing data object 149 | in the RayDMatrix (e.g. when the object is not serializable). 150 | The second element is a dict mapping actor ranks to shards. 151 | These objects are usually passed to the ``load_data()`` method 152 | for distributed loading, so that method needs to be able to 153 | deal with the respective data. 
154 | """ 155 | return data, None 156 | -------------------------------------------------------------------------------- /xgboost_ray/data_sources/modin.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from typing import Any, Dict, Optional, Sequence, Tuple, Union 3 | 4 | import pandas as pd 5 | import ray 6 | from ray import ObjectRef 7 | from ray.actor import ActorHandle 8 | 9 | from xgboost_ray.data_sources._distributed import ( 10 | assign_partitions_to_actors, 11 | get_actor_rank_ips, 12 | ) 13 | from xgboost_ray.data_sources.data_source import DataSource, RayFileType 14 | from xgboost_ray.data_sources.object_store import ObjectStore 15 | 16 | try: 17 | import modin # noqa: F401 18 | from modin.config.envvars import Engine 19 | from modin.distributed.dataframe.pandas import unwrap_partitions # noqa: F401 20 | from modin.pandas import DataFrame as ModinDataFrame # noqa: F401 21 | from modin.pandas import Series as ModinSeries # noqa: F401 22 | from packaging.version import Version 23 | 24 | MODIN_INSTALLED = Version(modin.__version__) >= Version("0.9.0") 25 | 26 | # Check if importing the Ray engine leads to errors 27 | Engine().get() 28 | 29 | except (ImportError, AttributeError): 30 | MODIN_INSTALLED = False 31 | 32 | 33 | def _assert_modin_installed(): 34 | if not MODIN_INSTALLED: 35 | raise RuntimeError( 36 | "Tried to use Modin as a data source, but modin is not " 37 | "installed or it conflicts with the pandas version. " 38 | "This function shouldn't have been called. " 39 | "\nFIX THIS by installing modin: `pip install modin` " 40 | "and making sure that the installed pandas version is " 41 | "supported by modin." 42 | "\nPlease also raise an issue on our GitHub: " 43 | "https://github.com/ray-project/xgboost_ray as this part of " 44 | "the code should not have been reached." 45 | ) 46 | 47 | 48 | class Modin(DataSource): 49 | """Read from distributed Modin dataframe. 50 | 51 | `Modin `_ is a distributed 52 | drop-in replacement for pandas supporting Ray as a backend. 53 | 54 | Modin dataframes are stored on multiple actors, making them 55 | suitable for distributed loading. 56 | """ 57 | 58 | supports_central_loading = True 59 | supports_distributed_loading = True 60 | 61 | @staticmethod 62 | def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: 63 | if not MODIN_INSTALLED: 64 | return False 65 | # Has to be imported again. 
66 | from modin.pandas import DataFrame as ModinDataFrame # noqa: F811 67 | from modin.pandas import Series as ModinSeries # noqa: F811 68 | 69 | return isinstance(data, (ModinDataFrame, ModinSeries)) 70 | 71 | @staticmethod 72 | def load_data( 73 | data: Any, # modin.pandas.DataFrame 74 | ignore: Optional[Sequence[str]] = None, 75 | indices: Optional[Union[Sequence[int], Sequence[ObjectRef]]] = None, 76 | **kwargs 77 | ) -> pd.DataFrame: 78 | _assert_modin_installed() 79 | 80 | if ( 81 | indices is not None 82 | and len(indices) > 0 83 | and isinstance(indices[0], ObjectRef) 84 | ): 85 | # We got a list of ObjectRefs belonging to Modin partitions 86 | return ObjectStore.load_data(data=indices, indices=None, ignore=ignore) 87 | 88 | local_df = data 89 | if indices: 90 | local_df = local_df.iloc[indices] 91 | 92 | local_df = local_df._to_pandas() 93 | 94 | if ignore: 95 | local_df = local_df[local_df.columns.difference(ignore)] 96 | 97 | return local_df 98 | 99 | @staticmethod 100 | def convert_to_series(data: Any) -> pd.Series: 101 | _assert_modin_installed() 102 | # Has to be imported again. 103 | from modin.pandas import DataFrame as ModinDataFrame # noqa: F811 104 | from modin.pandas import Series as ModinSeries # noqa: F811 105 | 106 | if isinstance(data, ModinDataFrame): 107 | return pd.Series(data._to_pandas().squeeze()) 108 | elif isinstance(data, ModinSeries): 109 | return data._to_pandas() 110 | 111 | return DataSource.convert_to_series(data) 112 | 113 | @staticmethod 114 | def get_actor_shards( 115 | data: Any, actors: Sequence[ActorHandle] # modin.pandas.DataFrame 116 | ) -> Tuple[Any, Optional[Dict[int, Any]]]: 117 | _assert_modin_installed() 118 | 119 | # Has to be imported again. 120 | from modin.distributed.dataframe.pandas import unwrap_partitions # noqa: F811 121 | 122 | actor_rank_ips = get_actor_rank_ips(actors) 123 | 124 | # Get IPs and partitions 125 | unwrapped = unwrap_partitions(data, axis=0, get_ip=True) 126 | ip_objs, part_objs = zip(*unwrapped) 127 | 128 | # Build a table mapping from IP to list of partitions 129 | ip_to_parts = defaultdict(list) 130 | for ip, part_obj in zip(ray.get(list(ip_objs)), part_objs): 131 | ip_to_parts[ip].append(part_obj) 132 | 133 | # Modin dataframes are not serializable, so pass None here 134 | # as the first return value 135 | return None, assign_partitions_to_actors(ip_to_parts, actor_rank_ips) 136 | 137 | @staticmethod 138 | def get_n(data: Any): 139 | """ 140 | For naive distributed loading we just return the number of rows 141 | here. 
Loading by shard is achieved via `get_actor_shards()` 142 | """ 143 | return len(data) 144 | -------------------------------------------------------------------------------- /xgboost_ray/data_sources/numpy.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, Any, List, Optional, Sequence 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from xgboost_ray.data_sources.data_source import DataSource, RayFileType 7 | from xgboost_ray.data_sources.pandas import Pandas 8 | 9 | if TYPE_CHECKING: 10 | from xgboost_ray.xgb import xgboost as xgb 11 | 12 | 13 | class Numpy(DataSource): 14 | """Read from numpy arrays.""" 15 | 16 | @staticmethod 17 | def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: 18 | return isinstance(data, np.ndarray) 19 | 20 | @staticmethod 21 | def update_feature_names(matrix: "xgb.DMatrix", feature_names: Optional[List[str]]): 22 | # Potentially unset feature names 23 | matrix.feature_names = feature_names 24 | 25 | @staticmethod 26 | def load_data( 27 | data: np.ndarray, 28 | ignore: Optional[Sequence[str]] = None, 29 | indices: Optional[Sequence[int]] = None, 30 | **kwargs, 31 | ) -> pd.DataFrame: 32 | local_df = pd.DataFrame(data, columns=[f"f{i}" for i in range(data.shape[1])]) 33 | return Pandas.load_data(local_df, ignore=ignore, indices=indices) 34 | -------------------------------------------------------------------------------- /xgboost_ray/data_sources/object_store.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional, Sequence 2 | 3 | import pandas as pd 4 | import ray 5 | from ray import ObjectRef 6 | 7 | from xgboost_ray.data_sources.data_source import DataSource, RayFileType 8 | from xgboost_ray.data_sources.pandas import Pandas 9 | 10 | 11 | class ObjectStore(DataSource): 12 | """Read pandas dataframes and series from ray object store.""" 13 | 14 | @staticmethod 15 | def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: 16 | if isinstance(data, Sequence): 17 | return all(isinstance(d, ObjectRef) for d in data) 18 | return isinstance(data, ObjectRef) 19 | 20 | @staticmethod 21 | def load_data( 22 | data: Sequence[ObjectRef], 23 | ignore: Optional[Sequence[str]] = None, 24 | indices: Optional[Sequence[int]] = None, 25 | **kwargs 26 | ) -> pd.DataFrame: 27 | if indices is not None: 28 | data = [data[i] for i in indices] 29 | 30 | local_df = ray.get(data) 31 | 32 | return Pandas.load_data(pd.concat(local_df, copy=False), ignore=ignore) 33 | 34 | @staticmethod 35 | def convert_to_series(data: Any) -> pd.Series: 36 | if isinstance(data, ObjectRef): 37 | data = ray.get(data) 38 | else: 39 | data = pd.concat(ray.get(data), copy=False) 40 | return DataSource.convert_to_series(data) 41 | -------------------------------------------------------------------------------- /xgboost_ray/data_sources/pandas.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional, Sequence 2 | 3 | import pandas as pd 4 | 5 | from xgboost_ray.data_sources.data_source import DataSource, RayFileType 6 | 7 | 8 | class Pandas(DataSource): 9 | """Read from pandas dataframes and series.""" 10 | 11 | @staticmethod 12 | def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: 13 | return isinstance(data, (pd.DataFrame, pd.Series)) 14 | 15 | @staticmethod 16 | def load_data( 17 | data: Any, 18 | ignore: Optional[Sequence[str]] = 
None, 19 | indices: Optional[Sequence[int]] = None, 20 | **kwargs 21 | ) -> pd.DataFrame: 22 | local_df = data 23 | 24 | if ignore: 25 | local_df = local_df[local_df.columns.difference(ignore)] 26 | 27 | if indices: 28 | return local_df.iloc[indices] 29 | 30 | return local_df 31 | -------------------------------------------------------------------------------- /xgboost_ray/data_sources/parquet.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Iterable, Optional, Sequence, Union 2 | 3 | import pandas as pd 4 | 5 | from xgboost_ray.data_sources.data_source import DataSource, RayFileType 6 | from xgboost_ray.data_sources.pandas import Pandas 7 | 8 | 9 | class Parquet(DataSource): 10 | """Read one or many Parquet files.""" 11 | 12 | supports_central_loading = True 13 | supports_distributed_loading = True 14 | 15 | @staticmethod 16 | def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: 17 | return filetype == RayFileType.PARQUET 18 | 19 | @staticmethod 20 | def get_filetype(data: Any) -> Optional[RayFileType]: 21 | if data.endswith(".parquet"): 22 | return RayFileType.PARQUET 23 | return None 24 | 25 | @staticmethod 26 | def load_data( 27 | data: Union[str, Sequence[str]], 28 | ignore: Optional[Sequence[str]] = None, 29 | indices: Optional[Sequence[int]] = None, 30 | **kwargs 31 | ) -> pd.DataFrame: 32 | if isinstance(data, Iterable) and not isinstance(data, str): 33 | shards = [] 34 | 35 | for i, shard in enumerate(data): 36 | if indices and i not in indices: 37 | continue 38 | 39 | shard_df = pd.read_parquet(shard, **kwargs) 40 | shards.append(Pandas.load_data(shard_df, ignore=ignore)) 41 | return pd.concat(shards, copy=False) 42 | else: 43 | local_df = pd.read_parquet(data, **kwargs) 44 | return Pandas.load_data(local_df, ignore=ignore) 45 | 46 | @staticmethod 47 | def get_n(data: Any): 48 | return len(list(data)) 49 | -------------------------------------------------------------------------------- /xgboost_ray/data_sources/partitioned.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from typing import Any, Dict, Optional, Sequence, Tuple 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from ray import ObjectRef 7 | from ray.actor import ActorHandle 8 | 9 | from xgboost_ray.data_sources._distributed import ( 10 | assign_partitions_to_actors, 11 | get_actor_rank_ips, 12 | ) 13 | from xgboost_ray.data_sources.data_source import DataSource, RayFileType 14 | from xgboost_ray.data_sources.numpy import Numpy 15 | from xgboost_ray.data_sources.pandas import Pandas 16 | 17 | 18 | class Partitioned(DataSource): 19 | """Read from distributed data structure implementing __partitioned__. 20 | 21 | __partitioned__ provides metadata about how the data is partitioned and 22 | distributed across several compute nodes, making objects that support it 23 | suitable for distributed loading.
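    Example (illustrative sketch; ``ref_0``/``ref_1`` and the IP values
    are hypothetical): a row-wise 2x1 partitioning of a 1000x4 structure
    could expose metadata of roughly this shape::

        data.__partitioned__ == {
            "shape": (1000, 4),
            "partition_tiling": (2, 1),
            "get": ray.get,
            "partitions": {
                (0, 0): {"start": (0, 0), "shape": (500, 4),
                         "data": ref_0, "location": ["10.0.0.1"]},
                (1, 0): {"start": (500, 0), "shape": (500, 4),
                         "data": ref_1, "location": ["10.0.0.2"]},
            },
        }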
24 | 25 | Also see the __partitioned__ spec: 26 | https://github.com/IntelPython/DPPY-Spec/blob/draft/partitioned/Partitioned.md 27 | """ 28 | 29 | supports_central_loading = True 30 | supports_distributed_loading = True 31 | 32 | @staticmethod 33 | def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: 34 | return hasattr(data, "__partitioned__") 35 | 36 | @staticmethod 37 | def load_data( 38 | data: Any, # __partitioned__ dict 39 | ignore: Optional[Sequence[str]] = None, 40 | indices: Optional[Sequence[ObjectRef]] = None, 41 | **kwargs 42 | ) -> pd.DataFrame: 43 | 44 | assert isinstance(data, dict), "Expected __partitioned__ dict" 45 | _get = data["get"] 46 | 47 | if indices is None or len(indices) == 0: 48 | tiling = data["partition_tiling"] 49 | ndims = len(tiling) 50 | # we need tuples to access partitions in the right order 51 | pos_suffix = (0,) * (ndims - 1) 52 | parts = data["partitions"] 53 | # get the full data, e.g. all shards/partitions 54 | local_df = [ 55 | _get(parts[(i,) + pos_suffix]["data"]) for i in range(tiling[0]) 56 | ] 57 | else: 58 | # here we got a list of futures for partitions 59 | local_df = _get(indices) 60 | 61 | if isinstance(local_df[0], pd.DataFrame): 62 | return Pandas.load_data(pd.concat(local_df, copy=False), ignore=ignore) 63 | else: 64 | return Numpy.load_data(np.concatenate(local_df), ignore=ignore) 65 | 66 | @staticmethod 67 | def get_actor_shards( 68 | data: Any, actors: Sequence[ActorHandle] # partitioned.pandas.DataFrame 69 | ) -> Tuple[Any, Optional[Dict[int, Any]]]: 70 | assert hasattr(data, "__partitioned__") 71 | 72 | actor_rank_ips = get_actor_rank_ips(actors) 73 | 74 | # Get accessor func and partitions 75 | parted = data.__partitioned__ 76 | parts = parted["partitions"] 77 | tiling = parted["partition_tiling"] 78 | ndims = len(tiling) 79 | if ndims < 1 or ndims > 2 or any(tiling[x] != 1 for x in range(1, ndims)): 80 | raise RuntimeError( 81 | "Only row-wise partitionings of 1d/2d structures supported." 82 | ) 83 | 84 | # Now build a table mapping from IP to list of partitions 85 | ip_to_parts = defaultdict(lambda: []) 86 | # we need tuples to access partitions in the right order 87 | pos_suffix = (0,) * (ndims - 1) 88 | for i in range(tiling[0]): 89 | part = parts[(i,) + pos_suffix] # this works for 1d and 2d 90 | ip_to_parts[part["location"][0]].append(part["data"]) 91 | # __partitioned__ is serializable, so pass it here 92 | # as the first return value 93 | ret = parted, assign_partitions_to_actors(ip_to_parts, actor_rank_ips) 94 | return ret 95 | 96 | @staticmethod 97 | def get_n(data: Any): 98 | """Get length of data source partitions for sharding.""" 99 | return data.__partitioned__["shape"][0] 100 | -------------------------------------------------------------------------------- /xgboost_ray/data_sources/petastorm.py: -------------------------------------------------------------------------------- 1 | from typing import Any, List, Optional, Sequence, Union 2 | 3 | import pandas as pd 4 | 5 | from xgboost_ray.data_sources.data_source import DataSource, RayFileType 6 | 7 | try: 8 | import petastorm 9 | 10 | PETASTORM_INSTALLED = True 11 | except ImportError: 12 | PETASTORM_INSTALLED = False 13 | 14 | 15 | def _assert_petastorm_installed(): 16 | if not PETASTORM_INSTALLED: 17 | raise RuntimeError( 18 | "Tried to use Petastorm as a data source, but petastorm is not " 19 | "installed. This function shouldn't have been called. " 20 | "\nFIX THIS by installing petastorm: `pip install petastorm`. 
" 21 | "\nPlease also raise an issue on our GitHub: " 22 | "https://github.com/ray-project/xgboost_ray as this part of " 23 | "the code should not have been reached." 24 | ) 25 | 26 | 27 | class Petastorm(DataSource): 28 | """Read with Petastorm. 29 | 30 | `Petastorm `_ is a machine learning 31 | training and evaluation library. 32 | 33 | This class accesses Petastorm's dataset loading interface for efficient 34 | loading of large datasets. 35 | """ 36 | 37 | supports_central_loading = True 38 | supports_distributed_loading = True 39 | 40 | @staticmethod 41 | def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: 42 | return PETASTORM_INSTALLED and filetype == RayFileType.PETASTORM 43 | 44 | @staticmethod 45 | def get_filetype(data: Any) -> Optional[RayFileType]: 46 | if not PETASTORM_INSTALLED: 47 | return None 48 | 49 | if not isinstance(data, List): 50 | data = [data] 51 | 52 | def _is_compatible(url: str): 53 | return url.endswith(".parquet") and ( 54 | url.startswith("s3://") 55 | or url.startswith("gs://") 56 | or url.startswith("hdfs://") 57 | or url.startswith("file://") 58 | ) 59 | 60 | if all(_is_compatible(url) for url in data): 61 | return RayFileType.PETASTORM 62 | 63 | return None 64 | 65 | @staticmethod 66 | def load_data( 67 | data: Union[str, Sequence[str]], 68 | ignore: Optional[Sequence[str]] = None, 69 | indices: Optional[Sequence[int]] = None, 70 | **kwargs 71 | ) -> pd.DataFrame: 72 | _assert_petastorm_installed() 73 | with petastorm.make_batch_reader(data) as reader: 74 | shards = [ 75 | pd.DataFrame(batch._asdict()) 76 | for i, batch in enumerate(reader) 77 | if not indices or i in indices 78 | ] 79 | 80 | local_df = pd.concat(shards, copy=False) 81 | 82 | if ignore: 83 | local_df = local_df[local_df.columns.difference(ignore)] 84 | 85 | return local_df 86 | 87 | @staticmethod 88 | def get_n(data: Any): 89 | return len(list(data)) 90 | -------------------------------------------------------------------------------- /xgboost_ray/data_sources/ray_dataset.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Optional, Sequence, Tuple, Union 2 | 3 | import pandas as pd 4 | import ray 5 | from ray.actor import ActorHandle 6 | 7 | from xgboost_ray.data_sources.data_source import DataSource, RayFileType 8 | from xgboost_ray.data_sources.pandas import Pandas 9 | 10 | try: 11 | import ray.data.dataset # noqa: F401 12 | 13 | RAY_DATASET_AVAILABLE = True 14 | except (ImportError, AttributeError): 15 | RAY_DATASET_AVAILABLE = False 16 | 17 | DATASET_TO_PANDAS_LIMIT = float("inf") 18 | 19 | 20 | def _assert_ray_data_available(): 21 | if not RAY_DATASET_AVAILABLE: 22 | raise RuntimeError( 23 | "Tried to use Ray datasets as a data source, but your version " 24 | "of Ray does not support it. " 25 | "\nFIX THIS by upgrading Ray: `pip install -U ray`. " 26 | "\nPlease also raise an issue on our GitHub: " 27 | "https://github.com/ray-project/xgboost_ray as this part of " 28 | "the code should not have been reached." 
29 | ) 30 | 31 | 32 | class RayDataset(DataSource): 33 | """Read from distributed Ray dataset.""" 34 | 35 | supports_central_loading = True 36 | supports_distributed_loading = True 37 | needs_partitions = False 38 | 39 | @staticmethod 40 | def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: 41 | if not RAY_DATASET_AVAILABLE: 42 | return False 43 | 44 | return isinstance(data, ray.data.dataset.Dataset) 45 | 46 | @staticmethod 47 | def load_data( 48 | data: "ray.data.dataset.Dataset", 49 | ignore: Optional[Sequence[str]] = None, 50 | indices: Optional[ 51 | Union[Sequence[int], Sequence["ray.data.dataset.Dataset"]] 52 | ] = None, 53 | **kwargs 54 | ) -> pd.DataFrame: 55 | _assert_ray_data_available() 56 | 57 | if indices is not None: 58 | if len(indices) > 0 and isinstance(indices[0], ray.data.dataset.Dataset): 59 | # We got a list of Datasets belonging to a partition 60 | data = indices 61 | else: 62 | data = [data[i] for i in indices] 63 | 64 | if isinstance(data, ray.data.dataset.Dataset): 65 | local_df = data.to_pandas(limit=DATASET_TO_PANDAS_LIMIT) 66 | else: 67 | local_df = pd.concat( 68 | [ds.to_pandas(limit=DATASET_TO_PANDAS_LIMIT) for ds in data], copy=False 69 | ) 70 | return Pandas.load_data(local_df, ignore=ignore) 71 | 72 | @staticmethod 73 | def convert_to_series( 74 | data: Union["ray.data.dataset.Dataset", Sequence["ray.data.dataset.Dataset"]] 75 | ) -> pd.Series: 76 | _assert_ray_data_available() 77 | 78 | if isinstance(data, ray.data.dataset.Dataset): 79 | data = data.to_pandas(limit=DATASET_TO_PANDAS_LIMIT) 80 | else: 81 | data = pd.concat( 82 | [ds.to_pandas(limit=DATASET_TO_PANDAS_LIMIT) for ds in data], copy=False 83 | ) 84 | return DataSource.convert_to_series(data) 85 | 86 | @staticmethod 87 | def get_actor_shards( 88 | data: "ray.data.dataset.Dataset", actors: Sequence[ActorHandle] 89 | ) -> Tuple[Any, Optional[Dict[int, Any]]]: 90 | _assert_ray_data_available() 91 | 92 | # We do not use our assign_partitions_to_actors as assignment of splits 93 | # to actors is handled by the locality_hints argument. 94 | 95 | dataset_splits = data.split( 96 | len(actors), 97 | equal=True, 98 | locality_hints=actors, 99 | ) 100 | 101 | return None, { 102 | i: [dataset_split] for i, dataset_split in enumerate(dataset_splits) 103 | } 104 | 105 | @staticmethod 106 | def get_n(data: "ray.data.dataset.Dataset"): 107 | """ 108 | Return the number of distributed blocks. 109 | """ 110 | return data._plan.initial_num_blocks() 111 | -------------------------------------------------------------------------------- /xgboost_ray/elastic.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import Callable, Dict, List, Optional, Tuple 3 | 4 | import ray 5 | 6 | from xgboost_ray.main import ( 7 | ENV, 8 | ActorHandle, 9 | RayParams, 10 | RayXGBoostActorAvailable, 11 | _create_actor, 12 | _PrepareActorTask, 13 | _TrainingState, 14 | logger, 15 | ) 16 | from xgboost_ray.matrix import RayDMatrix 17 | 18 | 19 | def _maybe_schedule_new_actors( 20 | training_state: _TrainingState, 21 | num_cpus_per_actor: int, 22 | num_gpus_per_actor: int, 23 | resources_per_actor: Optional[Dict], 24 | ray_params: RayParams, 25 | load_data: List[RayDMatrix], 26 | ) -> bool: 27 | """Schedule new actors for elastic training if resources are available. 28 | 29 | Potentially starts new actors and triggers data loading.""" 30 | 31 | # This is only enabled for elastic training; see the usage sketch below.
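    # Usage sketch (illustrative; assumes the ``RayParams`` fields used
    # below -- see ``RayParams`` in main.py for authoritative names and
    # defaults):
    #
    #     from xgboost_ray import RayParams, train
    #
    #     ray_params = RayParams(
    #         num_actors=4,
    #         elastic_training=True,  # keep training while actors are down
    #         max_actor_restarts=2,
    #     )
    #     bst = train(params, dtrain, ray_params=ray_params)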
32 | if not ray_params.elastic_training: 33 | return False 34 | 35 | missing_actor_ranks = [ 36 | rank 37 | for rank, actor in enumerate(training_state.actors) 38 | if actor is None and rank not in training_state.pending_actors 39 | ] 40 | 41 | # If all actors are alive, there is nothing to do. 42 | if not missing_actor_ranks: 43 | return False 44 | 45 | now = time.time() 46 | 47 | # Check periodically every n seconds. 48 | if ( 49 | now 50 | < training_state.last_resource_check_at + ENV.ELASTIC_RESTART_RESOURCE_CHECK_S 51 | ): 52 | return False 53 | 54 | training_state.last_resource_check_at = now 55 | 56 | new_pending_actors: Dict[int, Tuple[ActorHandle, _PrepareActorTask]] = {} 57 | for rank in missing_actor_ranks: 58 | # Actor rank should not be already pending 59 | if rank in training_state.pending_actors or rank in new_pending_actors: 60 | continue 61 | 62 | # Try to schedule this actor 63 | actor = _create_actor( 64 | rank=rank, 65 | num_actors=ray_params.num_actors, 66 | num_cpus_per_actor=num_cpus_per_actor, 67 | num_gpus_per_actor=num_gpus_per_actor, 68 | resources_per_actor=resources_per_actor, 69 | placement_group=training_state.placement_group, 70 | queue=training_state.queue, 71 | checkpoint_frequency=ray_params.checkpoint_frequency, 72 | distributed_callbacks=ray_params.distributed_callbacks, 73 | ) 74 | 75 | task = _PrepareActorTask( 76 | actor, 77 | queue=training_state.queue, 78 | stop_event=training_state.stop_event, 79 | load_data=load_data, 80 | ) 81 | 82 | new_pending_actors[rank] = (actor, task) 83 | logger.debug( 84 | f"Re-scheduled actor with rank {rank}. Waiting for " 85 | f"placement and data loading before promoting it " 86 | f"to training." 87 | ) 88 | if new_pending_actors: 89 | training_state.pending_actors.update(new_pending_actors) 90 | logger.info( 91 | f"Re-scheduled {len(new_pending_actors)} actors for " 92 | f"training. Once data loading finished, they will be " 93 | f"integrated into training again." 94 | ) 95 | return bool(new_pending_actors) 96 | 97 | 98 | def _update_scheduled_actor_states(training_state: _TrainingState): 99 | """Update status of scheduled actors in elastic training. 100 | 101 | If actors finished their preparation tasks, promote them to 102 | proper training actors (set the `training_state.actors` entry). 103 | 104 | Also schedule a `RayXGBoostActorAvailable` exception so that training 105 | is restarted with the new actors. 106 | 107 | """ 108 | now = time.time() 109 | actor_became_ready = False 110 | 111 | # Wrap in list so we can alter the `training_state.pending_actors` dict 112 | for rank in list(training_state.pending_actors.keys()): 113 | actor, task = training_state.pending_actors[rank] 114 | if task.is_ready(): 115 | # Promote to proper actor 116 | training_state.actors[rank] = actor 117 | del training_state.pending_actors[rank] 118 | actor_became_ready = True 119 | 120 | if actor_became_ready: 121 | if not training_state.pending_actors: 122 | # No other actors are pending, so let's restart right away. 123 | training_state.restart_training_at = now - 1.0 124 | 125 | # If an actor became ready but other actors are pending, we wait 126 | # for n seconds before restarting, as chances are that they become 127 | # ready as well (e.g. if a large node came up). 128 | grace_period = ENV.ELASTIC_RESTART_GRACE_PERIOD_S 129 | if training_state.restart_training_at is None: 130 | logger.debug( 131 | f"A RayXGBoostActor became ready for training. Waiting " 132 | f"{grace_period} seconds before triggering training restart." 
133 | ) 134 | training_state.restart_training_at = now + grace_period 135 | 136 | if training_state.restart_training_at is not None: 137 | if now > training_state.restart_training_at: 138 | training_state.restart_training_at = None 139 | raise RayXGBoostActorAvailable( 140 | "A new RayXGBoostActor became available for training. " 141 | "Triggering restart." 142 | ) 143 | 144 | 145 | def _get_actor_alive_status( 146 | actors: List[ActorHandle], callback: Callable[[ActorHandle], None] 147 | ): 148 | """Loop through all actors. Invoke a callback on dead actors.""" 149 | obj_to_rank = {} 150 | 151 | alive = 0 152 | dead = 0 153 | 154 | for rank, actor in enumerate(actors): 155 | if actor is None: 156 | dead += 1 157 | continue 158 | obj = actor.pid.remote() 159 | obj_to_rank[obj] = rank 160 | 161 | not_ready = list(obj_to_rank.keys()) 162 | while not_ready: 163 | ready, not_ready = ray.wait(not_ready, timeout=0) 164 | 165 | for obj in ready: 166 | try: 167 | pid = ray.get(obj) 168 | rank = obj_to_rank[obj] 169 | logger.debug(f"Actor {actors[rank]} with PID {pid} is alive.") 170 | alive += 1 171 | except Exception: 172 | rank = obj_to_rank[obj] 173 | logger.debug(f"Actor {actors[rank]} is _not_ alive.") 174 | dead += 1 175 | callback(actors[rank]) 176 | logger.info(f"Actor status: {alive} alive, {dead} dead " f"({alive+dead} total)") 177 | 178 | return alive, dead 179 | -------------------------------------------------------------------------------- /xgboost_ray/examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ray-project/xgboost_ray/e9049256575e5bdd956b369cf86e94a298d11048/xgboost_ray/examples/__init__.py -------------------------------------------------------------------------------- /xgboost_ray/examples/create_test_data.py: -------------------------------------------------------------------------------- 1 | from xgboost_ray.tests.utils import create_parquet 2 | 3 | 4 | def main(): 5 | create_parquet( 6 | "example.parquet", 7 | num_rows=1_000_000, 8 | num_partitions=100, 9 | num_features=8, 10 | num_classes=2, 11 | ) 12 | 13 | 14 | if __name__ == "__main__": 15 | main() 16 | -------------------------------------------------------------------------------- /xgboost_ray/examples/higgs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | from xgboost_ray import RayDMatrix, RayParams, train 5 | 6 | FILENAME_CSV = "HIGGS.csv.gz" 7 | 8 | 9 | def download_higgs(target_file): 10 | url = ( 11 | "https://archive.ics.uci.edu/ml/machine-learning-databases/" 12 | "00280/HIGGS.csv.gz" 13 | ) 14 | 15 | try: 16 | import urllib.request 17 | except ImportError as e: 18 | raise ValueError( 19 | f"Automatic downloading of the HIGGS dataset requires `urllib`." 20 | f"\nFIX THIS by running `pip install urllib` or manually " 21 | f"downloading the dataset from {url}." 22 | ) from e 23 | 24 | print(f"Downloading HIGGS dataset to {target_file}") 25 | urllib.request.urlretrieve(url, target_file) 26 | return os.path.exists(target_file) 27 | 28 | 29 | def main(): 30 | # Example adapted from this blog post: 31 | # https://medium.com/rapids-ai/a-new-official-dask-api-for-xgboost-e8b10f3d1eb7 32 | # This uses the HIGGS dataset. Download here: 33 | # https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz 34 | 35 | if not os.path.exists(FILENAME_CSV): 36 | assert download_higgs(FILENAME_CSV), "Downloading of HIGGS dataset failed." 
37 | print("HIGGS dataset downloaded.") 38 | else: 39 | print("HIGGS dataset found locally.") 40 | 41 | colnames = ["label"] + ["feature-%02d" % i for i in range(1, 29)] 42 | 43 | dtrain = RayDMatrix(os.path.abspath(FILENAME_CSV), label="label", names=colnames) 44 | 45 | config = { 46 | "tree_method": "hist", 47 | "eval_metric": ["logloss", "error"], 48 | } 49 | 50 | evals_result = {} 51 | 52 | start = time.time() 53 | bst = train( 54 | config, 55 | dtrain, 56 | evals_result=evals_result, 57 | ray_params=RayParams(max_actor_restarts=1, num_actors=1), 58 | num_boost_round=100, 59 | evals=[(dtrain, "train")], 60 | ) 61 | taken = time.time() - start 62 | print(f"TRAIN TIME TAKEN: {taken:.2f} seconds") 63 | 64 | bst.save_model("higgs.xgb") 65 | print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) 66 | 67 | 68 | if __name__ == "__main__": 69 | import ray 70 | 71 | ray.init() 72 | 73 | start = time.time() 74 | main() 75 | taken = time.time() - start 76 | print(f"TOTAL TIME TAKEN: {taken:.2f} seconds") 77 | -------------------------------------------------------------------------------- /xgboost_ray/examples/higgs_parquet.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | import pandas as pd 5 | import pyarrow as pa 6 | import pyarrow.parquet as pq 7 | from higgs import download_higgs 8 | 9 | from xgboost_ray import RayDMatrix, RayParams, train 10 | 11 | FILENAME_CSV = "HIGGS.csv.gz" 12 | FILENAME_PARQUET = "HIGGS.parquet" 13 | 14 | 15 | def csv_to_parquet(in_file, out_file, chunksize=100_000, **csv_kwargs): 16 | if os.path.exists(out_file): 17 | return False 18 | 19 | print(f"Converting CSV {in_file} to PARQUET {out_file}") 20 | csv_stream = pd.read_csv( 21 | in_file, sep=",", chunksize=chunksize, low_memory=False, **csv_kwargs 22 | ) 23 | 24 | parquet_schema = None 25 | parquet_writer = None 26 | for i, chunk in enumerate(csv_stream): 27 | print("Chunk", i) 28 | if not parquet_schema: 29 | # Guess the schema of the CSV file from the first chunk 30 | parquet_schema = pa.Table.from_pandas(df=chunk).schema 31 | # Open a Parquet file for writing 32 | parquet_writer = pq.ParquetWriter( 33 | out_file, parquet_schema, compression="snappy" 34 | ) 35 | # Write CSV chunk to the parquet file 36 | table = pa.Table.from_pandas(chunk, schema=parquet_schema) 37 | parquet_writer.write_table(table) 38 | 39 | parquet_writer.close() 40 | return True 41 | 42 | 43 | def main(): 44 | # Example adapted from this blog post: 45 | # https://medium.com/rapids-ai/a-new-official-dask-api-for-xgboost-e8b10f3d1eb7 46 | # This uses the HIGGS dataset. 
Download here: 47 | # https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz 48 | 49 | if not os.path.exists(FILENAME_PARQUET): 50 | if not os.path.exists(FILENAME_CSV): 51 | download_higgs(FILENAME_CSV) 52 | print("Downloaded HIGGS csv dataset") 53 | print("Converting HIGGS csv dataset to parquet") 54 | csv_to_parquet( 55 | FILENAME_CSV, 56 | FILENAME_PARQUET, 57 | names=[ 58 | "label", 59 | "feature-01", 60 | "feature-02", 61 | "feature-03", 62 | "feature-04", 63 | "feature-05", 64 | "feature-06", 65 | "feature-07", 66 | "feature-08", 67 | "feature-09", 68 | "feature-10", 69 | "feature-11", 70 | "feature-12", 71 | "feature-13", 72 | "feature-14", 73 | "feature-15", 74 | "feature-16", 75 | "feature-17", 76 | "feature-18", 77 | "feature-19", 78 | "feature-20", 79 | "feature-21", 80 | "feature-22", 81 | "feature-23", 82 | "feature-24", 83 | "feature-25", 84 | "feature-26", 85 | "feature-27", 86 | "feature-28", 87 | ], 88 | ) 89 | 90 | colnames = ["label"] + ["feature-%02d" % i for i in range(1, 29)] 91 | 92 | # Here we load the Parquet file 93 | dtrain = RayDMatrix( 94 | os.path.abspath(FILENAME_PARQUET), label="label", columns=colnames 95 | ) 96 | 97 | config = { 98 | "tree_method": "hist", 99 | "eval_metric": ["logloss", "error"], 100 | } 101 | 102 | evals_result = {} 103 | 104 | start = time.time() 105 | bst = train( 106 | config, 107 | dtrain, 108 | evals_result=evals_result, 109 | ray_params=RayParams(max_actor_restarts=1, num_actors=1), 110 | num_boost_round=100, 111 | evals=[(dtrain, "train")], 112 | ) 113 | taken = time.time() - start 114 | print(f"TRAIN TIME TAKEN: {taken:.2f} seconds") 115 | 116 | bst.save_model("higgs.xgb") 117 | print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) 118 | 119 | 120 | if __name__ == "__main__": 121 | import ray 122 | 123 | ray.init() 124 | 125 | start = time.time() 126 | main() 127 | taken = time.time() - start 128 | print(f"TOTAL TIME TAKEN: {taken:.2f} seconds") 129 | -------------------------------------------------------------------------------- /xgboost_ray/examples/readme.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa E501 2 | 3 | 4 | def readme_simple(): 5 | from sklearn.datasets import load_breast_cancer 6 | 7 | from xgboost_ray import RayDMatrix, RayParams, train 8 | 9 | train_x, train_y = load_breast_cancer(return_X_y=True) 10 | train_set = RayDMatrix(train_x, train_y) 11 | 12 | evals_result = {} 13 | bst = train( 14 | { 15 | "objective": "binary:logistic", 16 | "eval_metric": ["logloss", "error"], 17 | }, 18 | train_set, 19 | evals_result=evals_result, 20 | evals=[(train_set, "train")], 21 | verbose_eval=False, 22 | ray_params=RayParams(num_actors=2, cpus_per_actor=1), 23 | ) 24 | 25 | bst.save_model("model.xgb") 26 | print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) 27 | 28 | 29 | def readme_predict(): 30 | import xgboost as xgb 31 | from sklearn.datasets import load_breast_cancer 32 | 33 | from xgboost_ray import RayDMatrix, RayParams, predict 34 | 35 | data, labels = load_breast_cancer(return_X_y=True) 36 | 37 | dpred = RayDMatrix(data, labels) 38 | 39 | bst = xgb.Booster(model_file="model.xgb") 40 | pred_ray = predict(bst, dpred, ray_params=RayParams(num_actors=2)) 41 | 42 | print(pred_ray) 43 | 44 | 45 | def readme_tune(): 46 | from sklearn.datasets import load_breast_cancer 47 | 48 | from xgboost_ray import RayDMatrix, RayParams, train 49 | 50 | num_actors = 4 51 | num_cpus_per_actor = 1 52 | 53 
| ray_params = RayParams(num_actors=num_actors, cpus_per_actor=num_cpus_per_actor) 54 | 55 | def train_model(config): 56 | train_x, train_y = load_breast_cancer(return_X_y=True) 57 | train_set = RayDMatrix(train_x, train_y) 58 | 59 | evals_result = {} 60 | bst = train( 61 | params=config, 62 | dtrain=train_set, 63 | evals_result=evals_result, 64 | evals=[(train_set, "train")], 65 | verbose_eval=False, 66 | ray_params=ray_params, 67 | ) 68 | bst.save_model("model.xgb") 69 | 70 | from ray import tune 71 | 72 | # Specify the hyperparameter search space. 73 | config = { 74 | "tree_method": "approx", 75 | "objective": "binary:logistic", 76 | "eval_metric": ["logloss", "error"], 77 | "eta": tune.loguniform(1e-4, 1e-1), 78 | "subsample": tune.uniform(0.5, 1.0), 79 | "max_depth": tune.randint(1, 9), 80 | } 81 | 82 | # Make sure to use the `get_tune_resources` method to set the `resources_per_trial`. 83 | analysis = tune.run( 84 | train_model, 85 | config=config, 86 | metric="train-error", 87 | mode="min", 88 | num_samples=4, 89 | resources_per_trial=ray_params.get_tune_resources(), 90 | ) 91 | print("Best hyperparameters", analysis.best_config) 92 | 93 | 94 | if __name__ == "__main__": 95 | import ray 96 | 97 | ray.init(num_cpus=5) 98 | 99 | print("Readme: Simple example") 100 | readme_simple() 101 | readme_predict() 102 | try: 103 | print("Readme: Ray Tune example") 104 | readme_tune() 105 | except ImportError: 106 | print("Ray Tune not installed.") 107 | -------------------------------------------------------------------------------- /xgboost_ray/examples/readme_sklearn_api.py: -------------------------------------------------------------------------------- 1 | def readme_sklearn_api(): 2 | from sklearn.datasets import load_breast_cancer 3 | from sklearn.model_selection import train_test_split 4 | 5 | from xgboost_ray import RayParams, RayXGBClassifier 6 | 7 | seed = 42 8 | 9 | X, y = load_breast_cancer(return_X_y=True) 10 | X_train, X_test, y_train, y_test = train_test_split( 11 | X, y, train_size=0.25, random_state=42 12 | ) 13 | 14 | clf = RayXGBClassifier( 15 | n_jobs=4, random_state=seed  # In XGBoost-Ray, n_jobs sets the number of actors 16 | ) 17 | 18 | # The scikit-learn API will automatically convert the data 19 | # to RayDMatrix format as needed. 20 | # You can also pass X as a RayDMatrix, in which case 21 | # y will be ignored.
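    # For instance (illustrative sketch; assumes `RayDMatrix` is imported
    # from xgboost_ray and that `None` labels are acceptable here):
    #
    #     dtrain = RayDMatrix(X_train, y_train)
    #     clf.fit(dtrain, None)  # labels are taken from the RayDMatrix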
22 | 23 | clf.fit(X_train, y_train) 24 | 25 | pred_ray = clf.predict(X_test) 26 | print(pred_ray) 27 | 28 | pred_proba_ray = clf.predict_proba(X_test) 29 | print(pred_proba_ray) 30 | 31 | # It is also possible to pass a RayParams object 32 | # to fit/predict/predict_proba methods - will override 33 | # n_jobs set during initialization 34 | 35 | clf.fit(X_train, y_train, ray_params=RayParams(num_actors=2)) 36 | 37 | pred_ray = clf.predict(X_test, ray_params=RayParams(num_actors=2)) 38 | print(pred_ray) 39 | 40 | 41 | if __name__ == "__main__": 42 | import ray 43 | 44 | ray.init(num_cpus=5) 45 | 46 | print("Readme: scikit-learn API example") 47 | readme_sklearn_api() 48 | -------------------------------------------------------------------------------- /xgboost_ray/examples/simple.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import ray 4 | from sklearn import datasets 5 | from sklearn.model_selection import train_test_split 6 | 7 | from xgboost_ray import RayDMatrix, RayParams, train 8 | 9 | 10 | def main(cpus_per_actor, num_actors): 11 | # Load dataset 12 | data, labels = datasets.load_breast_cancer(return_X_y=True) 13 | # Split into train and test set 14 | train_x, test_x, train_y, test_y = train_test_split(data, labels, test_size=0.25) 15 | 16 | train_set = RayDMatrix(train_x, train_y) 17 | test_set = RayDMatrix(test_x, test_y) 18 | 19 | evals_result = {} 20 | 21 | # Set XGBoost config. 22 | xgboost_params = { 23 | "tree_method": "approx", 24 | "objective": "binary:logistic", 25 | "eval_metric": ["logloss", "error"], 26 | } 27 | 28 | # Train the classifier 29 | bst = train( 30 | params=xgboost_params, 31 | dtrain=train_set, 32 | evals=[(test_set, "eval")], 33 | evals_result=evals_result, 34 | ray_params=RayParams( 35 | max_actor_restarts=0, 36 | gpus_per_actor=0, 37 | cpus_per_actor=cpus_per_actor, 38 | num_actors=num_actors, 39 | ), 40 | verbose_eval=False, 41 | num_boost_round=10, 42 | ) 43 | 44 | model_path = "simple.xgb" 45 | bst.save_model(model_path) 46 | print("Final validation error: {:.4f}".format(evals_result["eval"]["error"][-1])) 47 | 48 | 49 | if __name__ == "__main__": 50 | parser = argparse.ArgumentParser() 51 | parser.add_argument( 52 | "--address", required=False, type=str, help="the address to use for Ray" 53 | ) 54 | parser.add_argument( 55 | "--server-address", 56 | required=False, 57 | type=str, 58 | help="Address of the remote server if using Ray Client.", 59 | ) 60 | parser.add_argument( 61 | "--cpus-per-actor", 62 | type=int, 63 | default=1, 64 | help="Sets number of CPUs per xgboost training worker.", 65 | ) 66 | parser.add_argument( 67 | "--num-actors", 68 | type=int, 69 | default=4, 70 | help="Sets number of xgboost workers to use.", 71 | ) 72 | parser.add_argument("--smoke-test", action="store_true", default=False, help="gpu") 73 | 74 | args, _ = parser.parse_known_args() 75 | 76 | if args.smoke_test: 77 | ray.init(num_cpus=args.num_actors) 78 | elif args.server_address: 79 | ray.util.connect(args.server_address) 80 | else: 81 | ray.init(address=args.address) 82 | 83 | main(args.cpus_per_actor, args.num_actors) 84 | -------------------------------------------------------------------------------- /xgboost_ray/examples/simple_dask.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import ray 6 | 7 | from xgboost_ray import RayDMatrix, RayParams, train 8 | from xgboost_ray.data_sources.dask 
import DASK_INSTALLED 9 | 10 | 11 | def main(cpus_per_actor, num_actors): 12 | if not DASK_INSTALLED: 13 | print("Dask is not installed. Install with `pip install dask`") 14 | return 15 | 16 | # Local import so the installation check comes first 17 | import dask 18 | import dask.dataframe as dd 19 | from ray.util.dask import ray_dask_get 20 | 21 | dask.config.set(scheduler=ray_dask_get) 22 | 23 | # Generate dataset 24 | x = np.repeat(range(8), 16).reshape((32, 4)) 25 | # Even numbers --> 0, odd numbers --> 1 26 | y = np.tile(np.repeat(range(2), 4), 4) 27 | 28 | # Flip some bits to reduce max accuracy 29 | bits_to_flip = np.random.choice(32, size=6, replace=False) 30 | y[bits_to_flip] = 1 - y[bits_to_flip] 31 | 32 | data = pd.DataFrame(x) 33 | data["label"] = y 34 | 35 | # Split into 4 partitions 36 | dask_df = dd.from_pandas(data, npartitions=4) 37 | 38 | train_set = RayDMatrix(dask_df, "label") 39 | 40 | evals_result = {} 41 | # Set XGBoost config. 42 | xgboost_params = { 43 | "tree_method": "approx", 44 | "objective": "binary:logistic", 45 | "eval_metric": ["logloss", "error"], 46 | } 47 | 48 | # Train the classifier 49 | bst = train( 50 | params=xgboost_params, 51 | dtrain=train_set, 52 | evals=[(train_set, "train")], 53 | evals_result=evals_result, 54 | ray_params=RayParams( 55 | max_actor_restarts=0, 56 | gpus_per_actor=0, 57 | cpus_per_actor=cpus_per_actor, 58 | num_actors=num_actors, 59 | ), 60 | verbose_eval=False, 61 | num_boost_round=10, 62 | ) 63 | 64 | model_path = "dask.xgb" 65 | bst.save_model(model_path) 66 | print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) 67 | 68 | 69 | if __name__ == "__main__": 70 | parser = argparse.ArgumentParser() 71 | parser.add_argument( 72 | "--address", required=False, type=str, help="the address to use for Ray" 73 | ) 74 | parser.add_argument( 75 | "--server-address", 76 | required=False, 77 | type=str, 78 | help="Address of the remote server if using Ray Client.", 79 | ) 80 | parser.add_argument( 81 | "--cpus-per-actor", 82 | type=int, 83 | default=1, 84 | help="Sets number of CPUs per xgboost training worker.", 85 | ) 86 | parser.add_argument( 87 | "--num-actors", 88 | type=int, 89 | default=4, 90 | help="Sets number of xgboost workers to use.", 91 | ) 92 | parser.add_argument("--smoke-test", action="store_true", default=False, help="gpu") 93 | 94 | args, _ = parser.parse_known_args() 95 | 96 | if args.smoke_test: 97 | ray.init(num_cpus=args.num_actors + 1) 98 | elif args.server_address: 99 | ray.util.connect(args.server_address) 100 | else: 101 | ray.init(address=args.address) 102 | 103 | main(args.cpus_per_actor, args.num_actors) 104 | -------------------------------------------------------------------------------- /xgboost_ray/examples/simple_modin.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import ray 6 | 7 | from xgboost_ray import RayDMatrix, RayParams, train 8 | from xgboost_ray.data_sources.modin import MODIN_INSTALLED 9 | 10 | 11 | def main(cpus_per_actor, num_actors): 12 | if not MODIN_INSTALLED: 13 | print( 14 | "Modin is not installed or installed in a version that is not " 15 | "compatible with xgboost_ray (< 0.9.0)." 
16 | ) 17 | return 18 | 19 | # Import modin after initializing Ray 20 | from modin.distributed.dataframe.pandas import from_partitions 21 | 22 | # Generate dataset 23 | x = np.repeat(range(8), 16).reshape((32, 4)) 24 | # Even numbers --> 0, odd numbers --> 1 25 | y = np.tile(np.repeat(range(2), 4), 4) 26 | 27 | # Flip some bits to reduce max accuracy 28 | bits_to_flip = np.random.choice(32, size=6, replace=False) 29 | y[bits_to_flip] = 1 - y[bits_to_flip] 30 | 31 | data = pd.DataFrame(x) 32 | data["label"] = y 33 | 34 | # Split into 4 partitions 35 | partitions = [ray.put(part) for part in np.split(data, 4)] 36 | 37 | # Create modin df here 38 | modin_df = from_partitions(partitions, axis=0) 39 | 40 | train_set = RayDMatrix(modin_df, "label") 41 | 42 | evals_result = {} 43 | # Set XGBoost config. 44 | xgboost_params = { 45 | "tree_method": "approx", 46 | "objective": "binary:logistic", 47 | "eval_metric": ["logloss", "error"], 48 | } 49 | 50 | # Train the classifier 51 | bst = train( 52 | params=xgboost_params, 53 | dtrain=train_set, 54 | evals=[(train_set, "train")], 55 | evals_result=evals_result, 56 | ray_params=RayParams( 57 | max_actor_restarts=0, 58 | gpus_per_actor=0, 59 | cpus_per_actor=cpus_per_actor, 60 | num_actors=num_actors, 61 | ), 62 | verbose_eval=False, 63 | num_boost_round=10, 64 | ) 65 | 66 | model_path = "modin.xgb" 67 | bst.save_model(model_path) 68 | print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) 69 | 70 | 71 | if __name__ == "__main__": 72 | parser = argparse.ArgumentParser() 73 | parser.add_argument( 74 | "--address", required=False, type=str, help="the address to use for Ray" 75 | ) 76 | parser.add_argument( 77 | "--server-address", 78 | required=False, 79 | type=str, 80 | help="Address of the remote server if using Ray Client.", 81 | ) 82 | parser.add_argument( 83 | "--cpus-per-actor", 84 | type=int, 85 | default=1, 86 | help="Sets number of CPUs per xgboost training worker.", 87 | ) 88 | parser.add_argument( 89 | "--num-actors", 90 | type=int, 91 | default=4, 92 | help="Sets number of xgboost workers to use.", 93 | ) 94 | parser.add_argument("--smoke-test", action="store_true", default=False, help="gpu") 95 | 96 | args, _ = parser.parse_known_args() 97 | 98 | if args.smoke_test: 99 | ray.init(num_cpus=args.num_actors + 1) 100 | elif args.server_address: 101 | ray.util.connect(args.server_address) 102 | else: 103 | ray.init(address=args.address) 104 | 105 | main(args.cpus_per_actor, args.num_actors) 106 | -------------------------------------------------------------------------------- /xgboost_ray/examples/simple_objectstore.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import ray 6 | 7 | from xgboost_ray import RayDMatrix, RayParams, train 8 | 9 | 10 | def main(cpus_per_actor, num_actors): 11 | # Generate dataset 12 | x = np.repeat(range(8), 16).reshape((32, 4)) 13 | # Even numbers --> 0, odd numbers --> 1 14 | y = np.tile(np.repeat(range(2), 4), 4) 15 | 16 | # Flip some bits to reduce max accuracy 17 | bits_to_flip = np.random.choice(32, size=6, replace=False) 18 | y[bits_to_flip] = 1 - y[bits_to_flip] 19 | 20 | data = pd.DataFrame(x) 21 | data["label"] = y 22 | 23 | # Split into 4 partitions 24 | partitions = [ray.put(part) for part in np.split(data, 4)] 25 | 26 | train_set = RayDMatrix(partitions, "label") 27 | 28 | evals_result = {} 29 | # Set XGBoost config. 
30 | xgboost_params = { 31 | "tree_method": "approx", 32 | "objective": "binary:logistic", 33 | "eval_metric": ["logloss", "error"], 34 | } 35 | 36 | # Train the classifier 37 | bst = train( 38 | params=xgboost_params, 39 | dtrain=train_set, 40 | evals=[(train_set, "train")], 41 | evals_result=evals_result, 42 | ray_params=RayParams( 43 | max_actor_restarts=0, 44 | gpus_per_actor=0, 45 | cpus_per_actor=cpus_per_actor, 46 | num_actors=num_actors, 47 | ), 48 | verbose_eval=False, 49 | num_boost_round=10, 50 | ) 51 | 52 | model_path = "objectstore.xgb" 53 | bst.save_model(model_path) 54 | print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) 55 | 56 | 57 | if __name__ == "__main__": 58 | parser = argparse.ArgumentParser() 59 | parser.add_argument( 60 | "--address", required=False, type=str, help="the address to use for Ray" 61 | ) 62 | parser.add_argument( 63 | "--server-address", 64 | required=False, 65 | type=str, 66 | help="Address of the remote server if using Ray Client.", 67 | ) 68 | parser.add_argument( 69 | "--cpus-per-actor", 70 | type=int, 71 | default=1, 72 | help="Sets number of CPUs per xgboost training worker.", 73 | ) 74 | parser.add_argument( 75 | "--num-actors", 76 | type=int, 77 | default=4, 78 | help="Sets number of xgboost workers to use.", 79 | ) 80 | parser.add_argument("--smoke-test", action="store_true", default=False, help="Finish quickly for testing") 81 | 82 | args, _ = parser.parse_known_args() 83 | 84 | if args.smoke_test: 85 | ray.init(num_cpus=args.num_actors + 1) 86 | elif args.server_address: 87 | ray.util.connect(args.server_address) 88 | else: 89 | ray.init(address=args.address) 90 | 91 | main(args.cpus_per_actor, args.num_actors) 92 | -------------------------------------------------------------------------------- /xgboost_ray/examples/simple_partitioned.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import numpy as np 4 | import ray 5 | from sklearn import datasets 6 | from sklearn.model_selection import train_test_split 7 | 8 | from xgboost_ray import RayDMatrix, RayParams, train 9 | 10 | nc = 31 11 | 12 | 13 | @ray.remote 14 | class AnActor: 15 | """We mimic a distributed DF by having several actors create 16 | data which form the global DF. 17 | """ 18 | 19 | @ray.method(num_returns=2) 20 | def genData(self, rank, nranks, nrows): 21 | """Generate global dataset and cut out local piece. 22 | In real life each actor would of course directly create local data.
23 | """ 24 | # Load dataset 25 | data, labels = datasets.load_breast_cancer(return_X_y=True) 26 | # Split into train and test set 27 | train_x, _, train_y, _ = train_test_split(data, labels, test_size=0.25) 28 | train_y = train_y.reshape((train_y.shape[0], 1)) 29 | train = np.hstack([train_x, train_y]) 30 | assert nrows <= train.shape[0] 31 | assert nc == train.shape[1] 32 | sz = nrows // nranks 33 | return train[sz * rank : sz * (rank + 1)], ray.util.get_node_ip_address() 34 | 35 | 36 | class Parted: 37 | """Class exposing __partitioned__""" 38 | 39 | def __init__(self, parted): 40 | self.__partitioned__ = parted 41 | 42 | 43 | def main(cpus_per_actor, num_actors): 44 | nr = 424 45 | actors = [AnActor.remote() for _ in range(num_actors)] 46 | parts = [actors[i].genData.remote(i, num_actors, nr) for i in range(num_actors)] 47 | rowsperpart = nr // num_actors 48 | nr = rowsperpart * num_actors 49 | parted = Parted( 50 | { 51 | "shape": (nr, nc), 52 | "partition_tiling": (num_actors, 1), 53 | "get": lambda x: ray.get(x), 54 | "partitions": { 55 | (i, 0): { 56 | "start": (i * rowsperpart, 0), 57 | "shape": (rowsperpart, nc), 58 | "data": parts[i][0], 59 | "location": [ray.get(parts[i][1])], 60 | } 61 | for i in range(num_actors) 62 | }, 63 | } 64 | ) 65 | 66 | yl = nc - 1 67 | # Let's create DMatrix from our __partitioned__ structure 68 | train_set = RayDMatrix(parted, f"f{yl}") 69 | 70 | evals_result = {} 71 | # Set XGBoost config. 72 | xgboost_params = { 73 | "tree_method": "approx", 74 | "objective": "binary:logistic", 75 | "eval_metric": ["logloss", "error"], 76 | } 77 | 78 | # Train the classifier 79 | bst = train( 80 | params=xgboost_params, 81 | dtrain=train_set, 82 | evals=[(train_set, "train")], 83 | evals_result=evals_result, 84 | ray_params=RayParams( 85 | max_actor_restarts=0, 86 | gpus_per_actor=0, 87 | cpus_per_actor=cpus_per_actor, 88 | num_actors=num_actors, 89 | ), 90 | verbose_eval=False, 91 | num_boost_round=10, 92 | ) 93 | 94 | model_path = "partitioned.xgb" 95 | bst.save_model(model_path) 96 | print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) 97 | 98 | 99 | if __name__ == "__main__": 100 | parser = argparse.ArgumentParser() 101 | parser.add_argument( 102 | "--address", required=False, type=str, help="the address to use for Ray" 103 | ) 104 | parser.add_argument( 105 | "--server-address", 106 | required=False, 107 | type=str, 108 | help="Address of the remote server if using Ray Client.", 109 | ) 110 | parser.add_argument( 111 | "--cpus-per-actor", 112 | type=int, 113 | default=1, 114 | help="Sets number of CPUs per xgboost training worker.", 115 | ) 116 | parser.add_argument( 117 | "--num-actors", 118 | type=int, 119 | default=4, 120 | help="Sets number of xgboost workers to use.", 121 | ) 122 | parser.add_argument("--smoke-test", action="store_true", default=False, help="gpu") 123 | 124 | args, _ = parser.parse_known_args() 125 | 126 | if not ray.is_initialized(): 127 | if args.smoke_test: 128 | ray.init(num_cpus=args.num_actors + 1) 129 | elif args.server_address: 130 | ray.util.connect(args.server_address) 131 | else: 132 | ray.init(address=args.address) 133 | 134 | main(args.cpus_per_actor, args.num_actors) 135 | -------------------------------------------------------------------------------- /xgboost_ray/examples/simple_predict.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import xgboost as xgb 5 | from sklearn import datasets 6 | 7 | from xgboost_ray 
import RayDMatrix, RayParams, predict 8 | 9 | 10 | def main(): 11 | if not os.path.exists("simple.xgb"): 12 | raise ValueError( 13 | "Model file not found: `simple.xgb`" 14 | "\nFIX THIS by running `python simple.py` first to " 15 | "train the model." 16 | ) 17 | 18 | # Load dataset 19 | data, labels = datasets.load_breast_cancer(return_X_y=True) 20 | 21 | dmat_xgb = xgb.DMatrix(data, labels) 22 | dmat_ray = RayDMatrix(data, labels) 23 | 24 | bst = xgb.Booster(model_file="simple.xgb") 25 | 26 | pred_xgb = bst.predict(dmat_xgb) 27 | pred_ray = predict(bst, dmat_ray, ray_params=RayParams(num_actors=2)) 28 | 29 | np.testing.assert_array_equal(pred_xgb, pred_ray) 30 | print(pred_ray) 31 | 32 | 33 | if __name__ == "__main__": 34 | main() 35 | -------------------------------------------------------------------------------- /xgboost_ray/examples/simple_ray_dataset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import ray 6 | from xgboost import DMatrix 7 | 8 | from xgboost_ray import RayDMatrix, RayParams, train 9 | 10 | 11 | def main(cpus_per_actor, num_actors): 12 | np.random.seed(1234) 13 | # Generate dataset 14 | x = np.repeat(range(8), 16).reshape((32, 4)) 15 | # Even numbers --> 0, odd numbers --> 1 16 | y = np.tile(np.repeat(range(2), 4), 4) 17 | 18 | # Flip some bits to reduce max accuracy 19 | bits_to_flip = np.random.choice(32, size=6, replace=False) 20 | y[bits_to_flip] = 1 - y[bits_to_flip] 21 | 22 | data = pd.DataFrame(x) 23 | # Ray Datasets require all column names to be strings 24 | data.columns = [str(c) for c in data.columns] 25 | data["label"] = y 26 | 27 | ray_ds = ray.data.from_pandas(data) 28 | train_set = RayDMatrix(ray_ds, "label") 29 | 30 | evals_result = {} 31 | # Set XGBoost config.
32 | xgboost_params = { 33 | "tree_method": "approx", 34 | "objective": "binary:logistic", 35 | "eval_metric": ["logloss", "error"], 36 | } 37 | 38 | # Train the classifier 39 | bst = train( 40 | params=xgboost_params, 41 | dtrain=train_set, 42 | evals=[(train_set, "train")], 43 | evals_result=evals_result, 44 | ray_params=RayParams( 45 | max_actor_restarts=0, 46 | gpus_per_actor=0, 47 | cpus_per_actor=cpus_per_actor, 48 | num_actors=num_actors, 49 | ), 50 | verbose_eval=False, 51 | num_boost_round=10, 52 | ) 53 | 54 | model_path = "ray_datasets.xgb" 55 | bst.save_model(model_path) 56 | print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) 57 | 58 | # Distributed prediction 59 | scored = ray_ds.drop_columns(["label"]).map_batches( 60 | lambda batch: {"pred": bst.predict(DMatrix(batch))}, batch_format="pandas" 61 | ) 62 | print(scored.to_pandas()) 63 | 64 | 65 | if __name__ == "__main__": 66 | parser = argparse.ArgumentParser() 67 | parser.add_argument( 68 | "--address", required=False, type=str, help="the address to use for Ray" 69 | ) 70 | parser.add_argument( 71 | "--server-address", 72 | required=False, 73 | type=str, 74 | help="Address of the remote server if using Ray Client.", 75 | ) 76 | parser.add_argument( 77 | "--cpus-per-actor", 78 | type=int, 79 | default=1, 80 | help="Sets number of CPUs per xgboost training worker.", 81 | ) 82 | parser.add_argument( 83 | "--num-actors", 84 | type=int, 85 | default=4, 86 | help="Sets number of xgboost workers to use.", 87 | ) 88 | parser.add_argument("--smoke-test", action="store_true", default=False, help="gpu") 89 | 90 | args, _ = parser.parse_known_args() 91 | 92 | if args.smoke_test: 93 | ray.init(num_cpus=args.num_actors + 1) 94 | elif args.server_address: 95 | ray.util.connect(args.server_address) 96 | else: 97 | ray.init(address=args.address) 98 | 99 | main(args.cpus_per_actor, args.num_actors) 100 | -------------------------------------------------------------------------------- /xgboost_ray/examples/simple_tune.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import ray 5 | from ray import tune 6 | from sklearn import datasets 7 | from sklearn.model_selection import train_test_split 8 | 9 | import xgboost_ray 10 | from xgboost_ray import RayDMatrix, RayParams, train 11 | 12 | 13 | def train_breast_cancer(config, ray_params): 14 | # Load dataset 15 | data, labels = datasets.load_breast_cancer(return_X_y=True) 16 | # Split into train and test set 17 | train_x, test_x, train_y, test_y = train_test_split(data, labels, test_size=0.25) 18 | 19 | train_set = RayDMatrix(train_x, train_y) 20 | test_set = RayDMatrix(test_x, test_y) 21 | 22 | evals_result = {} 23 | 24 | bst = train( 25 | params=config, 26 | dtrain=train_set, 27 | evals=[(test_set, "eval")], 28 | evals_result=evals_result, 29 | ray_params=ray_params, 30 | verbose_eval=False, 31 | num_boost_round=10, 32 | ) 33 | 34 | model_path = "tuned.xgb" 35 | bst.save_model(model_path) 36 | print("Final validation error: {:.4f}".format(evals_result["eval"]["error"][-1])) 37 | 38 | 39 | def main(cpus_per_actor, num_actors, num_samples): 40 | # Set XGBoost config. 
41 | config = { 42 | "tree_method": "approx", 43 | "objective": "binary:logistic", 44 | "eval_metric": ["logloss", "error"], 45 | "eta": tune.loguniform(1e-4, 1e-1), 46 | "subsample": tune.uniform(0.5, 1.0), 47 | "max_depth": tune.randint(1, 9), 48 | } 49 | 50 | ray_params = RayParams( 51 | max_actor_restarts=1, 52 | gpus_per_actor=0, 53 | cpus_per_actor=cpus_per_actor, 54 | num_actors=num_actors, 55 | ) 56 | 57 | analysis = tune.run( 58 | tune.with_parameters(train_breast_cancer, ray_params=ray_params), 59 | # Use the `get_tune_resources` helper function to set the resources. 60 | resources_per_trial=ray_params.get_tune_resources(), 61 | config=config, 62 | num_samples=num_samples, 63 | metric="eval-error", 64 | mode="min", 65 | ) 66 | 67 | # Load the best model checkpoint. 68 | best_bst = xgboost_ray.tune.load_model( 69 | os.path.join(analysis.best_trial.local_path, "tuned.xgb") 70 | ) 71 | 72 | best_bst.save_model("best_model.xgb") 73 | 74 | accuracy = 1.0 - analysis.best_result["eval-error"] 75 | print(f"Best model parameters: {analysis.best_config}") 76 | print(f"Best model total accuracy: {accuracy:.4f}") 77 | 78 | 79 | if __name__ == "__main__": 80 | parser = argparse.ArgumentParser() 81 | parser.add_argument( 82 | "--address", required=False, type=str, help="the address to use for Ray" 83 | ) 84 | parser.add_argument( 85 | "--server-address", 86 | required=False, 87 | type=str, 88 | help="Address of the remote server if using Ray Client.", 89 | ) 90 | parser.add_argument( 91 | "--cpus-per-actor", 92 | type=int, 93 | default=1, 94 | help="Sets number of CPUs per XGBoost training worker.", 95 | ) 96 | parser.add_argument( 97 | "--num-actors", 98 | type=int, 99 | default=1, 100 | help="Sets number of XGBoost workers to use.", 101 | ) 102 | parser.add_argument( 103 | "--num-samples", type=int, default=4, help="Number of samples to use for Tune." 104 | ) 105 | parser.add_argument("--smoke-test", action="store_true", default=False) 106 | 107 | args, _ = parser.parse_known_args() 108 | 109 | if args.smoke_test: 110 | ray.init(num_cpus=args.num_actors * args.num_samples) 111 | elif args.server_address: 112 | ray.util.connect(args.server_address) 113 | else: 114 | ray.init(address=args.address) 115 | 116 | main(args.cpus_per_actor, args.num_actors, args.num_samples) 117 | -------------------------------------------------------------------------------- /xgboost_ray/examples/train_on_test_data.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import shutil 4 | import time 5 | 6 | from xgboost_ray import RayDMatrix, RayParams, train 7 | from xgboost_ray.tests.utils import create_parquet_in_tempdir 8 | 9 | #### 10 | # Run `create_test_data.py` first to create a large fake data set. 11 | # Alternatively, run with `--smoke-test` to create an ephemeral small fake 12 | # data set. 
13 | #### 14 | 15 | 16 | def main(fname, num_actors=2): 17 | dtrain = RayDMatrix(os.path.abspath(fname), label="labels", ignore=["partition"]) 18 | 19 | config = { 20 | "tree_method": "hist", 21 | "eval_metric": ["logloss", "error"], 22 | } 23 | 24 | evals_result = {} 25 | 26 | start = time.time() 27 | bst = train( 28 | config, 29 | dtrain, 30 | evals_result=evals_result, 31 | ray_params=RayParams(max_actor_restarts=1, num_actors=num_actors), 32 | num_boost_round=10, 33 | evals=[(dtrain, "train")], 34 | ) 35 | taken = time.time() - start 36 | print(f"TRAIN TIME TAKEN: {taken:.2f} seconds") 37 | 38 | bst.save_model("test_data.xgb") 39 | print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) 40 | 41 | 42 | if __name__ == "__main__": 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument( 45 | "--smoke-test", 46 | action="store_true", 47 | default=False, 48 | help="Finish quickly for testing", 49 | ) 50 | args = parser.parse_args() 51 | 52 | temp_dir, path = None, None 53 | if args.smoke_test: 54 | temp_dir, path = create_parquet_in_tempdir( 55 | "smoketest.parquet", 56 | num_rows=1_000, 57 | num_features=4, 58 | num_classes=2, 59 | num_partitions=2, 60 | ) 61 | else: 62 | path = os.path.join(os.path.dirname(__file__), "parted.parquet") 63 | 64 | import ray 65 | 66 | ray.init() 67 | 68 | start = time.time() 69 | main(path) 70 | taken = time.time() - start 71 | print(f"TOTAL TIME TAKEN: {taken:.2f} seconds") 72 | 73 | if args.smoke_test: 74 | shutil.rmtree(temp_dir) 75 | -------------------------------------------------------------------------------- /xgboost_ray/examples/train_with_ml_dataset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import shutil 4 | import time 5 | 6 | from ray.util.data import read_parquet 7 | 8 | from xgboost_ray import RayDMatrix, RayParams, train 9 | from xgboost_ray.tests.utils import create_parquet_in_tempdir 10 | 11 | #### 12 | # Run `create_test_data.py` first to create a large fake data set. 13 | # Alternatively, run with `--smoke-test` to create an ephemeral small fake 14 | # data set. 
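# Unlike `train_on_test_data.py`, this example first wraps the Parquet data
# in a Ray MLDataset via `read_parquet(..., num_shards=num_actors)`, so each
# training actor is fed exactly one shard.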
15 | #### 16 | 17 | 18 | def main(fname, num_actors=2): 19 | ml_dataset = read_parquet(fname, num_shards=num_actors) 20 | 21 | dtrain = RayDMatrix(ml_dataset, label="labels", ignore=["partition"]) 22 | 23 | config = { 24 | "tree_method": "hist", 25 | "eval_metric": ["logloss", "error"], 26 | } 27 | 28 | evals_result = {} 29 | 30 | start = time.time() 31 | bst = train( 32 | config, 33 | dtrain, 34 | evals_result=evals_result, 35 | ray_params=RayParams(max_actor_restarts=1, num_actors=num_actors), 36 | num_boost_round=10, 37 | evals=[(dtrain, "train")], 38 | ) 39 | taken = time.time() - start 40 | print(f"TRAIN TIME TAKEN: {taken:.2f} seconds") 41 | 42 | bst.save_model("test_data.xgb") 43 | print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) 44 | 45 | 46 | if __name__ == "__main__": 47 | parser = argparse.ArgumentParser() 48 | parser.add_argument( 49 | "--smoke-test", 50 | action="store_true", 51 | default=False, 52 | help="Finish quickly for testing", 53 | ) 54 | args = parser.parse_args() 55 | 56 | temp_dir, path = None, None 57 | if args.smoke_test: 58 | temp_dir, path = create_parquet_in_tempdir( 59 | "smoketest.parquet", 60 | num_rows=1_000, 61 | num_features=4, 62 | num_classes=2, 63 | num_partitions=2, 64 | ) 65 | else: 66 | path = os.path.join(os.path.dirname(__file__), "parted.parquet") 67 | 68 | import ray 69 | 70 | ray.init() 71 | 72 | start = time.time() 73 | main(path) 74 | taken = time.time() - start 75 | print(f"TOTAL TIME TAKEN: {taken:.2f} seconds") 76 | 77 | if args.smoke_test: 78 | shutil.rmtree(temp_dir) 79 | -------------------------------------------------------------------------------- /xgboost_ray/session.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from ray.util.annotations import DeveloperAPI, PublicAPI 4 | from ray.util.queue import Queue 5 | 6 | 7 | @DeveloperAPI 8 | class RayXGBoostSession: 9 | def __init__(self, rank: int, queue: Optional[Queue]): 10 | self._rank = rank 11 | self._queue = queue 12 | 13 | def get_actor_rank(self): 14 | return self._rank 15 | 16 | def set_queue(self, queue): 17 | self._queue = queue 18 | 19 | def put_queue(self, item): 20 | if self._queue is None: 21 | raise ValueError( 22 | "Trying to put something into session queue, but queue " 23 | "was not initialized. This is probably a bug, please raise " 24 | "an issue at https://github.com/ray-project/xgboost_ray" 25 | ) 26 | self._queue.put((self._rank, item)) 27 | 28 | 29 | _session = None 30 | 31 | 32 | @DeveloperAPI 33 | def init_session(*args, **kwargs): 34 | global _session 35 | if _session: 36 | raise ValueError( 37 | "Trying to initialize RayXGBoostSession twice." 38 | "\nFIX THIS by not calling `init_session()` manually." 39 | ) 40 | _session = RayXGBoostSession(*args, **kwargs) 41 | 42 | 43 | @DeveloperAPI 44 | def get_session() -> RayXGBoostSession: 45 | global _session 46 | if not _session or not isinstance(_session, RayXGBoostSession): 47 | raise ValueError( 48 | "Trying to access RayXGBoostSession from outside an XGBoost run." 49 | "\nFIX THIS by calling functions in `session.py` like " 50 | "`get_actor_rank()` only from within an XGBoost actor session."
51 | ) 52 | return _session 53 | 54 | 55 | @DeveloperAPI 56 | def set_session_queue(queue: Queue): 57 | session = get_session() 58 | session.set_queue(queue) 59 | 60 | 61 | @PublicAPI 62 | def get_actor_rank() -> int: 63 | session = get_session() 64 | return session.get_actor_rank() 65 | 66 | 67 | @PublicAPI 68 | def get_rabit_rank() -> int: 69 | import xgboost as xgb 70 | 71 | try: 72 | # From xgboost>=1.7.0, rabit is replaced by a collective communicator 73 | return xgb.collective.get_rank() 74 | except (ImportError, AttributeError): 75 | return xgb.rabit.get_rank() 76 | 77 | 78 | @PublicAPI 79 | def put_queue(*args, **kwargs): 80 | session = get_session() 81 | session.put_queue(*args, **kwargs) 82 | -------------------------------------------------------------------------------- /xgboost_ray/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ray-project/xgboost_ray/e9049256575e5bdd956b369cf86e94a298d11048/xgboost_ray/tests/__init__.py -------------------------------------------------------------------------------- /xgboost_ray/tests/conftest.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | from functools import partial 3 | 4 | import pytest 5 | import ray 6 | 7 | try: 8 | # Ray 1.3+ 9 | from ray._private.cluster_utils import Cluster 10 | except ImportError: 11 | from ray.cluster_utils import Cluster 12 | 13 | 14 | def get_default_fixture_system_config(): 15 | system_config = { 16 | "object_timeout_milliseconds": 200, 17 | "health_check_initial_delay_ms": 0, 18 | "health_check_failure_threshold": 10, 19 | "object_store_full_delay_ms": 100, 20 | } 21 | return system_config 22 | 23 | 24 | def get_default_fixture_ray_kwargs(): 25 | system_config = get_default_fixture_system_config() 26 | ray_kwargs = { 27 | "num_cpus": 1, 28 | "object_store_memory": 150 * 1024 * 1024, 29 | "dashboard_port": None, 30 | "namespace": "default_test_namespace", 31 | "_system_config": system_config, 32 | } 33 | return ray_kwargs 34 | 35 | 36 | @contextmanager 37 | def _ray_start_cluster(**kwargs): 38 | init_kwargs = get_default_fixture_ray_kwargs() 39 | num_nodes = 0 40 | do_init = False 41 | # num_nodes & do_init are not arguments for ray.init, so delete them. 42 | if "num_nodes" in kwargs: 43 | num_nodes = kwargs["num_nodes"] 44 | del kwargs["num_nodes"] 45 | if "do_init" in kwargs: 46 | do_init = kwargs["do_init"] 47 | del kwargs["do_init"] 48 | elif num_nodes > 0: 49 | do_init = True 50 | init_kwargs.update(kwargs) 51 | cluster = Cluster() 52 | remote_nodes = [] 53 | for i in range(num_nodes): 54 | if i > 0 and "_system_config" in init_kwargs: 55 | del init_kwargs["_system_config"] 56 | remote_nodes.append(cluster.add_node(**init_kwargs)) 57 | # We assume the driver will connect to the head (first node), 58 | # so ray.init will be invoked if do_init is true 59 | if len(remote_nodes) == 1 and do_init: 60 | ray.init(address=cluster.address) 61 | yield cluster 62 | # The code after the yield will run as teardown code. 63 | ray.shutdown() 64 | cluster.shutdown() 65 | 66 | 67 | # This fixture will start a cluster with empty nodes.
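# A minimal usage sketch, mirroring `test_colocation.py` below: decorate a
# unittest class with `@pytest.mark.usefixtures("ray_start_cluster")`, then
# open the cluster inside the test itself:
#
#     with self.ray_start_cluster() as cluster:
#         cluster.add_node(num_cpus=2)
#         cluster.wait_for_nodes()
#         ray.init(address=cluster.address)
#
# Teardown (`ray.shutdown()` and `cluster.shutdown()`) runs automatically
# after the `yield` in `_ray_start_cluster` above.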
68 | @pytest.fixture(scope="function") 69 | def ray_start_cluster(request): 70 | param = getattr(request, "param", {}) 71 | request.cls.ray_start_cluster = partial(_ray_start_cluster, **param) 72 | -------------------------------------------------------------------------------- /xgboost_ray/tests/env_info.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # shellcheck disable=SC2005 3 | 4 | echo "Test environment information" 5 | echo "----------------------------" 6 | echo "Python version: $(python --version 2>/dev/null || echo 'Python not installed')" 7 | echo "Ray version: $(ray --version 2>/dev/null || echo 'Ray not installed')" 8 | echo "Installed pip packages:" 9 | echo "$(python -m pip freeze 2>/dev/null || echo 'Pip not installed')" 10 | echo "----------------------------" 11 | -------------------------------------------------------------------------------- /xgboost_ray/tests/fault_tolerance.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from collections import defaultdict 4 | from typing import Dict, Set, Tuple 5 | 6 | import ray 7 | from ray.actor import ActorHandle 8 | 9 | from xgboost_ray.callback import DistributedCallback 10 | from xgboost_ray.compat import TrainingCallback 11 | from xgboost_ray.session import get_actor_rank 12 | 13 | 14 | @ray.remote(num_cpus=0) 15 | class FaultToleranceManager: 16 | def __init__(self, start_boost_round: int = 0): 17 | self.global_boost_round = start_boost_round 18 | 19 | # Dict from boost_round -> actor ranks to die 20 | self.scheduled_kill: Dict[int, Set[int]] = defaultdict(set) 21 | 22 | # Dict from actor rank -> starts/ends of boost rounds to sleep 23 | self.delayed_return: Dict[int, Set[Tuple[int, int]]] = defaultdict(set) 24 | 25 | # List of tuples (global_boost_round, actor_boost_round) to log 26 | # actor iterations 27 | self.training_logs = defaultdict(list) 28 | 29 | def schedule_kill(self, rank: int, boost_round: int): 30 | """Kill an actor when reaching this global boost round""" 31 | self.scheduled_kill[boost_round].add(rank) 32 | 33 | def delay_return(self, rank: int, start_boost_round: int, end_boost_round: int): 34 | """Do not allow an actor to finish data loading between these rounds""" 35 | self.delayed_return[rank].add((start_boost_round, end_boost_round)) 36 | 37 | def inc_boost_round(self, rank: int): 38 | """Increase global boosting round""" 39 | if rank == 0: 40 | self.global_boost_round += 1 41 | 42 | def log_iteration(self, rank: int, boost_round: int): 43 | """Log iteration""" 44 | self.training_logs[rank].append((self.global_boost_round, boost_round)) 45 | 46 | def should_die(self, rank: int): 47 | """Returns True if the actor should terminate the training job now.""" 48 | die = False 49 | for round in range(self.global_boost_round + 1): 50 | # Loop through all rounds until now to deal with race conditions 51 | if rank in self.scheduled_kill[round]: 52 | self.scheduled_kill[round].remove(rank) 53 | die = True 54 | return die 55 | 56 | def should_sleep(self, rank: int): 57 | """Returns True if the actor should not finish data loading, yet.""" 58 | if self.delayed_return[rank]: 59 | for start, end in self.delayed_return[rank]: 60 | if start <= self.global_boost_round < end: 61 | return True 62 | return False 63 | 64 | def get_logs(self): 65 | return self.training_logs 66 | 67 | 68 | class DelayedLoadingCallback(DistributedCallback): 69 | """Used to control when actors return to training""" 70 
| 71 | def __init__(self, ft_manager: ActorHandle, reload_data=True, sleep_time=0.5): 72 | self.ft_manager = ft_manager 73 | self.reload_data = reload_data 74 | self.sleep_time = sleep_time 75 | 76 | def after_data_loading(self, actor, data, *args, **kwargs): 77 | print(f"Rank {actor.rank} - after load") 78 | while ray.get(self.ft_manager.should_sleep.remote(actor.rank)): 79 | time.sleep(self.sleep_time) 80 | print(f"Rank {actor.rank} - returning now") 81 | 82 | 83 | class DieCallback(TrainingCallback): 84 | """Used to control when actors should die during training. 85 | 86 | Also can add delay to each boosting round. 87 | """ 88 | 89 | def __init__(self, ft_manager: ActorHandle, training_delay: float = 0): 90 | self.ft_manager = ft_manager 91 | self.training_delay = training_delay 92 | super(DieCallback, self).__init__() 93 | 94 | def before_iteration(self, model, epoch, evals_log): 95 | if ray.get(self.ft_manager.should_die.remote(get_actor_rank())): 96 | pid = os.getpid() 97 | print(f"Killing process: {pid}") 98 | print(f"Rank {get_actor_rank()} will now die.") 99 | time.sleep(1) 100 | os.kill(pid, 9) 101 | time.sleep(10) # Don't continue training, just die 102 | 103 | def after_iteration(self, model, epoch, evals_log): 104 | # ray.get to make sure this is up to date in the next iteration 105 | ray.get(self.ft_manager.log_iteration.remote(get_actor_rank(), epoch)) 106 | if self.training_delay > 0: 107 | time.sleep(self.training_delay) 108 | if get_actor_rank() == 0: 109 | ray.get(self.ft_manager.inc_boost_round.remote(get_actor_rank())) 110 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/benchmark_cpu_gpu.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import os 4 | import shutil 5 | import time 6 | 7 | import ray 8 | 9 | from xgboost_ray import ( 10 | RayDeviceQuantileDMatrix, 11 | RayDMatrix, 12 | RayFileType, 13 | RayParams, 14 | train, 15 | ) 16 | from xgboost_ray.tests.utils import create_parquet_in_tempdir 17 | 18 | if "OMP_NUM_THREADS" in os.environ: 19 | del os.environ["OMP_NUM_THREADS"] 20 | 21 | 22 | def train_ray( 23 | path, 24 | num_workers, 25 | num_boost_rounds, 26 | num_files=0, 27 | regression=False, 28 | use_gpu=False, 29 | smoke_test=False, 30 | ray_params=None, 31 | xgboost_params=None, 32 | **kwargs, 33 | ): 34 | if num_files: 35 | files = sorted(glob.glob(f"{path}/**/*.parquet")) 36 | while num_files > len(files): 37 | files = files + files 38 | path = files[0:num_files] 39 | 40 | use_device_matrix = False 41 | if use_gpu: 42 | try: 43 | import cupy # noqa: F401 44 | 45 | use_device_matrix = True 46 | except ImportError: 47 | use_device_matrix = False 48 | 49 | if use_device_matrix: 50 | dtrain = RayDeviceQuantileDMatrix( 51 | path, 52 | num_actors=num_workers, 53 | label="labels", 54 | ignore=["partition"], 55 | filetype=RayFileType.PARQUET, 56 | ) 57 | else: 58 | dtrain = RayDMatrix( 59 | path, 60 | num_actors=num_workers, 61 | label="labels", 62 | ignore=["partition"], 63 | filetype=RayFileType.PARQUET, 64 | ) 65 | 66 | config = xgboost_params or {"tree_method": "hist" if not use_gpu else "gpu_hist"} 67 | if not regression: 68 | # Classification 69 | config.update( 70 | { 71 | "objective": "binary:logistic", 72 | "eval_metric": ["logloss", "error"], 73 | } 74 | ) 75 | else: 76 | # Regression 77 | config.update( 78 | { 79 | "objective": "reg:squarederror", 80 | "eval_metric": ["logloss", "rmse"], 81 | } 82 | ) 83 | 84 | start = 
time.time() 85 | evals_result = {} 86 | bst = train( 87 | config, 88 | dtrain, 89 | evals_result=evals_result, 90 | num_boost_round=num_boost_rounds, 91 | ray_params=ray_params 92 | or RayParams( 93 | max_actor_restarts=2, 94 | num_actors=num_workers, 95 | cpus_per_actor=4 if not smoke_test else 1, 96 | gpus_per_actor=0 if not use_gpu else 1, 97 | ), 98 | evals=[(dtrain, "train")], 99 | **kwargs, 100 | ) 101 | taken = time.time() - start 102 | print(f"TRAIN TIME TAKEN: {taken:.2f} seconds") 103 | 104 | bst.save_model("benchmark_{}.xgb".format("cpu" if not use_gpu else "gpu")) 105 | print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) 106 | return bst, taken 107 | 108 | 109 | if __name__ == "__main__": 110 | parser = argparse.ArgumentParser(description="Process some integers.") 111 | 112 | parser.add_argument("num_workers", type=int, help="num workers") 113 | parser.add_argument("num_rounds", type=int, help="num boost rounds") 114 | parser.add_argument("num_files", type=int, help="num files") 115 | 116 | parser.add_argument( 117 | "--file", default="/data/parted.parquet", type=str, help="data file" 118 | ) 119 | 120 | parser.add_argument( 121 | "--regression", action="store_true", default=False, help="regression" 122 | ) 123 | 124 | parser.add_argument("--gpu", action="store_true", default=False, help="gpu") 125 | 126 | parser.add_argument( 127 | "--smoke-test", action="store_true", default=False, help="smoke test" 128 | ) 129 | 130 | args = parser.parse_args() 131 | 132 | num_workers = args.num_workers 133 | num_boost_rounds = args.num_rounds 134 | num_files = args.num_files 135 | use_gpu = args.gpu 136 | 137 | temp_dir = None 138 | if args.smoke_test: 139 | temp_dir, path = create_parquet_in_tempdir( 140 | filename="smoketest.parquet", 141 | num_rows=args.num_workers * 500, 142 | num_features=4, 143 | num_classes=2, 144 | num_partitions=args.num_workers * 10, 145 | ) 146 | use_gpu = False 147 | else: 148 | path = args.file 149 | if not os.path.exists(path): 150 | raise ValueError( 151 | f"Benchmarking data not found: {path}." 152 | f"\nFIX THIS by running `python create_test_data.py` first." 
153 | ) 154 | 155 | init_start = time.time() 156 | if args.smoke_test: 157 | ray.init(num_cpus=num_workers) 158 | else: 159 | ray.init(address="auto") 160 | init_taken = time.time() - init_start 161 | 162 | full_start = time.time() 163 | bst, train_taken = train_ray( 164 | path=path, 165 | num_workers=num_workers, 166 | num_boost_rounds=num_boost_rounds, 167 | num_files=num_files, 168 | regression=args.regression, 169 | use_gpu=use_gpu, 170 | smoke_test=args.smoke_test, 171 | ) 172 | full_taken = time.time() - full_start 173 | print(f"TOTAL TIME TAKEN: {full_taken:.2f} seconds " f"({init_taken:.2f} for init)") 174 | 175 | if args.smoke_test: 176 | shutil.rmtree(temp_dir, ignore_errors=True) 177 | else: 178 | with open("res.csv", "at") as fp: 179 | fp.writelines( 180 | [ 181 | ",".join( 182 | [ 183 | str(e) 184 | for e in [ 185 | num_workers, 186 | num_files, 187 | int(use_gpu), 188 | num_boost_rounds, 189 | init_taken, 190 | full_taken, 191 | train_taken, 192 | ] 193 | ] 194 | ) 195 | + "\n" 196 | ] 197 | ) 198 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/cluster_cpu.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: xgboost_ray_release_tests_cpu_{{env["NUM_WORKERS"] | default(0)}} 2 | 3 | max_workers: {{env["NUM_WORKERS"] | default(0)}} 4 | upscaling_speed: 9999 5 | 6 | idle_timeout_minutes: 15 7 | 8 | docker: 9 | image: anyscale/ray:nightly 10 | container_name: ray_container 11 | pull_before_run: true 12 | run_options: 13 | - --privileged 14 | 15 | provider: 16 | type: aws 17 | region: us-west-2 18 | availability_zone: us-west-2a 19 | cache_stopped_nodes: false 20 | 21 | available_node_types: 22 | cpu_4_ondemand: 23 | node_config: 24 | InstanceType: m5.xlarge 25 | resources: {"CPU": 4} 26 | min_workers: {{env["NUM_WORKERS"] | default(0)}} 27 | max_workers: {{env["NUM_WORKERS"] | default(0)}} 28 | 29 | auth: 30 | ssh_user: ubuntu 31 | 32 | head_node_type: cpu_4_ondemand 33 | worker_default_node_type: cpu_4_ondemand 34 | 35 | file_mounts_sync_continuously: false 36 | 37 | setup_commands: 38 | - pip install -U {{env["RAY_WHEEL"] | default("ray")}} 39 | - pip install dask pytest 40 | - pip install -U {{env["XGBOOST_RAY_PACKAGE"] | default("xgboost_ray")}} 41 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/cluster_ft.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: xgboost_ray_release_tests_ft_cluster 2 | 3 | max_workers: 9 4 | 5 | upscaling_speed: 32 6 | 7 | idle_timeout_minutes: 15 8 | 9 | docker: 10 | image: anyscale/ray:nightly 11 | container_name: ray_container 12 | pull_before_run: true 13 | 14 | provider: 15 | type: aws 16 | region: us-west-2 17 | availability_zone: us-west-2a 18 | cache_stopped_nodes: false 19 | 20 | available_node_types: 21 | cpu_16_ondemand: 22 | node_config: 23 | InstanceType: m5.4xlarge 24 | resources: {"CPU": 16} 25 | min_workers: 9 26 | max_workers: 9 27 | 28 | file_mounts: { 29 | "/release_tests": "./" 30 | } 31 | 32 | 33 | auth: 34 | ssh_user: ubuntu 35 | 36 | head_node_type: cpu_16_ondemand 37 | worker_default_node_type: cpu_16_ondemand 38 | 39 | setup_commands: 40 | - pip install -U awscli fsspec petastorm s3fs botocore 41 | - pip install -U {{env["RAY_WHEEL"] | default("ray")}} 42 | - export XGBOOST_RAY_PACKAGE="{{env["XGBOOST_RAY_PACKAGE"] | default("xgboost_ray")}}" && /bin/bash ~/xgboost_tests/setup_xgboost.sh 43 | 44 | 
file_mounts_sync_continuously: false 45 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/cluster_gpu.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: xgboost_ray_release_tests_gpu_{{env["NUM_WORKERS"] | default(0)}} 2 | 3 | max_workers: {{env["NUM_WORKERS"] | default(0)}} 4 | upscaling_speed: 9999 5 | 6 | idle_timeout_minutes: 15 7 | 8 | docker: 9 | image: anyscale/ray:nightly-gpu 10 | container_name: ray_container 11 | pull_before_run: true 12 | run_options: 13 | - --privileged 14 | 15 | provider: 16 | type: aws 17 | region: us-west-2 18 | availability_zone: us-west-2a 19 | cache_stopped_nodes: false 20 | 21 | available_node_types: 22 | gpu_4_ondemand: 23 | node_config: 24 | InstanceType: p2.xlarge 25 | resources: {"CPU": 4, "GPU": 1} 26 | min_workers: {{env["NUM_WORKERS"] | default(0)}} 27 | max_workers: {{env["NUM_WORKERS"] | default(0)}} 28 | 29 | auth: 30 | ssh_user: ubuntu 31 | 32 | head_node_type: gpu_4_ondemand 33 | worker_default_node_type: gpu_4_ondemand 34 | 35 | file_mounts: { 36 | "~/xgboost_tests": "." 37 | } 38 | 39 | file_mounts_sync_continuously: false 40 | 41 | setup_commands: 42 | - pip install -U pyarrow cupy-cuda101 43 | - pip install -U {{env["RAY_WHEEL"] | default("ray")}} 44 | - export XGBOOST_RAY_PACKAGE="{{env["XGBOOST_RAY_PACKAGE"] | default("xgboost_ray")}}" && /bin/bash ~/xgboost_tests/setup_xgboost.sh 45 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/create_learnable_data.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.datasets import make_classification, make_regression 7 | 8 | if __name__ == "__main__": 9 | if "OMP_NUM_THREADS" in os.environ: 10 | del os.environ["OMP_NUM_THREADS"] 11 | 12 | parser = argparse.ArgumentParser(description="Create fake data.") 13 | parser.add_argument("filename", type=str, default="/data/parted.parquet/") 14 | parser.add_argument( 15 | "-r", "--num-rows", required=False, type=int, default=1e8, help="num rows" 16 | ) 17 | parser.add_argument( 18 | "-p", 19 | "--num-partitions", 20 | required=False, 21 | type=int, 22 | default=100, 23 | help="num partitions", 24 | ) 25 | parser.add_argument( 26 | "-c", 27 | "--num-cols", 28 | required=False, 29 | type=int, 30 | default=4, 31 | help="num columns (features)", 32 | ) 33 | parser.add_argument( 34 | "-C", "--num-classes", required=False, type=int, default=2, help="num classes" 35 | ) 36 | parser.add_argument( 37 | "-s", "--seed", required=False, type=int, default=1234, help="random seed" 38 | ) 39 | parser.add_argument( 40 | "-T", 41 | "--target", 42 | required=False, 43 | type=float, 44 | default=0.8, 45 | help="target accuracy", 46 | ) 47 | 48 | args = parser.parse_args() 49 | 50 | seed = int(args.seed) 51 | np.random.seed(seed) 52 | 53 | num_rows = int(args.num_rows) 54 | num_cols = int(args.num_cols) 55 | num_classes = int(args.num_classes) 56 | target = float(args.target) 57 | 58 | if num_classes > 0: 59 | x, y = make_classification( 60 | n_samples=num_rows, 61 | n_features=num_cols, 62 | n_informative=num_cols // 2, 63 | n_redundant=num_cols // 10, 64 | n_repeated=0, 65 | n_classes=num_classes, 66 | n_clusters_per_class=2, 67 | flip_y=1 - target, 68 | random_state=seed, 69 | ) 70 | else: 71 | x, y = make_regression( 72 | n_samples=num_rows, 73 | n_features=num_cols, 
74 | n_informative=num_cols // 2, 75 | n_targets=1, 76 | noise=0.1, 77 | random_state=seed, 78 | ) 79 | 80 | filename = args.filename 81 | num_partitions = args.num_partitions 82 | 83 | data = pd.DataFrame(x, columns=[f"feature_{i}" for i in range(num_cols)]) 84 | 85 | rows_per_partition = len(data) // num_partitions # np.repeat requires integer repeats 86 | 87 | partition_arr = np.repeat(np.arange(num_partitions), repeats=rows_per_partition) 88 | if len(partition_arr) < len(data): 89 | # If this was not evenly divided, append 90 | missing = len(data) - len(partition_arr) 91 | partition_arr = np.append(partition_arr, np.arange(missing)) 92 | 93 | partition = pd.Series(partition_arr, copy=False, dtype=np.int32) 94 | 95 | data["labels"] = y 96 | data["partition"] = partition 97 | 98 | os.makedirs(filename, 0o755, exist_ok=True) 99 | 100 | # Write partition-wise to avoid OOM errors 101 | for i in range(num_partitions): 102 | part = data[partition_arr == i] 103 | part.to_parquet( 104 | filename, 105 | partition_cols=["partition"], 106 | engine="pyarrow", 107 | partition_filename_cb=lambda key: f"part_{key[0]}.parquet", 108 | ) 109 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/create_test_data.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import numpy as np 5 | 6 | from xgboost_ray.tests.utils import create_parquet 7 | 8 | if __name__ == "__main__": 9 | if "OMP_NUM_THREADS" in os.environ: 10 | del os.environ["OMP_NUM_THREADS"] 11 | 12 | parser = argparse.ArgumentParser(description="Create fake data.") 13 | parser.add_argument( 14 | "filename", type=str, default="/data/parted.parquet/", help="output data path" 15 | ) 16 | parser.add_argument( 17 | "-r", "--num-rows", required=False, type=int, default=1e8, help="num rows" 18 | ) 19 | parser.add_argument( 20 | "-p", 21 | "--num-partitions", 22 | required=False, 23 | type=int, 24 | default=100, 25 | help="num partitions", 26 | ) 27 | parser.add_argument( 28 | "-c", 29 | "--num-cols", 30 | required=False, 31 | type=int, 32 | default=4, 33 | help="num columns (features)", 34 | ) 35 | parser.add_argument( 36 | "-C", "--num-classes", required=False, type=int, default=2, help="num classes" 37 | ) 38 | parser.add_argument( 39 | "-s", "--seed", required=False, type=int, default=1234, help="random seed" 40 | ) 41 | 42 | args = parser.parse_args() 43 | 44 | np.random.seed(args.seed) 45 | create_parquet( 46 | args.filename, 47 | num_rows=int(args.num_rows), 48 | num_partitions=int(args.num_partitions), 49 | num_features=int(args.num_cols), 50 | num_classes=int(args.num_classes), 51 | ) 52 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/custom_objective_metric.py: -------------------------------------------------------------------------------- 1 | import ray 2 | 3 | from xgboost_ray.tests.test_xgboost_api import XGBoostAPITest 4 | 5 | 6 | class XGBoostDistributedAPITest(XGBoostAPITest): 7 | def _init_ray(self): 8 | if not ray.is_initialized(): 9 | ray.init(address="auto") 10 | 11 | 12 | if __name__ == "__main__": 13 | import sys 14 | 15 | import pytest 16 | 17 | sys.exit(pytest.main(["-v", f"{__file__}::XGBoostDistributedAPITest"])) 18 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/run_e2e_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ !
-f "./.anyscale.yaml" ]; then 4 | echo "Anyscale project not initialized. Please run 'anyscale init'" 5 | exit 1 6 | fi 7 | 8 | NOW=$(date +%s) 9 | export SESSION_NAME="xgboost_ray_ci_gpu_${NOW}" 10 | export NUM_WORKERS=3 11 | export XGBOOST_RAY_PACKAGE="git+https://github.com/ray-project/xgboost_ray.git@${GITHUB_SHA:-master}#egg=xgboost_ray" 12 | export NO_TMUX=1 13 | 14 | ./start_gpu_cluster.sh 15 | ./submit_cpu_gpu_benchmark.sh 4 100 100 --gpu --file /data/classification.parquet 16 | anyscale down "${SESSION_NAME}" 17 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/setup_xgboost.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pip install pytest 4 | # Uninstall any existing xgboost_ray repositories 5 | pip uninstall -y xgboost_ray || true 6 | 7 | # Install xgboost package 8 | pip install -U "${XGBOOST_RAY_PACKAGE:-xgboost_ray}" 9 | 10 | # Create test dataset 11 | sudo mkdir -p /data || true 12 | sudo chown ray:1000 /data || true 13 | rm -rf /data/classification.parquet || true 14 | cp -R /tmp/ray_tmp_mount/xgboost_tests ~/xgboost_tests || echo "Copy failed" 15 | python ~/xgboost_tests/create_test_data.py /data/classification.parquet --seed 1234 --num-rows 1000000 --num-cols 40 --num-partitions 100 --num-classes 2 16 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/start_cpu_cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ ! -f "./.anyscale.yaml" ]; then 4 | echo "Anyscale project not initialized. Please run 'anyscale init'" 5 | exit 1 6 | fi 7 | 8 | export XGBOOST_RAY_PACKAGE="${XGBOOST_RAY_PACKAGE:-xgboost_ray}" 9 | export NUM_WORKERS="${NUM_WORKERS:-3}" 10 | 11 | SESSION_NAME=${SESSION_NAME:-xgboost_ray_release_cpu_$(date +%s)} 12 | 13 | echo "Starting GPU cluster with ${NUM_WORKERS} worker nodes (plus the head node)" 14 | echo "This will install xgboost_ray using the following package: ${XGBOOST_RAY_PACKAGE}" 15 | 16 | CMD="anyscale up --cloud-name anyscale_default_cloud --config cluster_cpu.yaml ${SESSION_NAME}" 17 | 18 | echo "Running: ${CMD}" 19 | ${CMD} 20 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/start_ft_cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ ! -f "./.anyscale.yaml" ]; then 4 | echo "Anyscale project not initialized. Please run 'anyscale init'" 5 | exit 1 6 | fi 7 | 8 | export XGBOOST_RAY_PACKAGE="${XGBOOST_RAY_PACKAGE:-xgboost_ray}" 9 | 10 | SESSION_NAME=${SESSION_NAME:-xgboost_ray_release_ft_$(date +%s)} 11 | 12 | echo "Starting FT cluster" 13 | echo "This will install xgboost_ray using the following package: ${XGBOOST_RAY_PACKAGE}" 14 | 15 | CMD="anyscale up --cloud-name anyscale_default_cloud --config cluster_ft.yaml ${SESSION_NAME}" 16 | 17 | echo "Running: ${CMD}" 18 | ${CMD} 19 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/start_gpu_cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ ! -f "./.anyscale.yaml" ]; then 4 | echo "Anyscale project not initialized. 
Please run 'anyscale init'" 5 | exit 1 6 | fi 7 | 8 | export XGBOOST_RAY_PACKAGE="${XGBOOST_RAY_PACKAGE:-xgboost_ray}" 9 | export NUM_WORKERS="${NUM_WORKERS:-3}" 10 | 11 | SESSION_NAME=${SESSION_NAME:-xgboost_ray_release_gpu_$(date +%s)} 12 | 13 | echo "Starting GPU cluster with ${NUM_WORKERS} worker nodes (plus the head node)" 14 | echo "This will install xgboost_ray using the following package: ${XGBOOST_RAY_PACKAGE}" 15 | 16 | CMD="anyscale up --cloud-name anyscale_default_cloud --config cluster_gpu.yaml ${SESSION_NAME}" 17 | 18 | echo "Running: ${CMD}" 19 | ${CMD} 20 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/submit_cpu_gpu_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ ! -f "./.anyscale.yaml" ]; then 4 | echo "Anyscale project not initialized. Please run 'anyscale init'" 5 | exit 1 6 | fi 7 | 8 | ANYSCALE_CMD="python ~/xgboost_tests/benchmark_cpu_gpu.py $*" 9 | 10 | SESSION_STR="" 11 | if [ -n "${SESSION_NAME}" ]; then 12 | SESSION_STR="--session-name ${SESSION_NAME}" 13 | fi 14 | 15 | TMUX="--tmux" 16 | if [ "${NO_TMUX}" = "1" ]; then 17 | TMUX="" 18 | fi 19 | 20 | CMD="anyscale exec ${TMUX} ${SESSION_STR} -- ${ANYSCALE_CMD}" 21 | 22 | echo "Running: ${CMD}" 23 | ${CMD} 24 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/submit_ft_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ ! -f "./.anyscale.yaml" ]; then 4 | echo "Anyscale project not initialized. Please run 'anyscale init'" 5 | exit 1 6 | fi 7 | 8 | ANYSCALE_CMD="python ~/xgboost_tests/benchmark_ft.py $*" 9 | 10 | SESSION_STR="" 11 | if [ -n "${SESSION_NAME}" ]; then 12 | SESSION_STR="--session-name ${SESSION_NAME}" 13 | fi 14 | 15 | TMUX="--tmux" 16 | if [ "${NO_TMUX}" = "1" ]; then 17 | TMUX="" 18 | fi 19 | 20 | CMD="anyscale exec ${TMUX} ${SESSION_STR} -- ${ANYSCALE_CMD}" 21 | 22 | echo "Running: ${CMD}" 23 | ${CMD} 24 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/tune_cluster.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: xgboost_ray_release_tests_tune 2 | min_workers: 4 3 | max_workers: 4 4 | initial_workers: 4 5 | autoscaling_mode: default 6 | docker: 7 | image: "rayproject/ray:latest" 8 | container_name: ray_container 9 | pull_before_run: false 10 | run_options: 11 | - --privileged 12 | target_utilization_fraction: 0.8 13 | idle_timeout_minutes: 5 14 | provider: 15 | type: aws 16 | region: us-west-2 17 | availability_zone: us-west-2a 18 | cache_stopped_nodes: true 19 | auth: 20 | ssh_user: ubuntu 21 | head_node: 22 | InstanceType: m5.xlarge 23 | ImageId: ami-05ac7a76b4c679a79 24 | worker_nodes: 25 | InstanceType: m5.xlarge 26 | ImageId: ami-05ac7a76b4c679a79 27 | InstanceMarketOptions: 28 | MarketType: spot 29 | 30 | file_mounts: { 31 | "/release_tests": "./" 32 | } 33 | cluster_synced_files: [] 34 | file_mounts_sync_continuously: true 35 | initialization_commands: [] 36 | setup_commands: 37 | - pip install -U ray 38 | - pip install -U git+https://github.com/ray-project/xgboost_ray#egg=xgboost-ray 39 | - pip install -U git+https://github.com/amogkam/xgboost_ray.git@colocation#egg=xgboost-ray 40 | - mkdir -p /data 41 | - rm -rf /data/tune_test.parquet || true 42 | - python /release_tests/create_test_data.py 
/data/tune_test.parquet --seed 1234 --num-rows 2000 --num-cols 4 --num-partitions 40 --num-classes 2 43 | head_setup_commands: [] 44 | worker_setup_commands: [] 45 | head_start_ray_commands: 46 | - ray stop 47 | - "ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"actor_cpus\": 0}'" 48 | worker_start_ray_commands: 49 | - ray stop 50 | - "ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"actor_cpus\": 4}'" 51 | metadata: 52 | anyscale: 53 | working_dir: "/release_tests" 54 | -------------------------------------------------------------------------------- /xgboost_ray/tests/release/tune_placement.py: -------------------------------------------------------------------------------- 1 | """ 2 | NOTE: This example is currently broken (very outdated) and not run in CI. 3 | 4 | Test Ray Tune trial placement across cluster nodes. 5 | 6 | Example: Run this script on a cluster with 4 worker nodes à 4 CPUs. 7 | 8 | ray up -y tune_cluster.yaml 9 | 10 | ray attach tune_cluster.yaml 11 | 12 | python /release_tests/tune_placement.py 4 4 10 10 --fake-data 13 | 14 | This starts 4 trials à 4 actors training 10 boost rounds on 10 data 15 | partitions per actor. This will use fake data created before training. 16 | 17 | This test will then confirm that actors of the same trial are PACKed 18 | on the same nodes. In practice we check that each node IP address only 19 | hosts actors of the same Ray Tune trial. 20 | """ 21 | 22 | import argparse 23 | import json 24 | import os 25 | import shutil 26 | import tempfile 27 | import time 28 | from collections import defaultdict 29 | 30 | import ray 31 | import ray.train 32 | from benchmark_cpu_gpu import train_ray 33 | from ray import tune 34 | from ray.tune.integration.docker import DockerSyncer 35 | from ray.tune.session import get_trial_id 36 | from ray.util import get_node_ip_address 37 | 38 | from xgboost_ray import RayParams 39 | from xgboost_ray.compat import TrainingCallback 40 | from xgboost_ray.session import put_queue 41 | from xgboost_ray.tests.utils import create_parquet 42 | 43 | if "OMP_NUM_THREADS" in os.environ: 44 | del os.environ["OMP_NUM_THREADS"] 45 | 46 | 47 | class PlacementCallback(TrainingCallback): 48 | """This callback collects the Ray Tune trial ID and node IP""" 49 | 50 | def before_training(self, model): 51 | ip_address = get_node_ip_address() 52 | put_queue(ip_address) 53 | return model 54 | 55 | def after_iteration(self, model, epoch, evals_log): 56 | if epoch == 1: 57 | time.sleep(2) 58 | elif epoch == 2: 59 | time.sleep(8) 60 | 61 | 62 | def tune_test( 63 | path, 64 | num_trials, 65 | num_workers, 66 | num_boost_rounds, 67 | num_files=0, 68 | regression=False, 69 | use_gpu=False, 70 | fake_data=False, 71 | smoke_test=False, 72 | ): 73 | ray_params = RayParams( 74 | elastic_training=False, 75 | max_actor_restarts=0, 76 | num_actors=num_workers, 77 | cpus_per_actor=1, 78 | gpus_per_actor=0 if not use_gpu else 1, 79 | ) 80 | 81 | def local_train(config): 82 | temp_dir = None 83 | if fake_data or smoke_test: 84 | temp_dir = "/tmp/release_test_data" 85 | if os.path.exists(temp_dir): 86 | shutil.rmtree(temp_dir) 87 | 88 | os.makedirs(temp_dir, 0o755) 89 | local_path = os.path.join(temp_dir, "smoketest.parquet") 90 | 91 | create_parquet( 92 | filename=local_path, 93 | num_rows=args.num_workers * 500, 94 | num_features=4, 95 | num_classes=2, 96 | num_partitions=args.num_workers * 10, 97 | ) 98 | else: 99 | if
not os.path.exists(path): 100 | raise ValueError( 101 | f"Benchmarking data not found: {path}." 102 | f"\nFIX THIS by running `python create_test_data.py` " 103 | f"on all nodes first." 104 | ) 105 | local_path = path 106 | 107 | xgboost_params = { 108 | "tree_method": "hist" if not use_gpu else "gpu_hist", 109 | } 110 | 111 | xgboost_params.update( 112 | { 113 | "objective": "binary:logistic", 114 | "eval_metric": ["logloss", "error"], 115 | } 116 | ) 117 | 118 | xgboost_params.update(config) 119 | 120 | additional_results = {} 121 | 122 | bst, time_taken = train_ray( 123 | path=local_path, 124 | num_workers=num_workers, 125 | num_boost_rounds=num_boost_rounds, 126 | num_files=num_files, 127 | regression=regression, 128 | use_gpu=use_gpu, 129 | smoke_test=smoke_test, 130 | ray_params=ray_params, 131 | xgboost_params=xgboost_params, 132 | # kwargs 133 | additional_results=additional_results, 134 | callbacks=[PlacementCallback()], 135 | ) 136 | 137 | bst.save_model("tuned.xgb") 138 | 139 | trial_ips = [] 140 | for rank, ips in enumerate(additional_results["callback_returns"]): 141 | for ip in ips: 142 | trial_ips.append(ip) 143 | 144 | tune_trial = get_trial_id() 145 | with tempfile.TemporaryDirectory() as temp_checkpoint_dir: 146 | with open( 147 | os.path.join(temp_checkpoint_dir, "callback_returns.json"), "wt" 148 | ) as f: 149 | json.dump({tune_trial: trial_ips}, f) 150 | ray.train.report( 151 | {}, checkpoint=ray.train.Checkpoint.from_directory(temp_checkpoint_dir) 152 | ) 153 | 154 | if temp_dir: 155 | shutil.rmtree(temp_dir) 156 | 157 | search_space = { 158 | "eta": tune.loguniform(1e-4, 1e-1), 159 | "subsample": tune.uniform(0.5, 1.0), 160 | "max_depth": tune.randint(1, 9), 161 | } 162 | 163 | analysis = tune.run( 164 | local_train, 165 | config=search_space, 166 | num_samples=num_trials, 167 | sync_config=tune.SyncConfig(sync_to_driver=DockerSyncer), 168 | resources_per_trial=ray_params.get_tune_resources(), 169 | ) 170 | 171 | # In our PACK scheduling, we expect that each IP hosts only workers 172 | # for one Ray Tune trial. 
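    # (Each key of `ip_to_trials` is a node IP; its value collects the trial
    # IDs of all actors that ran there. With PACK placement, every such list
    # should contain a single distinct trial ID, which the loop below checks.)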
173 | ip_to_trials = defaultdict(list) 174 | for trial in analysis.trials: 175 | trial = trial 176 | with open( 177 | os.path.join(trial.checkpoint.value, "callback_returns.json"), "rt" 178 | ) as f: 179 | trial_to_ips = json.load(f) 180 | for tune_trial, ips in trial_to_ips.items(): 181 | for node_ip in ips: 182 | ip_to_trials[node_ip].append(tune_trial) 183 | 184 | fail = False 185 | for ip, trial_ids in ip_to_trials.items(): 186 | print(f"For IP {ip} got trial IDs {trial_ids}") 187 | fail = fail or any(trial_id != trial_ids[0] for trial_id in trial_ids) 188 | 189 | if fail: 190 | raise ValueError("Different trial IDs found on same node.") 191 | else: 192 | print("Success.") 193 | 194 | 195 | if __name__ == "__main__": 196 | parser = argparse.ArgumentParser(description="Test Ray Tune placement " "strategy") 197 | 198 | parser.add_argument("num_trials", type=int, help="num trials") 199 | parser.add_argument("num_workers", type=int, help="num workers (per trial)") 200 | parser.add_argument("num_rounds", type=int, help="num boost rounds") 201 | parser.add_argument("num_files", type=int, help="num files (per trial)") 202 | 203 | parser.add_argument( 204 | "--file", default="/data/parted.parquet", type=str, help="data file" 205 | ) 206 | 207 | parser.add_argument( 208 | "--regression", action="store_true", default=False, help="regression" 209 | ) 210 | 211 | parser.add_argument("--gpu", action="store_true", default=False, help="gpu") 212 | 213 | parser.add_argument( 214 | "--fake-data", action="store_true", default=False, help="fake data" 215 | ) 216 | 217 | parser.add_argument( 218 | "--smoke-test", action="store_true", default=False, help="smoke test" 219 | ) 220 | 221 | args = parser.parse_args() 222 | 223 | num_trials = args.num_trials 224 | num_workers = args.num_workers 225 | num_boost_rounds = args.num_rounds 226 | num_files = args.num_files 227 | use_gpu = args.gpu 228 | 229 | if args.smoke_test: 230 | use_gpu = False 231 | 232 | init_start = time.time() 233 | if args.smoke_test: 234 | ray.init(num_cpus=num_workers) 235 | else: 236 | ray.init(address="auto") 237 | 238 | full_start = time.time() 239 | tune_test( 240 | path=args.file, 241 | num_trials=num_trials, 242 | num_workers=num_workers, 243 | num_boost_rounds=num_boost_rounds, 244 | num_files=num_files, 245 | regression=args.regression, 246 | use_gpu=use_gpu, 247 | fake_data=args.fake_data, 248 | smoke_test=args.smoke_test, 249 | ) 250 | full_taken = time.time() - full_start 251 | print(f"TOTAL TIME TAKEN: {full_taken:.2f} seconds ") 252 | -------------------------------------------------------------------------------- /xgboost_ray/tests/test_client.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | import ray 5 | from ray.util.client.ray_client_helpers import ray_start_client_server 6 | 7 | from xgboost_ray.data_sources.ray_dataset import RAY_DATASET_AVAILABLE 8 | 9 | 10 | @pytest.fixture 11 | def start_client_server_4_cpus(): 12 | ray.init(num_cpus=4) 13 | with ray_start_client_server() as client: 14 | yield client 15 | 16 | 17 | @pytest.fixture 18 | def start_client_server_5_cpus(): 19 | ray.init(num_cpus=5) 20 | with ray_start_client_server() as client: 21 | yield client 22 | 23 | 24 | @pytest.fixture 25 | def start_client_server_5_cpus_modin(monkeypatch): 26 | monkeypatch.setenv("__MODIN_AUTOIMPORT_PANDAS__", "1") 27 | ray.init(num_cpus=5, runtime_env={"env_vars": {"__MODIN_AUTOIMPORT_PANDAS__": "1"}}) 28 | with ray_start_client_server() as client: 29 
| yield client 30 | 31 | 32 | def test_simple_train(start_client_server_4_cpus): 33 | assert ray.util.client.ray.is_connected() 34 | from xgboost_ray.examples.simple import main 35 | 36 | main(num_actors=4, cpus_per_actor=1) 37 | 38 | 39 | @pytest.mark.skipif(os.environ.get("TUNE", "0") != "1", reason="Skipping Tune tests") 40 | def test_simple_tune(start_client_server_4_cpus): 41 | assert ray.util.client.ray.is_connected() 42 | from xgboost_ray.examples.simple_tune import main 43 | 44 | main(cpus_per_actor=1, num_actors=1, num_samples=4) 45 | 46 | 47 | def test_simple_dask(start_client_server_5_cpus): 48 | assert ray.util.client.ray.is_connected() 49 | from xgboost_ray.examples.simple_dask import main 50 | 51 | main(cpus_per_actor=1, num_actors=4) 52 | 53 | 54 | def test_simple_modin(start_client_server_5_cpus_modin): 55 | assert ray.util.client.ray.is_connected() 56 | from xgboost_ray.examples.simple_modin import main 57 | 58 | main(cpus_per_actor=1, num_actors=4) 59 | 60 | 61 | def test_client_actor_cpus(start_client_server_5_cpus): 62 | assert ray.util.client.ray.is_connected() 63 | from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy 64 | 65 | @ray.remote 66 | class DummyTrainActor: 67 | def test(self): 68 | import xgboost_ray 69 | 70 | return xgboost_ray.main._ray_get_actor_cpus() 71 | 72 | actor = DummyTrainActor.options(num_cpus=2).remote() 73 | assert ray.get(actor.test.remote()) == 2 74 | 75 | pg = ray.util.placement_group([{"CPU": 2}]) 76 | ray.get(pg.ready()) 77 | actor2 = DummyTrainActor.options( 78 | num_cpus=2, 79 | scheduling_strategy=PlacementGroupSchedulingStrategy(placement_group=pg), 80 | ).remote() 81 | assert ray.get(actor2.test.remote()) == 2 82 | 83 | 84 | @pytest.mark.skipif( 85 | not RAY_DATASET_AVAILABLE, 86 | reason="Ray datasets are not available in this version of Ray", 87 | ) 88 | def test_simple_ray_dataset(start_client_server_5_cpus): 89 | assert ray.util.client.ray.is_connected() 90 | from xgboost_ray.examples.simple_ray_dataset import main 91 | 92 | main(cpus_per_actor=1, num_actors=4) 93 | 94 | 95 | if __name__ == "__main__": 96 | import sys 97 | 98 | import pytest # noqa: F811 99 | 100 | sys.exit(pytest.main(["-v", __file__])) 101 | -------------------------------------------------------------------------------- /xgboost_ray/tests/test_colocation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import tempfile 4 | import unittest 5 | from unittest.mock import patch 6 | 7 | import numpy as np 8 | import pytest 9 | import ray 10 | from ray.util.queue import _QueueActor 11 | 12 | from xgboost_ray import RayDMatrix, RayParams, train 13 | from xgboost_ray.main import _train 14 | from xgboost_ray.util import _EventActor 15 | 16 | 17 | class _MockQueueActor(_QueueActor): 18 | def get_node_id(self): 19 | return ray.get_runtime_context().get_node_id() 20 | 21 | 22 | class _MockEventActor(_EventActor): 23 | def get_node_id(self): 24 | return ray.get_runtime_context().get_node_id() 25 | 26 | 27 | @pytest.mark.usefixtures("ray_start_cluster") 28 | class TestColocation(unittest.TestCase): 29 | def setUp(self) -> None: 30 | repeat = 8 # Repeat data a couple of times for stability 31 | self.x = np.array( 32 | [ 33 | [1, 0, 0, 0], # Feature 0 -> Label 0 34 | [0, 1, 0, 0], # Feature 1 -> Label 1 35 | [0, 0, 1, 1], # Feature 2+3 -> Label 0 36 | [0, 0, 1, 0], # Feature 2+!3 -> Label 1 37 | ] 38 | * repeat 39 | ) 40 | self.y = np.array([0, 1, 0, 1] * repeat) 41 | 42 | self.params
= { 43 | "booster": "gbtree", 44 | "tree_method": "hist", 45 | "nthread": 1, 46 | "max_depth": 2, 47 | "objective": "binary:logistic", 48 | "seed": 1000, 49 | } 50 | 51 | self.kwargs = {} 52 | 53 | self.tmpdir = str(tempfile.mkdtemp()) 54 | 55 | self.die_lock_file = "/tmp/died_worker.lock" 56 | if os.path.exists(self.die_lock_file): 57 | os.remove(self.die_lock_file) 58 | 59 | def tearDown(self) -> None: 60 | if os.path.exists(self.tmpdir): 61 | shutil.rmtree(self.tmpdir) 62 | ray.shutdown() 63 | 64 | @patch("ray.util.queue._QueueActor", _MockQueueActor) 65 | @patch("xgboost_ray.util._EventActor", _MockEventActor) 66 | def test_communication_colocation(self): 67 | """Checks that Queue and Event actors are colocated with the driver.""" 68 | os.environ["RXGB_COMMUNICATION_SOFT_PLACEMENT"] = "0" 69 | 70 | with self.ray_start_cluster() as cluster: 71 | cluster.add_node(num_cpus=3) 72 | cluster.add_node(num_cpus=3) 73 | cluster.wait_for_nodes() 74 | ray.init(address=cluster.address) 75 | 76 | local_node = ray.get_runtime_context().get_node_id() 77 | 78 | # Note that these will have the same IP in the test cluster 79 | assert len(ray.nodes()) == 2 80 | assert local_node in [node["NodeID"] for node in ray.nodes()] 81 | 82 | def _mock_train(*args, _training_state, **kwargs): 83 | assert ( 84 | ray.get(_training_state.queue.actor.get_node_id.remote()) 85 | == local_node 86 | ) 87 | assert ( 88 | ray.get(_training_state.stop_event.actor.get_node_id.remote()) 89 | == local_node 90 | ) 91 | return _train(*args, _training_state=_training_state, **kwargs) 92 | 93 | with patch("xgboost_ray.main._train") as mocked: 94 | mocked.side_effect = _mock_train 95 | train( 96 | self.params, 97 | RayDMatrix(self.x, self.y), 98 | num_boost_round=2, 99 | ray_params=RayParams(max_actor_restarts=1, num_actors=6), 100 | ) 101 | 102 | os.environ.pop("RXGB_COMMUNICATION_SOFT_PLACEMENT", None) 103 | 104 | def test_no_tune_spread(self): 105 | """Tests whether workers are spread when not using Tune.""" 106 | with self.ray_start_cluster() as cluster: 107 | cluster.add_node(num_cpus=2) 108 | cluster.add_node(num_cpus=2) 109 | cluster.wait_for_nodes() 110 | ray.init(address=cluster.address) 111 | 112 | ray_params = RayParams(max_actor_restarts=1, num_actors=2, cpus_per_actor=2) 113 | 114 | def _mock_train(*args, _training_state, **kwargs): 115 | try: 116 | results = _train(*args, _training_state=_training_state, **kwargs) 117 | return results 118 | except Exception: 119 | raise 120 | finally: 121 | assert len(_training_state.actors) == 2 122 | if not any(a is None for a in _training_state.actors): 123 | actor_infos = ray.state.actors() 124 | actor_nodes = [] 125 | for a in _training_state.actors: 126 | actor_info = actor_infos.get(a._actor_id.hex()) 127 | actor_node = actor_info["Address"]["NodeID"] 128 | actor_nodes.append(actor_node) 129 | assert actor_nodes[0] != actor_nodes[1] 130 | 131 | with patch("xgboost_ray.main._train", _mock_train): 132 | train( 133 | self.params, 134 | RayDMatrix(self.x, self.y), 135 | num_boost_round=4, 136 | ray_params=ray_params, 137 | ) 138 | 139 | def test_tune_pack(self): 140 | """Tests whether workers are packed when using Tune.""" 141 | try: 142 | from ray import tune 143 | except ImportError: 144 | self.skipTest("Tune is not installed.") 145 | return 146 | with self.ray_start_cluster() as cluster: 147 | num_actors = 2 148 | cluster.add_node(num_cpus=3) 149 | cluster.add_node(num_cpus=3) 150 | ray.init(address=cluster.address) 151 | 152 | ray_params = RayParams( 153 | max_actor_restarts=1, 
num_actors=num_actors, cpus_per_actor=1 154 | ) 155 | 156 | def _mock_train(*args, _training_state, **kwargs): 157 | try: 158 | results = _train(*args, _training_state=_training_state, **kwargs) 159 | return results 160 | except Exception: 161 | raise 162 | finally: 163 | assert len(_training_state.actors) == num_actors 164 | if not any(a is None for a in _training_state.actors): 165 | actor_infos = ray.state.actors() 166 | actor_nodes = [] 167 | for a in _training_state.actors: 168 | actor_info = actor_infos.get(a._actor_id.hex()) 169 | actor_node = actor_info["Address"]["NodeID"] 170 | actor_nodes.append(actor_node) 171 | assert actor_nodes[0] == actor_nodes[1] 172 | 173 | def train_func(params, x, y, ray_params): 174 | def inner_func(config): 175 | with patch("xgboost_ray.main._train", _mock_train): 176 | train( 177 | params, 178 | RayDMatrix(x, y), 179 | num_boost_round=4, 180 | ray_params=ray_params, 181 | ) 182 | 183 | return inner_func 184 | 185 | tune.run( 186 | train_func(self.params, self.x, self.y, ray_params), 187 | resources_per_trial=ray_params.get_tune_resources(), 188 | num_samples=1, 189 | ) 190 | 191 | def test_timeout(self): 192 | """Checks that an error occurs when placement group setup times out.""" 193 | os.environ["RXGB_PLACEMENT_GROUP_TIMEOUT_S"] = "5" 194 | 195 | with self.ray_start_cluster() as cluster: 196 | ray.init(address=cluster.address) 197 | 198 | with self.assertRaises(TimeoutError): 199 | train( 200 | self.params, 201 | RayDMatrix(self.x, self.y), 202 | num_boost_round=2, 203 | ray_params=RayParams( 204 | max_actor_restarts=1, 205 | num_actors=2, 206 | resources_per_actor={"invalid": 1}, 207 | ), 208 | ) 209 | 210 | 211 | if __name__ == "__main__": 212 | import sys 213 | 214 | import pytest # noqa: F811 215 | 216 | sys.exit(pytest.main(["-v", __file__])) 217 | -------------------------------------------------------------------------------- /xgboost_ray/tests/test_sklearn_matrix.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | import ray 5 | import xgboost as xgb 6 | from packaging.version import Version 7 | from sklearn.model_selection import train_test_split 8 | 9 | from xgboost_ray.main import XGBOOST_VERSION, RayDMatrix 10 | from xgboost_ray.sklearn import RayXGBClassifier, RayXGBRegressor 11 | 12 | has_label_encoder = XGBOOST_VERSION >= Version("1.0.0") and XGBOOST_VERSION < Version( 13 | "1.6.0" 14 | ) 15 | 16 | 17 | class XGBoostRaySklearnMatrixTest(unittest.TestCase): 18 | def setUp(self): 19 | self.seed = 1994 20 | self.rng = np.random.RandomState(self.seed) 21 | self.params = {"n_estimators": 10} 22 | 23 | def tearDown(self) -> None: 24 | if ray.is_initialized(): 25 | ray.shutdown() 26 | 27 | def _init_ray(self): 28 | if not ray.is_initialized(): 29 | ray.init(num_cpus=4) 30 | 31 | @unittest.skipIf( 32 | has_label_encoder, f"not supported in xgb version {xgb.__version__}" 33 | ) 34 | def testClassifierNoLabelEncoder(self, n_class=2): 35 | self._init_ray() 36 | 37 | from sklearn.datasets import load_digits 38 | 39 | digits = load_digits(n_class=n_class) 40 | y = digits["target"] 41 | X = digits["data"] 42 | 43 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5) 44 | 45 | train_matrix = RayDMatrix(X_train, y_train) 46 | test_matrix = RayDMatrix(X_test, y_test) 47 | 48 | with self.assertRaisesRegex(Exception, "num_class"): 49 | RayXGBClassifier(**self.params).fit(train_matrix, None) 50 | 51 | with self.assertRaisesRegex(Exception, r"must be 
\(RayDMatrix, str\)"): 52 | RayXGBClassifier(**self.params).fit( 53 | train_matrix, None, eval_set=[(X_test, y_test)] 54 | ) 55 | 56 | with self.assertRaisesRegex(Exception, r"must be \(array_like, array_like\)"): 57 | RayXGBClassifier(**self.params).fit( 58 | X_train, y_train, eval_set=[(test_matrix, "eval")] 59 | ) 60 | 61 | RayXGBClassifier(num_class=n_class, **self.params).fit(train_matrix, None) 62 | 63 | clf = RayXGBClassifier(num_class=n_class, **self.params).fit( 64 | train_matrix, None, eval_set=[(test_matrix, "eval")] 65 | ) 66 | 67 | clf.predict(test_matrix) 68 | clf.predict_proba(test_matrix) 69 | 70 | @unittest.skipIf( 71 | has_label_encoder, f"not supported in xgb version {xgb.__version__}" 72 | ) 73 | def testClassifierMulticlassNoLabelEncoder(self): 74 | self.testClassifierNoLabelEncoder(n_class=3) 75 | 76 | def testRegressor(self): 77 | self._init_ray() 78 | 79 | from sklearn.datasets import fetch_california_housing 80 | 81 | ds = fetch_california_housing() 82 | y = ds["target"] 83 | X = ds["data"] 84 | 85 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5) 86 | 87 | train_matrix = RayDMatrix(X_train, y_train) 88 | test_matrix = RayDMatrix(X_test, y_test) 89 | 90 | with self.assertRaisesRegex(Exception, r"must be \(RayDMatrix, str\)"): 91 | RayXGBRegressor(**self.params).fit( 92 | train_matrix, None, eval_set=[(X_test, y_test)] 93 | ) 94 | 95 | with self.assertRaisesRegex(Exception, r"must be \(array_like, array_like\)"): 96 | RayXGBRegressor(**self.params).fit( 97 | X_train, y_train, eval_set=[(test_matrix, "eval")] 98 | ) 99 | 100 | RayXGBRegressor(**self.params).fit(train_matrix, None) 101 | 102 | reg = RayXGBRegressor(**self.params).fit( 103 | train_matrix, None, eval_set=[(test_matrix, "eval")] 104 | ) 105 | 106 | reg.predict(test_matrix) 107 | -------------------------------------------------------------------------------- /xgboost_ray/tests/test_tune.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import tempfile 4 | import unittest 5 | from unittest.mock import MagicMock, patch 6 | 7 | import numpy as np 8 | import ray 9 | from ray import tune 10 | from ray.tune import TuneError 11 | from ray.tune.integration.xgboost import ( 12 | TuneReportCheckpointCallback as OrigTuneReportCheckpointCallback, 13 | ) 14 | 15 | from xgboost_ray import RayDMatrix, RayParams, train 16 | from xgboost_ray.tune import TuneReportCheckpointCallback, _try_add_tune_callback 17 | 18 | 19 | class XGBoostRayTuneTest(unittest.TestCase): 20 | def setUp(self): 21 | ray.init(num_cpus=4) 22 | repeat = 8 # Repeat data a couple of times for stability 23 | x = np.array( 24 | [ 25 | [1, 0, 0, 0], # Feature 0 -> Label 0 26 | [0, 1, 0, 0], # Feature 1 -> Label 1 27 | [0, 0, 1, 1], # Feature 2+3 -> Label 2 28 | [0, 0, 1, 0], # Feature 2+!3 -> Label 3 29 | ] 30 | * repeat 31 | ) 32 | y = np.array([0, 1, 2, 3] * repeat) 33 | 34 | self.params = { 35 | "xgb": { 36 | "booster": "gbtree", 37 | "nthread": 1, 38 | "max_depth": 2, 39 | "objective": "multi:softmax", 40 | "num_class": 4, 41 | "eval_metric": ["mlogloss", "merror"], 42 | }, 43 | "num_boost_round": tune.choice([1, 3]), 44 | } 45 | 46 | def train_func( 47 | ray_params, callbacks=None, check_for_spread_strategy=False, **kwargs 48 | ): 49 | def _inner_train(config): 50 | if check_for_spread_strategy: 51 | assert ( 52 | ray.train.get_context().get_trial_resources().strategy 53 | == "SPREAD" 54 | ) 55 | train_set = RayDMatrix(x, y) 56 | train( 57 | 
config["xgb"], 58 | dtrain=train_set, 59 | ray_params=ray_params, 60 | num_boost_round=config["num_boost_round"], 61 | evals=[(train_set, "train")], 62 | callbacks=callbacks, 63 | **kwargs 64 | ) 65 | 66 | return _inner_train 67 | 68 | self.train_func = train_func 69 | self.experiment_dir = tempfile.mkdtemp() 70 | 71 | def tearDown(self): 72 | ray.shutdown() 73 | shutil.rmtree(self.experiment_dir) 74 | 75 | # noinspection PyTypeChecker 76 | @patch.dict(os.environ, {"TUNE_RESULT_DELIM": "/"}) 77 | def testNumIters(self): 78 | """Test that the number of reported tune results is correct""" 79 | ray_params = RayParams(cpus_per_actor=1, num_actors=2) 80 | params = self.params.copy() 81 | params["num_boost_round"] = tune.grid_search([1, 3]) 82 | 83 | # TODO(justinvyu): Remove this once the xgboost integration 84 | # has been updated on the Ray side. 85 | try: 86 | callback = TuneReportCheckpointCallback( 87 | frequency=1, checkpoint_at_end=False 88 | ) 89 | except TypeError: 90 | callback = TuneReportCheckpointCallback(frequency=1) 91 | 92 | analysis = tune.run( 93 | self.train_func(ray_params, callbacks=[callback]), 94 | config=params, 95 | resources_per_trial=ray_params.get_tune_resources(), 96 | num_samples=1, 97 | ) 98 | 99 | self.assertSequenceEqual( 100 | list(analysis.results_df["training_iteration"]), 101 | list(analysis.results_df["config/num_boost_round"]), 102 | ) 103 | 104 | def testNumItersClient(self): 105 | """Test Ray client mode.""" 106 | from packaging.version import Version 107 | if Version(ray.__version__) <= Version("1.2.0"): 108 | self.skipTest("Ray client mocks do not work in Ray <= 1.2.0") 109 | from ray.util.client.ray_client_helpers import ray_start_client_server 110 | 111 | self.assertFalse(ray.util.client.ray.is_connected()) 112 | with ray_start_client_server(): 113 | self.assertTrue(ray.util.client.ray.is_connected()) 114 | self.testNumIters() 115 | 116 | def testPlacementOptions(self): 117 | ray_params = RayParams( 118 | cpus_per_actor=1, num_actors=1, placement_options={"strategy": "SPREAD"} 119 | ) 120 | tune.run( 121 | self.train_func(ray_params, check_for_spread_strategy=True), 122 | config=self.params, 123 | resources_per_trial=ray_params.get_tune_resources(), 124 | num_samples=1, 125 | ) 126 | 127 | def testElasticFails(self): 128 | """Test if an error is thrown when using Tune with elastic training.""" 129 | ray_params = RayParams(cpus_per_actor=1, num_actors=1, elastic_training=True) 130 | with self.assertRaises(TuneError): 131 | tune.run( 132 | self.train_func(ray_params), 133 | config=self.params, 134 | resources_per_trial=ray_params.get_tune_resources(), 135 | num_samples=1, 136 | ) 137 | 138 | def testReplaceTuneCheckpoints(self): 139 | """Test if ray.tune.integration.xgboost callbacks are replaced""" 140 | # Report and checkpointing callback 141 | in_cp = [OrigTuneReportCheckpointCallback(metrics="met")] 142 | in_dict = {"callbacks": in_cp} 143 | 144 | with patch("ray.train.get_context") as mocked: 145 | mocked.return_value = MagicMock(return_value=True) 146 | _try_add_tune_callback(in_dict) 147 | 148 | replaced = in_dict["callbacks"][0] 149 | self.assertTrue(isinstance(replaced, TuneReportCheckpointCallback)) 150 | 151 | self.assertSequenceEqual(replaced._metrics, ["met"]) 152 | 153 | def testEndToEndCheckpointing(self): 154 | ray_params = RayParams(cpus_per_actor=1, num_actors=2) 155 | analysis = tune.run( 156 | self.train_func( 157 | ray_params, callbacks=[TuneReportCheckpointCallback(frequency=1)] 158 | ), 159 | config=self.params, 160 | resources_per_trial=ray_params.get_tune_resources(), 161 | 
num_samples=1, 162 | metric="train-mlogloss", 163 | mode="min", 164 | log_to_file=True, 165 | local_dir=self.experiment_dir, 166 | ) 167 | 168 | self.assertTrue(os.path.exists(analysis.best_checkpoint.path)) 169 | 170 | def testEndToEndCheckpointingOrigTune(self): 171 | ray_params = RayParams(cpus_per_actor=1, num_actors=2) 172 | analysis = tune.run( 173 | self.train_func( 174 | ray_params, callbacks=[OrigTuneReportCheckpointCallback(frequency=1)] 175 | ), 176 | config=self.params, 177 | resources_per_trial=ray_params.get_tune_resources(), 178 | num_samples=1, 179 | metric="train-mlogloss", 180 | mode="min", 181 | local_dir=self.experiment_dir, 182 | ) 183 | 184 | self.assertTrue(os.path.exists(analysis.best_checkpoint.path)) 185 | 186 | 187 | if __name__ == "__main__": 188 | import sys 189 | 190 | import pytest 191 | 192 | sys.exit(pytest.main(["-v", __file__])) 193 | -------------------------------------------------------------------------------- /xgboost_ray/tests/test_xgboost_api.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from typing import Tuple 3 | 4 | import numpy as np 5 | import ray 6 | import xgboost as xgb 7 | 8 | from xgboost_ray import RayDMatrix, RayParams, train 9 | from xgboost_ray.compat import TrainingCallback 10 | 11 | # From XGBoost documentation: 12 | # https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html 13 | from xgboost_ray.session import get_actor_rank, put_queue 14 | 15 | 16 | def gradient(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray: 17 | y = dtrain.get_label() 18 | return (np.log1p(predt) - np.log1p(y)) / (predt + 1) 19 | 20 | 21 | def hessian(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray: 22 | y = dtrain.get_label() 23 | return (-np.log1p(predt) + np.log1p(y) + 1) / np.power(predt + 1, 2) 24 | 25 | 26 | def squared_log( 27 | predt: np.ndarray, dtrain: xgb.DMatrix 28 | ) -> Tuple[np.ndarray, np.ndarray]: 29 | predt[predt < -1] = -1 + 1e-6 30 | grad = gradient(predt, dtrain) 31 | hess = hessian(predt, dtrain) 32 | return grad, hess 33 | 34 | 35 | def rmsle(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]: 36 | y = dtrain.get_label() 37 | predt[predt < -1] = -1 + 1e-6 38 | elements = np.power(np.log1p(y) - np.log1p(predt), 2) 39 | return "PyRMSLE", float(np.sqrt(np.sum(elements) / len(y))) 40 | 41 | 42 | class XGBoostAPITest(unittest.TestCase): 43 | """This test suite validates core XGBoost API functionality.""" 44 | 45 | def setUp(self): 46 | repeat = 8 # Repeat data a couple of times for stability 47 | self.x = np.array( 48 | [ 49 | [1, 0, 0, 0], # Feature 0 -> Label 0 50 | [0, 1, 0, 0], # Feature 1 -> Label 1 51 | [0, 0, 1, 1], # Feature 2+3 -> Label 0 52 | [0, 0, 1, 0], # Feature 2+!3 -> Label 1 53 | ] 54 | * repeat 55 | ) 56 | self.y = np.array([0, 1, 0, 1] * repeat) 57 | 58 | self.params = { 59 | "booster": "gbtree", 60 | "tree_method": "hist", 61 | "nthread": 1, 62 | "max_depth": 2, 63 | "objective": "binary:logistic", 64 | "seed": 1000, 65 | } 66 | 67 | self.kwargs = {} 68 | 69 | def tearDown(self) -> None: 70 | if ray.is_initialized(): 71 | ray.shutdown() 72 | 73 | def _init_ray(self): 74 | if not ray.is_initialized(): 75 | ray.init(num_cpus=4) 76 | 77 | def testCustomObjectiveFunction(self): 78 | """Ensure that custom objective functions work. 
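The custom objective here is squared_log (defined at the top of this file, following the XGBoost custom-objective tutorial linked above).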
79 | 80 | Runs a custom objective function with pure XGBoost and 81 | XGBoost on Ray and compares the prediction outputs.""" 82 | self._init_ray() 83 | 84 | params = self.params.copy() 85 | params.pop("objective", None) 86 | 87 | bst_xgb = xgb.train(params, xgb.DMatrix(self.x, self.y), obj=squared_log) 88 | 89 | bst_ray = train( 90 | params, 91 | RayDMatrix(self.x, self.y), 92 | ray_params=RayParams(num_actors=2), 93 | obj=squared_log, 94 | **self.kwargs, 95 | ) 96 | 97 | x_mat = xgb.DMatrix(self.x) 98 | pred_y_xgb = np.round(bst_xgb.predict(x_mat)) 99 | pred_y_ray = np.round(bst_ray.predict(x_mat)) 100 | 101 | self.assertSequenceEqual(list(pred_y_xgb), list(pred_y_ray)) 102 | self.assertSequenceEqual(list(self.y), list(pred_y_ray)) 103 | 104 | def testCustomMetricFunction(self): 105 | """Ensure that custom metric functions work. 106 | 107 | Runs a custom metric function with pure XGBoost and 108 | XGBoost on Ray and compares predictions and evaluation results.""" 109 | self._init_ray() 110 | 111 | params = self.params.copy() 112 | params.pop("objective", None) 113 | params["disable_default_eval_metric"] = 1 114 | 115 | dtrain_xgb = xgb.DMatrix(self.x, self.y) 116 | evals_result_xgb = {} 117 | bst_xgb = xgb.train( 118 | params, 119 | dtrain_xgb, 120 | obj=squared_log, 121 | feval=rmsle, 122 | evals=[(dtrain_xgb, "dtrain")], 123 | evals_result=evals_result_xgb, 124 | ) 125 | 126 | dtrain_ray = RayDMatrix(self.x, self.y) 127 | evals_result_ray = {} 128 | bst_ray = train( 129 | params, 130 | dtrain_ray, 131 | ray_params=RayParams(num_actors=2), 132 | obj=squared_log, 133 | feval=rmsle, 134 | evals=[(dtrain_ray, "dtrain")], 135 | evals_result=evals_result_ray, 136 | **self.kwargs, 137 | ) 138 | 139 | x_mat = xgb.DMatrix(self.x) 140 | pred_y_xgb = np.round(bst_xgb.predict(x_mat)) 141 | pred_y_ray = np.round(bst_ray.predict(x_mat)) 142 | 143 | self.assertSequenceEqual(list(pred_y_xgb), list(pred_y_ray)) 144 | self.assertSequenceEqual(list(self.y), list(pred_y_ray)) 145 | 146 | self.assertTrue( 147 | np.allclose( 148 | evals_result_xgb["dtrain"]["PyRMSLE"], 149 | evals_result_ray["dtrain"]["PyRMSLE"], 150 | atol=0.1, 151 | ) 152 | ) 153 | 154 | def testCallbacks(self): 155 | class _Callback(TrainingCallback): 156 | def after_iteration(self, model, epoch, evals_log): 157 | print(f"My rank: {get_actor_rank()}") 158 | put_queue(("rank", get_actor_rank())) 159 | 160 | callback = _Callback() 161 | 162 | additional_results = {} 163 | train( 164 | self.params, 165 | RayDMatrix(self.x, self.y), 166 | ray_params=RayParams(num_actors=2), 167 | callbacks=[callback], 168 | additional_results=additional_results, 169 | **self.kwargs, 170 | ) 171 | 172 | self.assertEqual(len(additional_results["callback_returns"]), 2) 173 | self.assertTrue( 174 | all(rank == 0 for (_, rank) in additional_results["callback_returns"][0]) 175 | ) 176 | self.assertTrue( 177 | all(rank == 1 for (_, rank) in additional_results["callback_returns"][1]) 178 | ) 179 | 180 | 181 | if __name__ == "__main__": 182 | import sys 183 | 184 | import pytest 185 | 186 | sys.exit(pytest.main(["-v", __file__])) 187 | -------------------------------------------------------------------------------- /xgboost_ray/tests/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import tempfile 4 | import time 5 | from typing import Dict, List, Optional, Tuple, Union 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import xgboost as xgb 10 | 11 | from xgboost_ray.compat import 
TrainingCallback 12 | from xgboost_ray.session import get_actor_rank, put_queue 13 | 14 | 15 | def get_num_trees(bst: xgb.Booster): 16 | import json 17 | 18 | data = [json.loads(d) for d in bst.get_dump(dump_format="json")] 19 | return len(data) // 4 20 | 21 | 22 | def create_data(num_rows: int, num_cols: int, dtype: np.dtype = np.float32): 23 | 24 | return pd.DataFrame( 25 | np.random.uniform(0.0, 10.0, size=(num_rows, num_cols)), 26 | columns=[f"feature_{i}" for i in range(num_cols)], 27 | dtype=dtype, 28 | ) 29 | 30 | 31 | def create_labels( 32 | num_rows: int, num_classes: int = 2, dtype: Optional[np.dtype] = None 33 | ): 34 | if num_classes == 0: 35 | # Create regression label 36 | dtype = dtype or np.float32 37 | return pd.Series( 38 | np.random.uniform(0, 1, size=num_rows), dtype=dtype, name="label" 39 | ) 40 | 41 | dtype = dtype or np.int32 42 | return pd.Series( 43 | np.random.randint(0, num_classes, size=num_rows), dtype=dtype, name="label" 44 | ) 45 | 46 | 47 | def create_parquet( 48 | filename: str, 49 | num_rows: int, 50 | num_features: int, 51 | num_classes: int = 2, 52 | num_partitions: int = 1, 53 | ): 54 | 55 | partition_rows = num_rows // num_partitions 56 | for partition in range(num_partitions): 57 | print(f"Creating partition {partition}") 58 | data = create_data(partition_rows, num_features) 59 | labels = create_labels(partition_rows, num_classes) 60 | partition = pd.Series(np.full(partition_rows, partition), dtype=np.int32) 61 | 62 | data["labels"] = labels 63 | data["partition"] = partition 64 | 65 | os.makedirs(filename, 0o755, exist_ok=True) 66 | data.to_parquet( 67 | filename, 68 | partition_cols=["partition"], 69 | engine="pyarrow", 70 | partition_filename_cb=lambda key: f"part_{key[0]}.parquet", 71 | ) 72 | 73 | 74 | def create_parquet_in_tempdir( 75 | filename: str, 76 | num_rows: int, 77 | num_features: int, 78 | num_classes: int = 2, 79 | num_partitions: int = 1, 80 | ) -> Tuple[str, str]: 81 | temp_dir = tempfile.mkdtemp() 82 | path = os.path.join(temp_dir, filename) 83 | create_parquet( 84 | path, 85 | num_rows=num_rows, 86 | num_features=num_features, 87 | num_classes=num_classes, 88 | num_partitions=num_partitions, 89 | ) 90 | return temp_dir, path 91 | 92 | 93 | def flatten_obj(obj: Union[List, Dict], keys=None, base=None): 94 | keys = keys or [] 95 | base = base if base is not None else {} # Keep same object if empty dict 96 | if isinstance(obj, list): 97 | for i, o in enumerate(obj): 98 | flatten_obj(o, keys + [str(i)], base) 99 | elif isinstance(obj, dict): 100 | for k, o in obj.items(): 101 | flatten_obj(o, keys + [str(k)], base) 102 | else: 103 | base["/".join(keys)] = obj 104 | return base 105 | 106 | 107 | def tree_obj(bst: xgb.Booster): 108 | return [json.loads(j) for j in bst.get_dump(dump_format="json")] 109 | 110 | 111 | def _kill_callback(die_lock_file: str, actor_rank: int = 0, fail_iteration: int = 6): 112 | """Returns a callback to kill an actor process. 113 | 114 | Args: 115 | die_lock_file: A file lock used to prevent race conditions 116 | when killing the actor. 117 | actor_rank: The rank of the actor to kill. 118 | fail_iteration: The iteration after which the actor is killed. 
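Example (illustrative; the lock-file path is arbitrary): passing _kill_callback("/tmp/xgb_die.lock", actor_rank=0, fail_iteration=6) to train() SIGKILLs the rank-0 actor after iteration 6; the lock file ensures the kill happens at most once across restarts.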
119 | 120 | """ 121 | 122 | class _KillCallback(TrainingCallback): 123 | def after_iteration(self, model, epoch, evals_log): 124 | if get_actor_rank() == actor_rank: 125 | put_queue((epoch, time.time())) 126 | if ( 127 | get_actor_rank() == actor_rank 128 | and epoch == fail_iteration 129 | and not os.path.exists(die_lock_file) 130 | ): 131 | 132 | # Get PID 133 | pid = os.getpid() 134 | print(f"Killing process: {pid}") 135 | with open(die_lock_file, "wt") as fp: 136 | fp.write("") 137 | 138 | time.sleep(2) 139 | print(f"Testing: Rank {get_actor_rank()} will now die.") 140 | os.kill(pid, 9) 141 | 142 | return _KillCallback() 143 | 144 | 145 | def _fail_callback(die_lock_file: str, actor_rank: int = 0, fail_iteration: int = 6): 146 | """Returns a callback to cause an Xgboost actor to fail training. 147 | 148 | Args: 149 | die_lock_file: A file lock used to prevent race conditions 150 | when causing the actor to fail. 151 | actor_rank: The rank of the actor to fail. 152 | fail_iteration: The iteration after which the training for 153 | the specified actor fails. 154 | 155 | """ 156 | 157 | class _FailCallback(TrainingCallback): 158 | def after_iteration(self, model, epoch, evals_log): 159 | 160 | if get_actor_rank() == actor_rank: 161 | put_queue((epoch, time.time())) 162 | if ( 163 | get_actor_rank() == actor_rank 164 | and epoch == fail_iteration 165 | and not os.path.exists(die_lock_file) 166 | ): 167 | 168 | with open(die_lock_file, "wt") as fp: 169 | fp.write("") 170 | time.sleep(2) 171 | import sys 172 | 173 | print(f"Testing: Rank {get_actor_rank()} will now fail.") 174 | sys.exit(1) 175 | 176 | return _FailCallback() 177 | 178 | 179 | def _checkpoint_callback(frequency: int = 1, before_iteration_=False): 180 | """Returns a callback to checkpoint a model. 181 | 182 | Args: 183 | frequency: The interval at which checkpointing occurs. If 184 | frequency is set to n, checkpointing occurs every n epochs. 185 | before_iteration_: If True, checkpoint before the iteration 186 | begins. Else, checkpoint after the iteration ends. 187 | 188 | """ 189 | 190 | class _CheckpointCallback(TrainingCallback): 191 | def after_iteration(self, model, epoch, evals_log): 192 | if epoch % frequency == 0: 193 | put_queue(model.save_raw()) 194 | 195 | if before_iteration_: 196 | 197 | def _before_iteration(self, model, epoch, evals_log): 198 | self.after_iteration(model, epoch, evals_log) 199 | 200 | _CheckpointCallback.before_iteration = _before_iteration 201 | 202 | return _CheckpointCallback() 203 | 204 | 205 | def _sleep_callback(sleep_iteration: int = 6, sleep_seconds: int = 5): 206 | """Returns a callback to sleep after an iteration. 207 | 208 | This artificially inflates training time. 209 | 210 | Args: 211 | sleep_iteration: The iteration after which the actor should 212 | sleep. 213 | sleep_seconds: Time in seconds the actor should sleep. 214 | 215 | """ 216 | 217 | class _SleepCallback(TrainingCallback): 218 | def after_iteration(self, model, epoch, evals_log): 219 | if epoch == sleep_iteration: 220 | print( 221 | f"Testing: Rank {get_actor_rank()} will now sleep " 222 | f"for {sleep_seconds} seconds." 
223 | ) 224 | time.sleep(sleep_seconds) 225 | 226 | return _SleepCallback() 227 | -------------------------------------------------------------------------------- /xgboost_ray/tune.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Dict, Optional 3 | 4 | import ray 5 | from ray.util.annotations import PublicAPI 6 | 7 | from xgboost_ray.session import get_rabit_rank, put_queue 8 | from xgboost_ray.util import force_on_current_node 9 | from xgboost_ray.xgb import xgboost as xgb 10 | 11 | try: 12 | from ray import train, tune # noqa: F401 13 | except (ImportError, ModuleNotFoundError) as e: 14 | raise RuntimeError( 15 | "Ray Train and Ray Tune are required dependencies of xgboost_ray. " 16 | 'Please install with: `pip install "ray[train]"`' 17 | ) from e 18 | 19 | import ray.train 20 | from ray.tune.integration.xgboost import TuneReportCallback as OrigTuneReportCallback 21 | from ray.tune.integration.xgboost import ( 22 | TuneReportCheckpointCallback as OrigTuneReportCheckpointCallback, 23 | ) 24 | 25 | 26 | class TuneReportCheckpointCallback(OrigTuneReportCheckpointCallback): 27 | def after_iteration(self, model, epoch: int, evals_log: Dict): 28 | # NOTE: We need to update `evals_log` here (even though the super method 29 | # already does it) because the actual callback method gets run 30 | # in a different process, so *this* instance of the callback will not have 31 | # access to the `evals_log` dict in `after_training`. 32 | self._evals_log = evals_log 33 | 34 | if get_rabit_rank() == 0: 35 | put_queue( 36 | lambda: super(TuneReportCheckpointCallback, self).after_iteration( 37 | model=model, epoch=epoch, evals_log=evals_log 38 | ) 39 | ) 40 | 41 | def after_training(self, model): 42 | if get_rabit_rank() == 0: 43 | put_queue( 44 | lambda: super(TuneReportCheckpointCallback, self).after_training( 45 | model=model 46 | ) 47 | ) 48 | return model 49 | 50 | 51 | class TuneReportCallback(OrigTuneReportCallback): 52 | def __new__(cls: type, *args, **kwargs): 53 | # TODO(justinvyu): [code_removal] Remove in Ray 2.11. 54 | raise DeprecationWarning( 55 | "`TuneReportCallback` is deprecated. " 56 | "Use `xgboost_ray.tune.TuneReportCheckpointCallback` instead." 57 | ) 58 | 59 | 60 | def _try_add_tune_callback(kwargs: Dict): 61 | ray_train_context_initialized = ( 62 | ray.train.get_context().get_trial_resources() is not None 63 | ) 64 | if ray_train_context_initialized: 65 | callbacks = kwargs.get("callbacks", []) or [] 66 | new_callbacks = [] 67 | has_tune_callback = False 68 | 69 | REPLACE_MSG = ( 70 | "Replaced `{orig}` with `{target}`. If you want to " 71 | "avoid this warning, pass `{target}` as a callback " 72 | "directly in your calls to `xgboost_ray.train()`." 73 | ) 74 | 75 | for cb in callbacks: 76 | if isinstance(cb, TuneReportCheckpointCallback): 77 | has_tune_callback = True 78 | new_callbacks.append(cb) 79 | elif isinstance(cb, OrigTuneReportCheckpointCallback): 80 | orig_metrics = cb._metrics 81 | orig_frequency = cb._frequency 82 | 83 | replace_cb = TuneReportCheckpointCallback( 84 | metrics=orig_metrics, frequency=orig_frequency 85 | ) 86 | new_callbacks.append(replace_cb) 87 | logging.warning( 88 | REPLACE_MSG.format( 89 | orig="ray.tune.integration.xgboost." 
90 | "TuneReportCheckpointCallback", 91 | target="xgboost_ray.tune.TuneReportCheckpointCallback", 92 | ) 93 | ) 94 | has_tune_callback = True 95 | else: 96 | new_callbacks.append(cb) 97 | 98 | if not has_tune_callback: 99 | new_callbacks.append(TuneReportCheckpointCallback(frequency=0)) 100 | 101 | kwargs["callbacks"] = new_callbacks 102 | return True 103 | else: 104 | return False 105 | 106 | 107 | def _get_tune_resources( 108 | num_actors: int, 109 | cpus_per_actor: int, 110 | gpus_per_actor: int, 111 | resources_per_actor: Optional[Dict], 112 | placement_options: Optional[Dict], 113 | ): 114 | """Returns object to use for ``resources_per_trial`` with Ray Tune.""" 115 | from ray.tune import PlacementGroupFactory 116 | 117 | head_bundle = {} 118 | child_bundle = {"CPU": cpus_per_actor, "GPU": gpus_per_actor} 119 | child_bundle_extra = {} if resources_per_actor is None else resources_per_actor 120 | child_bundles = [{**child_bundle, **child_bundle_extra} for _ in range(num_actors)] 121 | bundles = [head_bundle] + child_bundles 122 | placement_options = placement_options or {} 123 | placement_options.setdefault("strategy", "PACK") 124 | placement_group_factory = PlacementGroupFactory(bundles, **placement_options) 125 | 126 | return placement_group_factory 127 | 128 | 129 | @PublicAPI(stability="beta") 130 | def load_model(model_path): 131 | """Loads the model stored in the provided model_path. 132 | 133 | If using Ray Client, this will automatically handle loading the path on 134 | the server by using a Ray task. 135 | 136 | Returns: 137 | xgb.Booster object of the model stored in the provided model_path 138 | 139 | """ 140 | 141 | def load_model_fn(model_path): 142 | best_bst = xgb.Booster() 143 | best_bst.load_model(model_path) 144 | return best_bst 145 | 146 | # Load the model checkpoint. 147 | if ray.util.client.ray.is_connected(): 148 | # If using Ray Client, the best model is saved on the server. 149 | # So we have to wrap the model loading in a ray task. 
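# force_on_current_node (defined in xgboost_ray/util.py below) pins this task to the client server's node by requesting a sliver of that node's "node:<ip>" custom resource, since the model path is only valid on that machine.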
remote_load = ray.remote(load_model_fn) 151 | remote_load = force_on_current_node(remote_load) 152 | bst = ray.get(remote_load.remote(model_path)) 153 | else: 154 | bst = load_model_fn(model_path) 155 | 156 | return bst 157 | -------------------------------------------------------------------------------- /xgboost_ray/util.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import Dict, List, Optional 3 | 4 | import ray 5 | from ray.util.annotations import DeveloperAPI 6 | 7 | 8 | @DeveloperAPI 9 | class Unavailable: 10 | """No object should be an instance of this class.""" 11 | 12 | def __init__(self): 13 | raise RuntimeError("This class should never be instantiated.") 14 | 15 | 16 | class _EventActor: 17 | def __init__(self): 18 | self._event = asyncio.Event() 19 | 20 | def set(self): 21 | self._event.set() 22 | 23 | def clear(self): 24 | self._event.clear() 25 | 26 | def is_set(self): 27 | return self._event.is_set() 28 | 29 | 30 | @DeveloperAPI 31 | class Event: 32 | def __init__(self, actor_options: Optional[Dict] = None): 33 | actor_options = {} if not actor_options else actor_options 34 | self.actor = ray.remote(_EventActor).options(**actor_options).remote() 35 | 36 | def set(self): 37 | self.actor.set.remote() 38 | 39 | def clear(self): 40 | self.actor.clear.remote() 41 | 42 | def is_set(self): 43 | return ray.get(self.actor.is_set.remote()) 44 | 45 | def shutdown(self): 46 | if self.actor: 47 | ray.kill(self.actor) 48 | self.actor = None 49 | 50 | 51 | @DeveloperAPI 52 | class MultiActorTask: 53 | """Utility class to hold multiple futures. 54 | 55 | The `is_ready()` method will return True once all futures are ready. 56 | 57 | Args: 58 | pending_futures: List of object references (futures) 59 | that should be tracked. 60 | """ 61 | 62 | def __init__(self, pending_futures: Optional[List[ray.ObjectRef]] = None): 63 | self._pending_futures = pending_futures or [] 64 | self._ready_futures = [] 65 | 66 | def is_ready(self): 67 | if not self._pending_futures: 68 | return True 69 | 70 | ready = True 71 | while ready: 72 | ready, not_ready = ray.wait(self._pending_futures, timeout=0) 73 | if ready: 74 | for obj in ready: 75 | self._pending_futures.remove(obj) 76 | self._ready_futures.append(obj) 77 | 78 | return not bool(self._pending_futures) 79 | 80 | 81 | @DeveloperAPI 82 | def get_current_node_resource_key() -> str: 83 | """Get the Ray resource key for the current node. 84 | It can be used for actor placement. 85 | If using Ray Client, this will return the resource key for the node that 86 | is running the client server. 87 | """ 88 | current_node_id = ray.get_runtime_context().get_node_id() 89 | for node in ray.nodes(): 90 | if node["NodeID"] == current_node_id: 91 | # Found the node. 92 | for key in node["Resources"].keys(): 93 | if key.startswith("node:"): 94 | return key 95 | else: 96 | raise ValueError("Cannot find the node dictionary for the current node.") 97 | 98 | 99 | @DeveloperAPI 100 | def force_on_current_node(task_or_actor): 101 | """Given a task or actor, place it on the current node. 102 | 103 | If the task or actor that is passed in already has custom resource 104 | requirements, then they will be overridden. 105 | 106 | If using Ray Client, the current node is the client server node. 
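Example (illustrative sketch, mirroring `load_model` in xgboost_ray/tune.py; `fn` stands in for any plain function):

    remote_fn = force_on_current_node(ray.remote(fn))
    result = ray.get(remote_fn.remote())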
107 | """ 108 | node_resource_key = get_current_node_resource_key() 109 | options = {"resources": {node_resource_key: 0.01}} 110 | return task_or_actor.options(**options) 111 | -------------------------------------------------------------------------------- /xgboost_ray/xgb.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | if TYPE_CHECKING: 4 | import xgboost 5 | else: 6 | try: 7 | import xgboost 8 | except ImportError: 9 | xgboost = None 10 | 11 | __all__ = ["xgboost"] 12 | --------------------------------------------------------------------------------