├── .dockerignore ├── .github ├── dependabot.yml ├── pull_request_template.md └── workflows │ └── main.yml ├── .gitignore ├── .hadolint.yaml ├── .pre-commit-config.yaml ├── .yamllint.yaml ├── Dockerfile-cluster ├── Dockerfile-cluster-base ├── Dockerfile-notebook ├── Dockerfile-notebook-base ├── Dockerfile-profiling ├── LICENSE ├── Makefile ├── README.md ├── bin ├── install-cmake ├── profile-example-memory-usage.sh └── profile-examples.sh ├── jupyter_notebook_config.py ├── notebooks ├── README.md ├── _img │ ├── aws.svg │ ├── dask-horizontal.svg │ └── lightgbm.svg ├── demo-aws.ipynb ├── demo.ipynb └── testing │ ├── ranker-local.ipynb │ └── sparse-inputs.ipynb └── pyproject.toml /.dockerignore: -------------------------------------------------------------------------------- 1 | * 2 | !bin/install-cmake 3 | !bin/profile-examples.sh 4 | !bin/profile-example-memory-usage.sh 5 | !jupyter_notebook_config.py 6 | !LightGBM/build-python.sh 7 | !LightGBM/cmake 8 | !LightGBM/CMakeLists.txt 9 | !LightGBM/external_libs 10 | !LightGBM/include 11 | !LightGBM/lib_lightgbm.so 12 | !LightGBM/LICENSE 13 | !LightGBM/python-package 14 | !LightGBM/src 15 | !LightGBM/swig 16 | !LightGBM/VERSION.txt 17 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: 2 3 | updates: 4 | - package-ecosystem: github-actions 5 | directory: / 6 | schedule: 7 | interval: monthly 8 | # group updates in a single PR 9 | groups: 10 | ci-dependencies: 11 | patterns: 12 | - "*" 13 | commit-message: 14 | prefix: "[ci]" 15 | labels: 16 | - maintenance 17 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | ## Benefits of this work 4 | 5 | ## Notes for Reviewers 6 | 7 | ### How I tested this 8 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Continuous Integration 2 | 3 | # always run CI on new commits to any branch 4 | on: push 5 | 6 | jobs: 7 | lint: 8 | name: lint 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 12 | - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 13 | build: 14 | name: build 15 | needs: [lint] 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 20 | - name: Set up Docker Buildx 21 | uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.1.0 22 | with: 23 | # 'driver: docker' allows one build to reuse images from a prior build 24 | # ref: https://github.com/docker/setup-buildx-action/issues/251 25 | driver: docker 26 | install: true 27 | use: true 28 | - name: Build notebook image 29 | run: | 30 | make notebook-image 31 | - name: Build cluster image 32 | run: | 33 | make cluster-image 34 | - name: Build profiling image 35 | run: | 36 | make profiling-image 37 | all-tests-successful: 38 | if: always() 39 | runs-on: ubuntu-latest 40 | needs: 41 | - build 42 | - lint 43 | steps: 44 | - name: Decide whether the needed jobs succeeded or failed 45 | uses: re-actors/alls-green@05ac9388f0aebcb5727afa17fcccfecd6f8ec5fe # v1.2.2 46 | with: 
47 | jobs: ${{ toJSON(needs) }} 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.7z 2 | *.a 3 | *.bin 4 | *.buffer 5 | *.bzip 6 | *.core 7 | *.csv 8 | dask-worker-space/ 9 | *.db 10 | *.dll 11 | *.doc 12 | *.docm 13 | *.docx 14 | .DS_Store 15 | *.egg-info/ 16 | *.env 17 | *.exe 18 | .idea/ 19 | .ipynb_checkpoints/ 20 | *.json 21 | LightGBM/ 22 | .mypy_cache/ 23 | *.npy 24 | *.o 25 | *.parquet 26 | *.pem 27 | *.pkl 28 | *.ppt 29 | *.pptm 30 | *.pptx 31 | profiling-output/ 32 | *.pq 33 | *.pyc 34 | __pycache/ 35 | *.query 36 | *.rsa 37 | .ruff_cache/ 38 | *.so 39 | *.sqlite 40 | *.tar.gz 41 | *.tgz 42 | *.text 43 | *.train 44 | *.txt 45 | Untitled*.ipynb 46 | *.whl 47 | *.xls 48 | *.xlsm 49 | *.xlsx 50 | *.zip 51 | -------------------------------------------------------------------------------- /.hadolint.yaml: -------------------------------------------------------------------------------- 1 | ignored: 2 | - DL3003 # use WORKDIR instead of cd 3 | - DL3007 # do not use latest 4 | - DL3008 # pin versions in apt 5 | - DL3013 # pin versions in pip 6 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | exclude: | 3 | (?x)^( 4 | LightGBM 5 | )$ 6 | 7 | repos: 8 | - repo: https://github.com/pre-commit/pre-commit-hooks 9 | rev: v5.0.0 10 | hooks: 11 | - id: check-toml 12 | - id: end-of-file-fixer 13 | - id: trailing-whitespace 14 | - repo: https://github.com/pre-commit/mirrors-mypy 15 | rev: v1.15.0 16 | hooks: 17 | - id: mypy 18 | args: ["--config-file", "pyproject.toml"] 19 | exclude: "tests" 20 | additional_dependencies: 21 | - types-requests 22 | - repo: https://github.com/astral-sh/ruff-pre-commit 23 | # Ruff version. 24 | rev: v0.11.6 25 | hooks: 26 | # Run the linter. 27 | - id: ruff 28 | args: ["--config", "pyproject.toml", "--fix"] 29 | types_or: [jupyter, python] 30 | # Run the formatter. 31 | - id: ruff-format 32 | args: ["--config", "pyproject.toml"] 33 | types_or: [python, jupyter] 34 | - repo: https://github.com/maxwinterstein/shfmt-py 35 | rev: v3.11.0.2 36 | hooks: 37 | - id: shfmt 38 | args: ["--indent=4", "--space-redirects", "--write"] 39 | - repo: https://github.com/shellcheck-py/shellcheck-py 40 | rev: v0.10.0.1 41 | hooks: 42 | - id: shellcheck 43 | args: ["--exclude=SC2002"] 44 | - repo: https://github.com/adrienverge/yamllint 45 | rev: v1.37.0 46 | hooks: 47 | - id: yamllint 48 | - repo: https://github.com/codespell-project/codespell 49 | rev: v2.4.1 50 | hooks: 51 | - id: codespell 52 | additional_dependencies: [tomli] 53 | args: ["--toml", "pyproject.toml"] 54 | -------------------------------------------------------------------------------- /.yamllint.yaml: -------------------------------------------------------------------------------- 1 | extends: default 2 | 3 | rules: 4 | anchors: 5 | forbid-undeclared-aliases: true 6 | forbid-duplicated-anchors: true 7 | forbid-unused-anchors: true 8 | braces: 9 | forbid: false 10 | min-spaces-inside: 0 11 | # allow 1 space for jinja templating in conda recipes 12 | max-spaces-inside: 1 13 | min-spaces-inside-empty: -1 14 | max-spaces-inside-empty: -1 15 | document-start: disable 16 | line-length: 17 | max: 120 18 | truthy: 19 | allowed-values: ['false', 'true'] 20 | # having problematic value in keys is rare... 
and also 21 | # GitHub Actions' choie of 'on:' triggers this check 22 | # ref: https://github.com/adrienverge/yamllint/issues/430 23 | check-keys: false 24 | -------------------------------------------------------------------------------- /Dockerfile-cluster: -------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE=unset 2 | 3 | # hadolint ignore=DL3006 4 | FROM ${BASE_IMAGE} 5 | 6 | RUN --mount=type=bind,source=LightGBM,target=/tmp/LightGBM,rw \ 7 | <=2024.4.4' \ 31 | "distributed==${DASK_VERSION}" \ 32 | lz4 \ 33 | numpy \ 34 | 'pandas>=2.0.0' \ 35 | scikit-learn 36 | 37 | # remove unnecessary files 38 | find \ 39 | /usr/local/lib/python${PYTHON_VERSION}/site-packages \ 40 | -type f \ 41 | \( \ 42 | -name '*.c' \ 43 | -o -name '*.cc' \ 44 | -o -name '*.cpp' \ 45 | -o -name '*.h' \ 46 | -o -name '*.hpp' \ 47 | -o -wholename '*bokeh/sampledata/*' \ 48 | -o -wholename '*dask/*tests/*' \ 49 | -o -wholename '*joblib/test/*' \ 50 | -o -wholename '*llvmlite/tests/*' \ 51 | -o -wholename '*numba/*tests/*' \ 52 | -o -wholename '*numpy/*tests/*' \ 53 | -o -wholename '*pandas/tests*' \ 54 | -o -wholename '*pandas/*/tests/*' \ 55 | -o -wholename '*psutil/tests/*' \ 56 | -o -wholename 'pyarrow/_pyarrow_cpp_tests*' \ 57 | -o -wholename '*scikit-learn/tests*' \ 58 | -o -wholename '*scikit-learn/*/tests*' \ 59 | -o -wholename '*sklearn/tests*' \ 60 | -o -wholename '*sklearn/*/tests*' \ 61 | -o -wholename '*scipy/*/tests*' \ 62 | -o -wholename '*sparse/*/tests/*' \ 63 | -o -wholename '*toolz/tests/*' \ 64 | -o -wholename '*tornado/test/*' \ 65 | -o -wholename '*zict/tests/*' \ 66 | -o -wholename '*/__pycache__/*' \ 67 | \) \ 68 | -exec rm '{}' '+' 69 | 70 | find \ 71 | /usr/local/lib/python${PYTHON_VERSION}/site-packages \ 72 | -type d \ 73 | -wholename '*__pycache__*' \ 74 | -exec rm -rf '{}' '+' 75 | 76 | # clean apt-get files 77 | apt-get clean 78 | apt-get purge -y --auto-remove 79 | rm -rf /var/lib/apt/lists/* 80 | 81 | # clean other files 82 | rm -rf ~/.cache 83 | EOF 84 | -------------------------------------------------------------------------------- /Dockerfile-notebook: -------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE=unset 2 | 3 | # hadolint ignore=DL3006 4 | FROM ${BASE_IMAGE} 5 | 6 | COPY jupyter_notebook_config.py /root/.jupyter/jupyter_notebook_config.py 7 | 8 | RUN --mount=type=bind,source=LightGBM,target=/tmp/LightGBM,rw \ 9 | <=2.5.0' \ 29 | blosc \ 30 | bokeh \ 31 | "dask==${DASK_VERSION}" \ 32 | 'dask-cloudprovider[aws]>=2022.10.0' \ 33 | 'dask-ml>=2023.3.24' \ 34 | "distributed==${DASK_VERSION}" \ 35 | 'jupyterlab>=4.0.2' \ 36 | lz4 \ 37 | numpy \ 38 | 'pandas>=2.0.0' \ 39 | scikit-learn 40 | 41 | # remove unnecessary files 42 | find \ 43 | /usr/local/lib/python${PYTHON_VERSION}/site-packages \ 44 | -type f \ 45 | \( \ 46 | -name '*.c' \ 47 | -o -name '*.cc' \ 48 | -o -name '*.cpp' \ 49 | -o -name '*.h' \ 50 | -o -name '*.hpp' \ 51 | -o -wholename '*bokeh/sampledata/*' \ 52 | -o -wholename '*dask/*tests/*' \ 53 | -o -wholename '*joblib/test/*' \ 54 | -o -wholename '*llvmlite/tests/*' \ 55 | -o -wholename '*numba/*tests/*' \ 56 | -o -wholename '*numpy/*tests/*' \ 57 | -o -wholename '*pandas/tests*' \ 58 | -o -wholename '*pandas/*/tests/*' \ 59 | -o -wholename '*psutil/tests/*' \ 60 | -o -wholename 'pyarrow/_pyarrow_cpp_tests*' \ 61 | -o -wholename '*scikit-learn/tests*' \ 62 | -o -wholename '*scikit-learn/*/tests*' \ 63 | -o -wholename '*sklearn/tests*' \ 64 | -o -wholename 
'*sklearn/*/tests*' \ 65 | -o -wholename '*scipy/*/tests*' \ 66 | -o -wholename '*sparse/*/tests/*' \ 67 | -o -wholename '*toolz/tests/*' \ 68 | -o -wholename '*tornado/test/*' \ 69 | -o -wholename '*zict/tests/*' \ 70 | -o -wholename '*/__pycache__/*' \ 71 | \) \ 72 | -exec rm '{}' '+' 73 | 74 | find \ 75 | /usr/local/lib/python${PYTHON_VERSION}/site-packages \ 76 | -type d \ 77 | -wholename '*__pycache__*' \ 78 | -exec rm -rf '{}' '+' 79 | 80 | # clean apt-get files 81 | apt-get clean 82 | apt-get purge -y --auto-remove 83 | rm -rf /var/lib/apt/lists/* 84 | 85 | # clean other files 86 | rm -rf ~/.cache 87 | EOF 88 | 89 | ENTRYPOINT ["jupyter", "lab", "--ip=0.0.0.0", "--allow-root", "--port=8888"] 90 | -------------------------------------------------------------------------------- /Dockerfile-profiling: -------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE=unset 2 | 3 | # hadolint ignore=DL3006 4 | FROM ${BASE_IMAGE} 5 | 6 | RUN < /dev/null); then \ 34 | if test ${FORCE_REBUILD} -le 0; then \ 35 | echo "image '${CLUSTER_BASE_IMAGE}' already exists. To force rebuilding, run 'make cluster-base-image -e FORCE_REBUILD=1'."; \ 36 | exit 0; \ 37 | fi; \ 38 | fi; \ 39 | docker buildx build \ 40 | --build-arg DASK_VERSION=${DASK_VERSION} \ 41 | --build-arg PYTHON_VERSION=${PYTHON_VERSION} \ 42 | --load \ 43 | --output type=docker \ 44 | -t ${CLUSTER_BASE_IMAGE} \ 45 | -f ./Dockerfile-cluster-base \ 46 | . 47 | echo "--- docker images ---" 48 | docker images 49 | 50 | .PHONY: cluster-image 51 | cluster-image: cluster-base-image $(LIB_LIGHTGBM) 52 | docker buildx build \ 53 | --build-arg BASE_IMAGE=${CLUSTER_BASE_IMAGE} \ 54 | --load \ 55 | --output type=docker \ 56 | -t ${CLUSTER_IMAGE} \ 57 | -f ./Dockerfile-cluster \ 58 | . 59 | 60 | .PHONY: create-repo 61 | create-repo: ecr-details.json 62 | 63 | .PHONY: delete-repo 64 | delete-repo: 65 | aws --region ${AWS_REGION} \ 66 | ecr-public batch-delete-image \ 67 | --repository-name ${CLUSTER_IMAGE_NAME} \ 68 | --image-ids imageTag=${IMAGE_TAG} 69 | aws --region ${AWS_REGION} \ 70 | ecr-public delete-repository \ 71 | --repository-name ${CLUSTER_IMAGE_NAME} 72 | rm -f ./ecr-details.json 73 | 74 | ecr-details.json: 75 | aws --region ${AWS_REGION} \ 76 | ecr-public create-repository \ 77 | --repository-name ${CLUSTER_IMAGE_NAME} \ 78 | > ./ecr-details.json 79 | 80 | $(LIGHTGBM_REPO): 81 | git clone --recursive https://github.com/microsoft/LightGBM.git 82 | 83 | $(LIB_LIGHTGBM): $(LIGHTGBM_REPO) 84 | make notebook-base-image 85 | docker run \ 86 | --rm \ 87 | -v $$(pwd)/LightGBM:/opt/LightGBM \ 88 | --workdir=/opt/LightGBM \ 89 | --entrypoint="" \ 90 | -i ${NOTEBOOK_BASE_IMAGE} \ 91 | /bin/bash -cex \ 92 | "rm -rf ./build && cmake -B build -S . 
&& cmake --build build --target _lightgbm -j2" 93 | 94 | .PHONY: lightgbm-unit-tests 95 | lightgbm-unit-tests: 96 | docker run \ 97 | --rm \ 98 | -v $$(pwd)/LightGBM:/opt/LightGBM \ 99 | --workdir=/opt/LightGBM \ 100 | --entrypoint="" \ 101 | -i ${CLUSTER_IMAGE} \ 102 | /bin/bash -cex \ 103 | "sh ./build-python.sh install --precompile && pip install pytest && pytest -vv -rA tests/python_package_test/test_dask.py" 104 | 105 | .PHONY: lint-dockerfiles 106 | lint-dockerfiles: 107 | for dockerfile in $$(ls | grep -E '^Dockerfile'); do \ 108 | echo "linting $${dockerfile}" && \ 109 | docker run \ 110 | --rm \ 111 | -v $$(pwd)/.hadolint.yaml:/.config/hadolint.yaml \ 112 | -i \ 113 | hadolint/hadolint \ 114 | < $${dockerfile} || exit 1; \ 115 | done 116 | 117 | .PHONY: notebook-base-image 118 | notebook-base-image: 119 | @if $$(docker image inspect ${NOTEBOOK_BASE_IMAGE} > /dev/null); then \ 120 | if test ${FORCE_REBUILD} -le 0; then \ 121 | echo "image '${NOTEBOOK_BASE_IMAGE}' already exists. To force rebuilding, run 'make notebook-base-image -e FORCE_REBUILD=1'."; \ 122 | exit 0; \ 123 | fi; \ 124 | fi; \ 125 | docker buildx build \ 126 | --build-arg DASK_VERSION=${DASK_VERSION} \ 127 | --build-arg PYTHON_VERSION=${PYTHON_VERSION} \ 128 | --load \ 129 | --output type=docker \ 130 | -t ${NOTEBOOK_BASE_IMAGE} \ 131 | -f ./Dockerfile-notebook-base \ 132 | . 133 | 134 | .PHONY: notebook-image 135 | notebook-image: notebook-base-image $(LIB_LIGHTGBM) 136 | docker buildx build \ 137 | --build-arg BASE_IMAGE=${NOTEBOOK_BASE_IMAGE} \ 138 | --load \ 139 | --output type=docker \ 140 | -t ${NOTEBOOK_IMAGE} \ 141 | -f ./Dockerfile-notebook \ 142 | . 143 | 144 | .PHONY: profile 145 | profile: profiling-image 146 | docker run \ 147 | --rm \ 148 | -p 8080:8080 \ 149 | --env LIGHTGBM_HOME=/opt/LightGBM \ 150 | --env PROFILING_OUTPUT_DIR=/profiling-output \ 151 | -v $$(pwd)/profiling-output:/profiling-output \ 152 | -v $$(pwd)/LightGBM:/opt/LightGBM \ 153 | --workdir=/opt/LightGBM \ 154 | --entrypoint="" \ 155 | -i ${PROFILING_IMAGE} \ 156 | /bin/bash -cex \ 157 | '/bin/bash /usr/local/bin/profile-examples.sh && python -m snakeviz /profiling-output/ --hostname 0.0.0.0 --server' 158 | 159 | .PHONY: profiling-image 160 | profiling-image: cluster-image 161 | @if $$(docker image inspect ${PROFILING_IMAGE} > /dev/null); then \ 162 | if test ${FORCE_REBUILD_PROFILING_IMAGE} -le 0; then \ 163 | echo "image '${PROFILING_IMAGE}' already exists. To force rebuilding, run 'make profiling-image -e FORCE_REBUILD_PROFILING_IMAGE=1'."; \ 164 | exit 0; \ 165 | fi; \ 166 | fi && \ 167 | docker buildx build \ 168 | --build-arg BASE_IMAGE=${CLUSTER_IMAGE} \ 169 | --load \ 170 | --output type=docker \ 171 | -t ${PROFILING_IMAGE} \ 172 | -f ./Dockerfile-profiling \ 173 | . 
174 | 175 | .PHONY: profile-memory-usage 176 | profile-memory-usage: profiling-image 177 | docker run \ 178 | --rm \ 179 | --env LIGHTGBM_HOME=/opt/LightGBM \ 180 | --env PROFILING_OUTPUT_DIR=/profiling-output/memory-usage \ 181 | -v $$(pwd)/profiling-output:/profiling-output \ 182 | -v $$(pwd)/LightGBM:/opt/LightGBM \ 183 | --workdir=/opt/LightGBM \ 184 | --entrypoint="" \ 185 | -i ${PROFILING_IMAGE} \ 186 | /bin/bash -cex \ 187 | '/bin/bash /usr/local/bin/profile-example-memory-usage.sh' 188 | 189 | # https://docs.amazonaws.cn/en_us/AmazonECR/latest/public/docker-push-ecr-image.html 190 | .PHONY: push-image 191 | push-image: create-repo 192 | aws ecr-public get-login-password \ 193 | --region ${AWS_REGION} \ 194 | | docker login \ 195 | --username AWS \ 196 | --password-stdin public.ecr.aws 197 | docker tag \ 198 | ${CLUSTER_IMAGE_NAME}:${IMAGE_TAG} \ 199 | $$(cat ./ecr-details.json | jq .'repository'.'repositoryUri' | tr -d '"'):${IMAGE_TAG} 200 | docker push \ 201 | $$(cat ./ecr-details.json | jq .'repository'.'repositoryUri' | tr -d '"'):${IMAGE_TAG} 202 | 203 | # NOTE: IMAGE_TAG is in the environment here so the AWS notebooks 204 | # know what image to use for the Dask cluster 205 | .PHONY: start-notebook 206 | start-notebook: 207 | docker run \ 208 | --rm \ 209 | -v $$(pwd):/root/testing \ 210 | --env AWS_ACCESS_KEY_ID=$${AWS_ACCESS_KEY_ID:-notset} \ 211 | --env AWS_DEFAULT_REGION=${AWS_REGION} \ 212 | --env AWS_SECRET_ACCESS_KEY=$${AWS_SECRET_ACCESS_KEY:-notset} \ 213 | --env IMAGE_TAG=${IMAGE_TAG} \ 214 | -p 8888:8888 \ 215 | -p 8787:8787 \ 216 | --name ${NOTEBOOK_CONTAINER_NAME} \ 217 | ${NOTEBOOK_IMAGE} 218 | 219 | .PHONY: stop-notebook 220 | stop-notebook: 221 | @docker kill ${NOTEBOOK_CONTAINER_NAME} 222 | @docker rm ${NOTEBOOK_CONTAINER_NAME} 223 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Testing `lightgbm.dask` 2 | 3 | [![GitHub Actions](https://github.com/jameslamb/lightgbm-dask-testing/actions/workflows/main.yml/badge.svg?branch=main)](https://github.com/jameslamb/lightgbm-dask-testing/actions/workflows/main.yml) 4 | 5 | This repository can be used to test and develop changes to LightGBM's Dask integration. 6 | It contains the following useful features: 7 | 8 | * `make` recipes for building a local development image with `lightgbm` installed from a local copy, and Jupyter Lab running for interactive development 9 | * Jupyter notebooks for testing `lightgbm.dask` against a `LocalCluster` (multi-worker, single-machine) and a `dask_cloudprovider.aws.FargateCluster` (multi-worker, multi-machine) 10 | * `make` recipes for publishing a custom container image to ECR Public repository, for use with AWS Fargate 11 | 12 |
13 | 14 | **Contents** 15 | 16 | - [Getting Started](#getting-started) 17 | - [Develop in Jupyter](#develop-in-jupyter) 18 | - [Test with a LocalCluster](#test-with-a-localcluster) 19 | - [Test with a FargateCluster](#test-with-a-fargatecluster) 20 | - [Run LightGBM unit tests](#run-lightgbm-unit-tests) 21 | - [Profile LightGBM code](#profile-lightgbm-code) 22 | - [runtime profiling](#runtime-profiling) 23 | 24 | ## Getting Started 25 | 26 | To begin, clone a copy of LightGBM to a folder `LightGBM` at the root of this repo. 27 | You can do this however you want, for example: 28 | 29 | ```shell 30 | git clone \ 31 | --recursive \ 32 | git@github.com:microsoft/LightGBM.git \ 33 | ./LightGBM 34 | ``` 35 | 36 | If you're developing a reproducible example for [an issue](https://github.com/microsoft/LightGBM/issues) or you're testing a potential [pull request](https://github.com/microsoft/LightGBM/pulls), you probably want to clone LightGBM from your fork instead of from the main repo. 37 | 38 |
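If you go the fork route, a minimal sketch looks like the following. Here `your-username` is a hypothetical placeholder for your GitHub account, and adding the `upstream` remote is optional but convenient for pulling in new changes from the main repo.

```shell
# clone your fork (replace 'your-username' with your GitHub account)
git clone \
    --recursive \
    git@github.com:your-username/LightGBM.git \
    ./LightGBM

# optionally, track the main repo as 'upstream' so you can rebase on its new changes
cd ./LightGBM
git remote add upstream https://github.com/microsoft/LightGBM.git
git fetch upstream
cd ..
```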
39 | 40 | ## Develop in Jupyter 41 | 42 | This section describes how to test a version of LightGBM in Jupyter. 43 | 44 | #### 1. Build the notebook image 45 | 46 | Run the following to build an image that includes `lightgbm`, all its dependencies, and a JupyterLab setup. 47 | 48 | ```shell 49 | make notebook-image 50 | ``` 51 | 52 | The first time you run this, it will take a few minutes as this project needs to build a base image with LightGBM's dependencies and needs to compile the LightGBM C++ library. 53 | 54 | Every time after that, `make notebook-image` should run very quickly. 55 | 56 | #### 2. Run a notebook locally 57 | 58 | Start up Jupyter Lab! 59 | This command will run Jupyter Lab in a container using the image you built with `make notebook-image`. 60 | 61 | ```shell 62 | make start-notebook 63 | ``` 64 | 65 | Navigate to `http://127.0.0.1:8888/lab` in your web browser. 66 | 67 | The command `make start-notebook` mounts your current working directory into the running container. 68 | That means that even though Jupyter Lab is running inside the container, changes that you make in it will be saved on your local filesystem even after you shut the container down. 69 | So you can edit and create notebooks and other code in there with confidence! 70 | 71 | When you're done with the notebook, stop the container by running the following from another shell: 72 | 73 | ```shell 74 | make stop-notebook 75 | ``` 76 | 77 |
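At any point, a quick sanity check that the image's `lightgbm` (installed from your local `LightGBM/` checkout when the image was built) imports cleanly is to run a cell like the following in Jupyter Lab. This is a minimal sketch; it assumes nothing beyond what the image already installs.

```python
# run this in a notebook cell inside the container
import lightgbm
from lightgbm.dask import DaskLGBMClassifier, DaskLGBMRanker, DaskLGBMRegressor

print(f"lightgbm version: {lightgbm.__version__}")

# confirm the Dask estimators are importable from this build
for est in (DaskLGBMClassifier, DaskLGBMRanker, DaskLGBMRegressor):
    print(est.__name__)
```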
78 | 79 | ## Test with a `LocalCluster` 80 | 81 | To test `lightgbm.dask` on a `LocalCluster`, run the steps in ["Develop in Jupyter"](#develop-in-jupyter), then try out [`demo.ipynb`](./notebooks/demo.ipynb) or your own notebooks. 82 | 83 |
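For reference, the core of that notebook condenses to roughly the following sketch of `lightgbm.dask` against a `LocalCluster` (parameters are illustrative, not tuned):

```python
import dask.array as da
from dask.distributed import Client, LocalCluster
from lightgbm.dask import DaskLGBMRegressor
from sklearn.datasets import make_regression

# start a local "cluster" of 3 worker processes
cluster = LocalCluster(n_workers=3)
client = Client(cluster)
client.wait_for_workers(3)

# generate a toy regression dataset and wrap it in Dask Arrays
X, y = make_regression(n_samples=10_000, random_state=42)
dX = da.from_array(X, chunks=(1_000, X.shape[1]))
dy = da.from_array(y, chunks=1_000)

# train a distributed LightGBM regressor, then predict (lazily)
dask_reg = DaskLGBMRegressor(client=client, n_estimators=100, tree_learner="data")
dask_reg.fit(X=dX, y=dy)
preds = dask_reg.predict(dX)
print(preds.compute()[:5])
```

Everything above runs inside the notebook container, so no AWS resources are needed.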
84 | 85 | ## Test with a `FargateCluster` 86 | 87 | There are some problems with Dask code which only arise in a truly distributed, multi-machine setup. 88 | To test for these sorts of issues, I like to use [`dask-cloudprovider`](https://github.com/dask/dask-cloudprovider). 89 | 90 | The steps below describe how to test a local copy of LightGBM on a `FargateCluster` from `dask-cloudprovider`. 91 | 92 | #### 1. Build the cluster image 93 | 94 | Build an image that can be used for the scheduler and works in the Dask cluster you'll create on AWS Fargate. 95 | This image will have your local copy of LightGBM installed in it. 96 | 97 | ```shell 98 | make cluster-image 99 | ``` 100 | 101 | #### 2. Install and configure the AWS CLI 102 | 103 | For the rest of the steps in this section, you'll need access to AWS resources. 104 | To begin, install the AWS CLI if you don't already have it. 105 | 106 | ```shell 107 | pip install --upgrade awscli 108 | ``` 109 | 110 | Next, configure your shell to make authenticated requests to AWS. 111 | If you've never done this, you can see [the AWS CLI docs](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html). 112 | 113 | The rest of this section assumes that the shell variables `AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` have been set. 114 | 115 | I like to set these by keeping them in a file 116 | 117 | ```text 118 | # file: aws.env 119 | AWS_SECRET_ACCESS_KEY=your-key-here 120 | AWS_ACCESS_KEY_ID=your-access-key-id-here 121 | ``` 122 | 123 | and then sourcing that file 124 | 125 | ```shell 126 | set -o allexport 127 | source aws.env 128 | set +o allexport 129 | ``` 130 | 131 | #### 3. Push the cluster image to ECR 132 | 133 | To use the cluster image in the containers you spin up on Fargate, it has to be available in a container registry. 134 | This project uses the free AWS Elastic Container Registry (ECR) Public. 135 | For more information on ECR Public, see [the AWS docs](https://docs.amazonaws.cn/en_us/AmazonECR/latest/public/docker-push-ecr-image.html). 136 | 137 | The command below will create a new repository on ECR Public, store the details of that repository in a file `ecr-details.json`, and push the cluster image to it. 138 | The cluster image will not contain your credentials, notebooks, or other local files. 139 | 140 | ```shell 141 | make push-image 142 | ``` 143 | 144 | This may take a few minutes to complete. 145 | 146 | #### 4. Run the AWS notebook 147 | 148 | Follow the steps in ["Develop in Jupyter"](#develop-in-jupyter) to get a local Jupyter Lab running. 149 | Open [`demo-aws.ipynb`](./notebooks/demo-aws.ipynb). 150 | That notebook contains sample code that uses `dask-cloudprovider` to provision a Dask cluster on AWS Fargate. 151 | 152 | You can view the cluster's current state and its logs by navigating to the Elastic Container Service (ECS) section of the AWS console. 153 | 154 | #### 5. Clean Up 155 | 156 | As you work on whatever experiment you're doing, you'll probably find yourself wanting to repeat these steps multiple times. 157 | 158 | To remove the image you pushed to ECR Public and the repository you created there, run the following 159 | 160 | ```shell 161 | make delete-repo 162 | ``` 163 | 164 | Then, repeat the steps above to rebuild your images and test again. 165 | 166 |
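Between iterations, it can help to double-check which ECR Public repository the notebooks will pull the cluster image from. `make push-image` reads the repository URI out of `ecr-details.json` with `jq`, and `demo-aws.ipynb` reads the same file, so you can inspect it by hand (a small sketch, assuming `jq` is installed on your machine):

```shell
# print the ECR Public repository URI recorded by 'make create-repo' / 'make push-image'
# ('make delete-repo' removes ecr-details.json when it tears the repository down)
jq -r '.repository.repositoryUri' ./ecr-details.json
```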
167 | 168 | ## Run LightGBM unit tests 169 | 170 | This repo makes it easy to run `lightgbm`'s Dask unit tests in a containerized setup. 171 | 172 | ```shell 173 | make lightgbm-unit-tests 174 | ``` 175 | 176 | Pass variable `DASK_VERSION` to use a different version of `dask` / `distributed`. 177 | 178 | ```shell 179 | make lightgbm-unit-tests \ 180 | -e DASK_VERSION=2024.12.0 181 | ``` 182 | 183 | ## Profile LightGBM code 184 | 185 | ### runtime profiling 186 | 187 | To try to identify expensive parts of the code path for `lightgbm`, you can run its examples under `cProfile` ([link](https://docs.python.org/3/library/profile.html)) and then visualize those profiling results with `snakeviz` ([link](https://jiffyclub.github.io/snakeviz/)). 188 | 189 | ```shell 190 | make profile 191 | ``` 192 | 193 | Then navigate to `http://0.0.0.0:8080/snakeviz/%2Fprofiling-output` in your web browser. 194 | 195 | ### memory profiling 196 | 197 | To summarize memory allocations in typical uses of LightGBM, and to attribute those memory allocations to particular codepaths, you can run its examples under `memray` ([link](https://github.com/bloomberg/memray)). 198 | 199 | ```shell 200 | make profile-memory-usage 201 | ``` 202 | 203 | That will generate a bunch of HTML files. 204 | View them in your browser by running the following, then navigating to `localhost:1234`. 205 | 206 | ```shell 207 | python -m http.server \ 208 | --directory ./profiling-output/memory-usage \ 209 | 1234 210 | ``` 211 | 212 | ## Useful Links 213 | 214 | * https://github.com/microsoft/LightGBM/pull/3515 215 | * https://docs.aws.amazon.com/cli/latest/reference/ecr-public/ 216 | * https://docs.amazonaws.cn/en_us/AmazonECR/latest/public/docker-push-ecr-image.html 217 | * https://github.com/dask/dask-docker 218 | * https://docs.aws.amazon.com/AmazonECR/latest/public/public-registries.html 219 | -------------------------------------------------------------------------------- /bin/install-cmake: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e -u -o pipefail 4 | 5 | CMAKE_VERSION=${1} 6 | 7 | install_script="cmake-${CMAKE_VERSION}-linux-$(arch).sh" 8 | 9 | curl -O -L \ 10 | "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${install_script}" 11 | 12 | mkdir /opt/cmake 13 | sh "${install_script}" \ 14 | --skip-license \ 15 | --prefix=/opt/cmake 16 | 17 | rm "./${install_script}" 18 | 19 | ln -sf /opt/cmake/bin/cmake /usr/local/bin/cmake 20 | -------------------------------------------------------------------------------- /bin/profile-example-memory-usage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # [description] 4 | # 5 | # Profile memory usage of all of LightGBM's Python examples, using memray. 
6 | 7 | set -e -u -o pipefail 8 | 9 | echo "profiling examples" 10 | mkdir -p "${PROFILING_OUTPUT_DIR}/bin" 11 | 12 | # shellcheck disable=SC2044 13 | for py_script in $(find "${LIGHTGBM_HOME}/examples/python-guide" -name '*.py'); do 14 | base_filename=$(basename "${py_script}") 15 | prof_file="${base_filename/.py/.bin}" 16 | table_file="${base_filename/.py/-table.html}" 17 | leak_table_file="${base_filename/.py/-leak-table.html}" 18 | flamegraph_file="${base_filename/.py/-flamegraph.html}" 19 | echo " - ${base_filename}" 20 | memray run \ 21 | -o "${PROFILING_OUTPUT_DIR}/bin/${prof_file}" \ 22 | "${py_script}" > /dev/null 2>&1 || 23 | true 24 | memray table \ 25 | -o "${PROFILING_OUTPUT_DIR}/${table_file}" \ 26 | --force \ 27 | "${PROFILING_OUTPUT_DIR}/bin/${prof_file}" 28 | memray table \ 29 | -o "${PROFILING_OUTPUT_DIR}/${leak_table_file}" \ 30 | --force \ 31 | --leaks \ 32 | "${PROFILING_OUTPUT_DIR}/bin/${prof_file}" 33 | memray flamegraph \ 34 | -o "${PROFILING_OUTPUT_DIR}/${flamegraph_file}" \ 35 | --force \ 36 | "${PROFILING_OUTPUT_DIR}/bin/${prof_file}" 37 | done 38 | echo "Done profiling examples. See '${PROFILING_OUTPUT_DIR}' for results." 39 | -------------------------------------------------------------------------------- /bin/profile-examples.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # [description] 4 | # 5 | # Profile all of LightGBM's Python examples, using cProfile. 6 | 7 | set -e -u -o pipefail 8 | 9 | echo "profiling examples" 10 | # shellcheck disable=SC2044 11 | for py_script in $(find "${LIGHTGBM_HOME}/examples/python-guide" -name '*.py'); do 12 | base_filename=$(basename "${py_script}") 13 | prof_file="${base_filename/.py/.prof}" 14 | echo " - ${base_filename}" 15 | python \ 16 | -Wignore \ 17 | -m cProfile \ 18 | -o "${PROFILING_OUTPUT_DIR}/${prof_file}" \ 19 | "${py_script}" > /dev/null 2>&1 || 20 | true 21 | done 22 | echo "Done profiling examples. See '${PROFILING_OUTPUT_DIR}' for results." 23 | -------------------------------------------------------------------------------- /jupyter_notebook_config.py: -------------------------------------------------------------------------------- 1 | # mypy: disable-error-code="name-defined" 2 | c.ServerApp.token = "" 3 | c.ServerApp.password = "" 4 | c.ServerApp.open_browser = False 5 | c.ServerApp.ip = "localhost" 6 | -------------------------------------------------------------------------------- /notebooks/README.md: -------------------------------------------------------------------------------- 1 | # notebooks 2 | 3 | This directory contains notebooks used to test `lightgbm.dask`. 4 | 5 | The following notebooks can be used as tutorials for running machine learning workflows with LightGBM using Dask. 6 | 7 | * [demo.ipynb](./demo.ipynb) - Minimal example of training a regression model on a `LocalCluster`. 8 | * [demo-aws.ipynb](./demo-aws.ipynb) - Minimal example of training a regression model on AWS Fargate, using `dask-cloudprovider` 9 | 10 | ## Other notebooks in this section 11 | 12 | `testing/` contains random notebooks used to test pull requests and issues on LightGBM. Everything in that folder should be considered temporary and experimental. 
13 | -------------------------------------------------------------------------------- /notebooks/_img/aws.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 9 | 10 | 31 | 32 | 34 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /notebooks/_img/dask-horizontal.svg: -------------------------------------------------------------------------------- 1 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | Dask 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /notebooks/_img/lightgbm.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 8 | 9 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /notebooks/demo-aws.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "tribal-xerox", 6 | "metadata": {}, 7 | "source": [ 8 | "# LightGBM + Dask\n", 9 | "\n", 10 | "\n", 11 | " \n", 12 | " \n", 15 | " \n", 18 | " \n", 21 | " \n", 22 | "
\n", 13 | " \n", 14 | " \n", 16 | " \n", 17 | " \n", 19 | " \n", 20 | "
\n", 23 | "\n", 24 | "This notebook shows how to use `lightgbm.dask` to train a LightGBM model on data stored as a [Dask Array](https://docs.dask.org/en/latest/array.html). It uses `FargateCluster` from [`dask-cloudprovider`](https://github.com/dask/dask-cloudprovider) to create a distributed cluster running on [AWS Fargate](https://aws.amazon.com/fargate/).\n", 25 | "\n", 26 | "To explore other topics in greater depth, see the other notebooks." 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "id": "amino-hunger", 32 | "metadata": {}, 33 | "source": [ 34 | "
\n", 35 | "\n", 36 | "## Set up a Dask cluster on AWS Fargate\n", 37 | "\n", 38 | "Before running any of the code in the notebook, follow the instructions in [\"Test with a FargateCluster\"](../README.md##test-with-a-fargatecluster)." 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "id": "expanded-declaration", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "import json\n", 49 | "import os\n", 50 | "\n", 51 | "with open(\"../ecr-details.json\", \"r\") as f:\n", 52 | " ecr_details = json.loads(f.read())\n", 53 | "\n", 54 | "IMAGE_REPO = ecr_details[\"repository\"][\"repositoryUri\"]\n", 55 | "IMAGE_TAG = os.environ[\"IMAGE_TAG\"]\n", 56 | "IMAGE_URI = f\"{IMAGE_REPO}:{IMAGE_TAG}\"\n", 57 | "print(f\"scheduler and worker image: {IMAGE_URI}\")" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "id": "harmful-bosnia", 63 | "metadata": {}, 64 | "source": [ 65 | "Before proceeding, set up your AWS credentials. If you're unsure how to do this, see [the AWS docs](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html).\n", 66 | "\n", 67 | "Next, determine the CPU architecture of the machine you're running on.\n", 68 | "This project builds single-architecture container images matching the host system, so it's important\n", 69 | "to use the same CPU architecture on AWS Fargate." 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "id": "4a10d61c-5251-46a7-9f16-bd6eef606a82", 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "import platform\n", 80 | "\n", 81 | "if platform.machine().lower() in {\"aarch64\", \"arm64\"}:\n", 82 | " cpu_architecture = \"ARM64\"\n", 83 | "else:\n", 84 | " cpu_architecture = \"X86_64\"" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "id": "complicated-little", 90 | "metadata": {}, 91 | "source": [ 92 | "Create a cluster with 3 workers. See https://cloudprovider.dask.org/en/latest/aws.html#dask_cloudprovider.aws.FargateCluster for more options." 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "id": "respective-collect", 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "from dask.distributed import Client\n", 103 | "from dask_cloudprovider.aws import FargateCluster\n", 104 | "\n", 105 | "n_workers = 3\n", 106 | "cluster = FargateCluster(\n", 107 | " image=IMAGE_URI,\n", 108 | " cpu_architecture=cpu_architecture,\n", 109 | " worker_cpu=512,\n", 110 | " worker_mem=4096,\n", 111 | " n_workers=n_workers,\n", 112 | " fargate_use_private_ip=False,\n", 113 | " scheduler_timeout=\"40 minutes\",\n", 114 | ")\n", 115 | "client = Client(cluster)\n", 116 | "client.wait_for_workers(n_workers)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "id": "raising-mauritius", 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "print(f\"View the dashboard: {cluster.dashboard_link}\")" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "id": "radical-composition", 132 | "metadata": {}, 133 | "source": [ 134 | "Click the link above to view a diagnostic dashboard while you run the training code below." 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "id": "modified-lincoln", 140 | "metadata": {}, 141 | "source": [ 142 | "
\n", 143 | "\n", 144 | "## Get some training data\n", 145 | "\n", 146 | "This example uses `sklearn.datasets.make_regression()` to generate a dataset in `numpy` format, then uses `dask.Array.from_array()` to turn that into a Dask Array.\n", 147 | "\n", 148 | "That's just done for convenience. `lightgbm.dask` just expects that your data are Dask Arrays or Dask DataFrames." 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "id": "structural-street", 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "import dask.array as da\n", 159 | "from sklearn.datasets import make_regression\n", 160 | "\n", 161 | "X, y = make_regression(n_samples=10000, random_state=42)\n", 162 | "dX = da.from_array(X, chunks=(1000, X.shape[1]))\n", 163 | "dy = da.from_array(y, chunks=1000)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "id": "unavailable-future", 169 | "metadata": {}, 170 | "source": [ 171 | "Right now, the Dask Arrays `data` and `labels` are lazy. Before training, you can force the cluster to compute them by running `.persist()` and then wait for that computation to finish by `wait()`-ing on them.\n", 172 | "\n", 173 | "Doing this is optional, but it will make data loading a one-time cost so subsequent runs are fast." 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "id": "quiet-nicaragua", 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "from dask.distributed import wait\n", 184 | "\n", 185 | "dX = dX.persist()\n", 186 | "dy = dy.persist()\n", 187 | "_ = wait([dX, dy])" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "id": "acoustic-corner", 193 | "metadata": {}, 194 | "source": [ 195 | "
\n", 196 | "\n", 197 | "## Train a model\n", 198 | "\n", 199 | "With the data set up on the workers, train a model. `lightgbm.dask.DaskLGBMRegressor` has an interface that tries to stay as close as possible to the non-Dask scikit-learn interface to LightGBM (`lightgbm.sklearn.LGBMRegressor`)." 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "id": "pleased-brunei", 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "from lightgbm.dask import DaskLGBMRegressor\n", 210 | "\n", 211 | "dask_reg = DaskLGBMRegressor(\n", 212 | " client=client,\n", 213 | " max_depth=5,\n", 214 | " objective=\"regression_l1\",\n", 215 | " learning_rate=0.1,\n", 216 | " tree_learner=\"data\",\n", 217 | " n_estimators=100,\n", 218 | " min_child_samples=1,\n", 219 | ")\n", 220 | "\n", 221 | "dask_reg.fit(\n", 222 | " X=dX,\n", 223 | " y=dy,\n", 224 | ")" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "id": "designed-kidney", 230 | "metadata": {}, 231 | "source": [ 232 | "
\n", 233 | "\n", 234 | "## Evaluate the model\n", 235 | "\n", 236 | "The `.predict()` method takes in a Dask collection and returns a Dask Array." 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "id": "flexible-constitutional", 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "preds = dask_reg.predict(dX)\n", 247 | "print(str(preds))" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "id": "funny-trademark", 253 | "metadata": {}, 254 | "source": [ 255 | "Before calculating the mean absolute error (MAE) of these predictions, compute some summary statistics on the target variable. This is necessary to understand what \"good\" values of MAE look like." 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "id": "cross-mistake", 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "p = [0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]\n", 266 | "dy_percentiles = da.percentile(dy, p).compute()\n", 267 | "\n", 268 | "for i, percentile in enumerate(p):\n", 269 | " print(f\"{percentile * 100}%: {round(dy_percentiles[i], 2)}\")" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "id": "offensive-switch", 275 | "metadata": {}, 276 | "source": [ 277 | "The metrics functions from `dask-ml` match those from `scikit-learn`, but take in and return Dask collections. You can use these functions to perform model evaluation without the evaluation data or predictions needing to be pulled down to the machine running this notebook. Pretty cool, right?" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "id": "hybrid-greece", 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "from dask_ml.metrics.regression import mean_absolute_error\n", 288 | "\n", 289 | "mean_absolute_error(preds, dy)" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "id": "outer-region", 295 | "metadata": {}, 296 | "source": [ 297 | "## Next Steps\n", 298 | "\n", 299 | "Learn more: https://lightgbm.readthedocs.io/en/latest/Python-API.html#dask-api.\n", 300 | "\n", 301 | "Ask a question, report a bug, or submit a feature request: https://github.com/microsoft/LightGBM/issues.\n", 302 | "\n", 303 | "Contribute: https://github.com/microsoft/LightGBM/issues?q=is%3Aissue+is%3Aopen+label%3Adask." 304 | ] 305 | } 306 | ], 307 | "metadata": { 308 | "kernelspec": { 309 | "display_name": "Python 3 (ipykernel)", 310 | "language": "python", 311 | "name": "python3" 312 | }, 313 | "language_info": { 314 | "codemirror_mode": { 315 | "name": "ipython", 316 | "version": 3 317 | }, 318 | "file_extension": ".py", 319 | "mimetype": "text/x-python", 320 | "name": "python", 321 | "nbconvert_exporter": "python", 322 | "pygments_lexer": "ipython3", 323 | "version": "3.12.8" 324 | } 325 | }, 326 | "nbformat": 4, 327 | "nbformat_minor": 5 328 | } 329 | -------------------------------------------------------------------------------- /notebooks/demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "noticed-account", 6 | "metadata": {}, 7 | "source": [ 8 | "# LightGBM + Dask\n", 9 | "\n", 10 | "\n", 11 | " \n", 12 | " \n", 15 | " \n", 18 | " \n", 19 | "
\n", 13 | " \n", 14 | " \n", 16 | " \n", 17 | "
\n", 20 | "\n", 21 | "This notebook shows how to use `lightgbm.dask` to train a LightGBM model on data stored as a [Dask Array](https://docs.dask.org/en/latest/array.html).\n", 22 | "\n", 23 | "To explore other topics in greater depth, see the other notebooks." 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "id": "surprising-incentive", 29 | "metadata": {}, 30 | "source": [ 31 | "
\n", 32 | "\n", 33 | "## Set up a local Dask cluster\n", 34 | "\n", 35 | "Create a cluster with 3 workers. Since this is a `LocalCluster`, those workers are just 3 local processes." 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "id": "dietary-multimedia", 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "from dask.distributed import Client, LocalCluster\n", 46 | "\n", 47 | "n_workers = 3\n", 48 | "cluster = LocalCluster(n_workers=n_workers)\n", 49 | "\n", 50 | "client = Client(cluster)\n", 51 | "client.wait_for_workers(n_workers)\n", 52 | "\n", 53 | "print(f\"View the dashboard: {cluster.dashboard_link}\")" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "id": "coated-paper", 59 | "metadata": {}, 60 | "source": [ 61 | "Click the link above to view a diagnostic dashboard while you run the training code below." 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "id": "confirmed-recommendation", 67 | "metadata": {}, 68 | "source": [ 69 | "
\n", 70 | "\n", 71 | "## Get some training data\n", 72 | "\n", 73 | "This example uses `sklearn.datasets.make_regression()` to generate a dataset in `numpy` format, then uses `dask.Array.from_array()` to turn that into a Dask Array.\n", 74 | "\n", 75 | "That's just done for convenience. `lightgbm.dask` just expects that your data are Dask Arrays or Dask DataFrames." 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "id": "billion-password", 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "import dask.array as da\n", 86 | "from sklearn.datasets import make_regression\n", 87 | "\n", 88 | "X, y = make_regression(n_samples=10000, random_state=42)\n", 89 | "dX = da.from_array(X, chunks=(1000, X.shape[1]))\n", 90 | "dy = da.from_array(y, chunks=1000)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "id": "temporal-terrace", 96 | "metadata": {}, 97 | "source": [ 98 | "Right now, the Dask Arrays `data` and `labels` are lazy. Before training, you can force the cluster to compute them by running `.persist()` and then wait for that computation to finish by `wait()`-ing on them.\n", 99 | "\n", 100 | "Doing this is optional, but it will make data loading a one-time cost so subsequent runs are fast." 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "id": "metallic-attachment", 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "from dask.distributed import wait\n", 111 | "\n", 112 | "dX = dX.persist()\n", 113 | "dy = dy.persist()\n", 114 | "_ = wait([dX, dy])" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "id": "interpreted-central", 120 | "metadata": {}, 121 | "source": [ 122 | "
\n", 123 | "\n", 124 | "## Train a model\n", 125 | "\n", 126 | "With the data set up on the workers, train a model. `lightgbm.dask.DaskLGBMRegressor` has an interface that tries to stay as close as possible to the non-Dask scikit-learn interface to LightGBM (`lightgbm.sklearn.LGBMRegressor`)." 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "id": "animated-magnitude", 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "from lightgbm.dask import DaskLGBMRegressor\n", 137 | "\n", 138 | "dask_reg = DaskLGBMRegressor(\n", 139 | " client=client,\n", 140 | " max_depth=5,\n", 141 | " objective=\"regression_l1\",\n", 142 | " learning_rate=0.1,\n", 143 | " tree_learner=\"data\",\n", 144 | " n_estimators=100,\n", 145 | " min_child_samples=1,\n", 146 | ")\n", 147 | "\n", 148 | "dask_reg.fit(\n", 149 | " X=dX,\n", 150 | " y=dy,\n", 151 | ")" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "id": "processed-karen", 157 | "metadata": {}, 158 | "source": [ 159 | "
\n", 160 | "\n", 161 | "## Evaluate the model\n", 162 | "\n", 163 | "The `.predict()` method takes in a Dask collection and returns a Dask Array." 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "id": "logical-handbook", 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "preds = dask_reg.predict(dX)\n", 174 | "print(str(preds))" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "id": "prerequisite-symposium", 180 | "metadata": {}, 181 | "source": [ 182 | "Before calculating the mean absolute error (MAE) of these predictions, compute some summary statistics on the target variable. This is necessary to understand what \"good\" values of MAE look like." 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "id": "peaceful-damages", 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "p = [0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]\n", 193 | "dy_percentiles = da.percentile(dy, p).compute()\n", 194 | "\n", 195 | "for i, percentile in enumerate(p):\n", 196 | " print(f\"{percentile * 100}%: {round(dy_percentiles[i], 2)}\")" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "id": "romantic-clone", 202 | "metadata": {}, 203 | "source": [ 204 | "The metrics functions from `dask-ml` match those from `scikit-learn`, but take in and return Dask collections. You can use these functions to perform model evaluation without the evaluation data or predictions needing to be pulled down to the machine running this notebook. Pretty cool, right?" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "id": "considered-holocaust", 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "from dask_ml.metrics.regression import mean_absolute_error\n", 215 | "\n", 216 | "mean_absolute_error(preds, dy)" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "id": "reduced-teddy", 222 | "metadata": {}, 223 | "source": [ 224 | "
\n", 225 | "\n", 226 | "## Next Steps\n", 227 | "\n", 228 | "Learn more: https://lightgbm.readthedocs.io/en/latest/Python-API.html#dask-api.\n", 229 | "\n", 230 | "Ask a question, report a bug, or submit a feature request: https://github.com/microsoft/LightGBM/issues.\n", 231 | "\n", 232 | "Contribute: https://github.com/microsoft/LightGBM/issues?q=is%3Aissue+is%3Aopen+label%3Adask." 233 | ] 234 | } 235 | ], 236 | "metadata": { 237 | "kernelspec": { 238 | "display_name": "Python 3 (ipykernel)", 239 | "language": "python", 240 | "name": "python3" 241 | }, 242 | "language_info": { 243 | "codemirror_mode": { 244 | "name": "ipython", 245 | "version": 3 246 | }, 247 | "file_extension": ".py", 248 | "mimetype": "text/x-python", 249 | "name": "python", 250 | "nbconvert_exporter": "python", 251 | "pygments_lexer": "ipython3", 252 | "version": "3.11.4" 253 | } 254 | }, 255 | "nbformat": 4, 256 | "nbformat_minor": 5 257 | } 258 | -------------------------------------------------------------------------------- /notebooks/testing/ranker-local.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Dask LightGBMRanker\n", 8 | "\n", 9 | "This notebook tests `lightgbm.dask.LGBMRanker`, proposed in https://github.com/microsoft/LightGBM/pull/3708." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import itertools\n", 19 | "\n", 20 | "import dask.array as da\n", 21 | "import dask.dataframe as dd\n", 22 | "import numpy as np\n", 23 | "import pandas as pd\n", 24 | "from dask.distributed import Client, LocalCluster\n", 25 | "from lightgbm.dask import DaskLGBMRanker\n", 26 | "from lightgbm.sklearn import LGBMRanker\n", 27 | "from scipy.stats import spearmanr\n", 28 | "from sklearn.utils import check_random_state" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "n_workers = 4\n", 38 | "cluster = LocalCluster(n_workers=n_workers)\n", 39 | "client = Client(cluster)\n", 40 | "client.wait_for_workers(n_workers)\n", 41 | "\n", 42 | "print(f\"View the dashboard: {cluster.dashboard_link}\")" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "def _make_ranking(\n", 52 | " n_samples=100,\n", 53 | " n_features=20,\n", 54 | " n_informative=5,\n", 55 | " gmax=1,\n", 56 | " random_gs=False,\n", 57 | " avg_gs=10,\n", 58 | " random_state=0,\n", 59 | "):\n", 60 | " \"\"\"\n", 61 | " Generate a learning-to-rank dataset - feature vectors grouped\n", 62 | " together with integer-valued graded relevance scores. 
Replace this\n", 63 | " with a sklearn.datasets function if ranking objective becomes\n", 64 | " supported in sklearn.datasets module.\n", 65 | " \"\"\"\n", 66 | " rnd_generator = check_random_state(random_state)\n", 67 | "\n", 68 | " y_vec, group_vec = np.empty((0,), dtype=int), np.empty((0,), dtype=int)\n", 69 | " gid = 0\n", 70 | "\n", 71 | " # build target, group ID vectors.\n", 72 | " relvalues = range(gmax + 1)\n", 73 | " while len(y_vec) < n_samples:\n", 74 | " gsize = avg_gs if not random_gs else rnd_generator.poisson(avg_gs)\n", 75 | " if not gsize:\n", 76 | " continue\n", 77 | "\n", 78 | " rel = rnd_generator.choice(relvalues, size=gsize, replace=True)\n", 79 | " y_vec = np.append(y_vec, rel)\n", 80 | " group_vec = np.append(group_vec, [gid] * gsize)\n", 81 | " gid += 1\n", 82 | "\n", 83 | " y_vec, group_vec = y_vec[0:n_samples], group_vec[0:n_samples]\n", 84 | "\n", 85 | " # build feature data, X. Transform first few into informative features.\n", 86 | " n_informative = max(min(n_features, n_informative), 0)\n", 87 | " x_grid = np.linspace(0, stop=1, num=gmax + 2)\n", 88 | " X = rnd_generator.uniform(size=(n_samples, n_features))\n", 89 | "\n", 90 | " # make first n_informative features values\n", 91 | " # bucketed according to relevance scores.\n", 92 | " def bucket_fn(z):\n", 93 | " return rnd_generator.uniform(x_grid[z], high=x_grid[z + 1])\n", 94 | "\n", 95 | " for j in range(n_informative):\n", 96 | " bias, coef = rnd_generator.normal(size=2)\n", 97 | " X[:, j] = bias + coef * np.apply_along_axis(bucket_fn, axis=0, arr=y_vec)\n", 98 | "\n", 99 | " return X, y_vec, group_vec\n", 100 | "\n", 101 | "\n", 102 | "def _create_ranking_data(n_samples=100, output=\"array\", chunk_size=50):\n", 103 | " X, y, g = _make_ranking(n_samples=n_samples, random_state=42)\n", 104 | " rnd = np.random.RandomState(42)\n", 105 | " w = rnd.rand(X.shape[0]) * 0.01\n", 106 | " g_rle = np.array([sum([1 for _ in grp]) for _, grp in itertools.groupby(g)])\n", 107 | "\n", 108 | " if output == \"dataframe\":\n", 109 | " # add target, weight, and group to DataFrame so that\n", 110 | " # partitions abide by group boundaries.\n", 111 | " X_df = pd.DataFrame(X, columns=[f\"feature_{i}\" for i in range(X.shape[1])])\n", 112 | " X = X_df.copy()\n", 113 | " X_df = X_df.assign(y=y, g=g, w=w)\n", 114 | "\n", 115 | " # set_index ensures partitions are based on group id.\n", 116 | " # See https://bit.ly/3pAWyNw.\n", 117 | " X_df.set_index(\"g\", inplace=True)\n", 118 | " dX = dd.from_pandas(X_df, chunksize=chunk_size)\n", 119 | "\n", 120 | " # separate target, weight from features.\n", 121 | " dy = dX[\"y\"]\n", 122 | " dw = dX[\"w\"]\n", 123 | " dX = dX.drop(columns=[\"y\", \"w\"])\n", 124 | " dg = dX.index.to_series()\n", 125 | "\n", 126 | " # encode group identifiers into run-length encoding,\n", 127 | " # the format LightGBMRanker is expecting\n", 128 | " # so that within each partition, sum(g) = n_samples.\n", 129 | " dg = dg.map_partitions(\n", 130 | " lambda p: p.groupby(\"g\", sort=False).apply(lambda z: z.shape[0])\n", 131 | " )\n", 132 | "\n", 133 | " elif output == \"array\":\n", 134 | " # ranking arrays: one chunk per group.\n", 135 | " # Each chunk must include all columns.\n", 136 | " p = X.shape[1]\n", 137 | " dX, dy, dw, dg = list(), list(), list(), list()\n", 138 | " for g_idx, rhs in enumerate(np.cumsum(g_rle)):\n", 139 | " lhs = rhs - g_rle[g_idx]\n", 140 | " dX.append(da.from_array(X[lhs:rhs, :], chunks=(rhs - lhs, p)))\n", 141 | " dy.append(da.from_array(y[lhs:rhs]))\n", 142 | " 
dw.append(da.from_array(w[lhs:rhs]))\n", 143 | " dg.append(da.from_array(np.array([g_rle[g_idx]])))\n", 144 | "\n", 145 | " dX = da.concatenate(dX, axis=0)\n", 146 | " dy = da.concatenate(dy, axis=0)\n", 147 | " dw = da.concatenate(dw, axis=0)\n", 148 | " dg = da.concatenate(dg, axis=0)\n", 149 | "\n", 150 | " else:\n", 151 | " raise ValueError(\n", 152 | " \"ranking data creation only supported for Dask arrays and dataframes\"\n", 153 | " )\n", 154 | "\n", 155 | " return X, y, w, g_rle, dX, dy, dw, dg" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "## Test with Dask array" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "X, y, w, g, dX, dy, dw, dg = _create_ranking_data(output=\"array\")" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "dg.compute()" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "dask_ranker = DaskLGBMRanker(time_out=5, seed=42, min_child_samples=1)\n", 190 | "\n", 191 | "dask_ranker = dask_ranker.fit(X=dX, y=dy, sample_weight=dw, group=dg)\n", 192 | "rnkvec_dask = dask_ranker.predict(dX)\n", 193 | "rnkvec_dask = rnkvec_dask.compute()" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "rnkvec_dask" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "local_ranker = LGBMRanker(seed=42, min_child_samples=1)\n", 212 | "local_ranker.fit(X, y, sample_weight=w, group=g)\n", 213 | "rnkvec_local = local_ranker.predict(X)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "# distributed ranker should be able to rank decently well.\n", 223 | "dcor = spearmanr(rnkvec_dask, y).correlation\n", 224 | "assert dcor > 0.6\n", 225 | "dcor" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "# relative difference between distributed ranker\n", 235 | "# and local ranker spearman corr should be small.\n", 236 | "lcor = spearmanr(rnkvec_local, y).correlation\n", 237 | "print(np.abs(dcor - lcor))\n", 238 | "assert np.abs(dcor - lcor) < 0.003" 239 | ] 240 | } 241 | ], 242 | "metadata": { 243 | "kernelspec": { 244 | "display_name": "Python 3", 245 | "language": "python", 246 | "name": "python3" 247 | }, 248 | "language_info": { 249 | "codemirror_mode": { 250 | "name": "ipython", 251 | "version": 3 252 | }, 253 | "file_extension": ".py", 254 | "mimetype": "text/x-python", 255 | "name": "python", 256 | "nbconvert_exporter": "python", 257 | "pygments_lexer": "ipython3", 258 | "version": "3.8.6" 259 | } 260 | }, 261 | "nbformat": 4, 262 | "nbformat_minor": 4 263 | } 264 | -------------------------------------------------------------------------------- /notebooks/testing/sparse-inputs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook tests `lightgbm.dask`'s behavior with sparse inputs to `pred_contrib()`." 
8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import dask.array as da\n", 17 | "import numpy as np\n", 18 | "from dask.distributed import Client, LocalCluster\n", 19 | "from lightgbm.dask import DaskLGBMClassifier\n", 20 | "from lightgbm.sklearn import LGBMClassifier\n", 21 | "from scipy.sparse import csc_matrix\n", 22 | "from sklearn.datasets import make_blobs" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "n_workers = 3\n", 32 | "cluster = LocalCluster(n_workers=n_workers)\n", 33 | "client = Client(cluster)\n", 34 | "client.wait_for_workers(n_workers)\n", 35 | "\n", 36 | "print(f\"View the dashboard: {cluster.dashboard_link}\")" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "chunk_size = 50\n", 46 | "X, y = make_blobs(n_samples=100, centers=3, random_state=42)\n", 47 | "rnd = np.random.RandomState(42)\n", 48 | "dX = da.from_array(X, chunks=(chunk_size, X.shape[1])).map_blocks(csc_matrix)\n", 49 | "dy = da.from_array(y, chunks=chunk_size)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "dask_clf = DaskLGBMClassifier(n_estimators=5, num_leaves=2, tree_learner=\"data\")\n", 59 | "dask_clf.fit(dX, dy)\n", 60 | "\n", 61 | "preds = dask_clf.predict(dX, pred_contrib=True)\n", 62 | "preds_computed = preds.compute()\n", 63 | "\n", 64 | "print(\n", 65 | " type(preds),\n", 66 | " type(preds.partitions[0].compute()),\n", 67 | " type(preds_computed),\n", 68 | " f\"{dask_clf.n_classes_} classes, {dX.shape[1]} features\",\n", 69 | ")\n", 70 | "print(\"---\")\n", 71 | "print(dX.partitions[0].compute())\n", 72 | "print(\"---\")\n", 73 | "preds.compute().shape" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "preds.partitions[0].compute()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "X = dX.compute()\n", 92 | "y = dy.compute()\n", 93 | "\n", 94 | "local_clf = LGBMClassifier()\n", 95 | "local_clf.fit(X=dX.compute(), y=y)\n", 96 | "local_preds = local_clf.predict(dX.compute().tocsc(), pred_contrib=True)\n", 97 | "\n", 98 | "print(local_clf.n_classes_, type(local_preds))\n", 99 | "print(\"---\")\n", 100 | "print(local_preds)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "local_preds[0]" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [] 118 | } 119 | ], 120 | "metadata": { 121 | "kernelspec": { 122 | "display_name": "Python 3", 123 | "language": "python", 124 | "name": "python3" 125 | }, 126 | "language_info": { 127 | "codemirror_mode": { 128 | "name": "ipython", 129 | "version": 3 130 | }, 131 | "file_extension": ".py", 132 | "mimetype": "text/x-python", 133 | "name": "python", 134 | "nbconvert_exporter": "python", 135 | "pygments_lexer": "ipython3", 136 | "version": "3.8.6" 137 | } 138 | }, 139 | "nbformat": 4, 140 | "nbformat_minor": 4 141 | } 142 | -------------------------------------------------------------------------------- /pyproject.toml: 
-------------------------------------------------------------------------------- 1 | [tool.ruff.lint] 2 | select = [ 3 | # flake8-bugbear 4 | "B", 5 | # flake8-comprehensions 6 | "C4", 7 | # pycodestyle 8 | "E", 9 | # pyflakes 10 | "F", 11 | # isort 12 | "I", 13 | # NumPy-specific rules 14 | "NPY", 15 | # pylint 16 | "PL", 17 | # flake8-return: unnecessary assignment before return 18 | "RET504", 19 | # flake8-simplify: use dict.get() instead of an if-else block 20 | "SIM401", 21 | ] 22 | 23 | [tool.ruff.lint.isort] 24 | 25 | # prevent ruff from thinking that 'lightgbm.dask' imports should 26 | # come after all others 27 | known-third-party = [ 28 | "dask", 29 | "dask_cloudprovider", 30 | "lightgbm", 31 | "pandas", 32 | "scipy", 33 | "sklearn", 34 | ] 35 | 36 | [tool.ruff.lint.per-file-ignores] 37 | "*.ipynb" = [ 38 | # (pylint) Unnecessary list() call 39 | "C408", 40 | # (pylint) too many arguments in function definition 41 | "PLR0913", 42 | # (pylint) Magic value used in comparison 43 | "PLR2004", 44 | ] 45 | "jupyter_notebook_config.py" = [ 46 | # (flake8) undefined name 47 | "F821", 48 | ] 49 | --------------------------------------------------------------------------------