├── .dockerignore ├── .github ├── dependabot.yml ├── pull_request_template.md └── workflows │ └── main.yml ├── .gitignore ├── .hadolint.yaml ├── .pre-commit-config.yaml ├── .yamllint.yaml ├── Dockerfile-cluster ├── Dockerfile-cluster-base ├── Dockerfile-notebook ├── Dockerfile-notebook-base ├── Dockerfile-profiling ├── LICENSE ├── Makefile ├── README.md ├── bin ├── install-cmake ├── profile-example-memory-usage.sh └── profile-examples.sh ├── jupyter_notebook_config.py ├── notebooks ├── README.md ├── _img │ ├── aws.svg │ ├── dask-horizontal.svg │ └── lightgbm.svg ├── demo-aws.ipynb ├── demo.ipynb └── testing │ ├── ranker-local.ipynb │ └── sparse-inputs.ipynb └── pyproject.toml /.dockerignore: -------------------------------------------------------------------------------- 1 | * 2 | !bin/install-cmake 3 | !bin/profile-examples.sh 4 | !bin/profile-example-memory-usage.sh 5 | !jupyter_notebook_config.py 6 | !LightGBM/build-python.sh 7 | !LightGBM/cmake 8 | !LightGBM/CMakeLists.txt 9 | !LightGBM/external_libs 10 | !LightGBM/include 11 | !LightGBM/lib_lightgbm.so 12 | !LightGBM/LICENSE 13 | !LightGBM/python-package 14 | !LightGBM/src 15 | !LightGBM/swig 16 | !LightGBM/VERSION.txt 17 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: 2 3 | updates: 4 | - package-ecosystem: github-actions 5 | directory: / 6 | schedule: 7 | interval: monthly 8 | # group updates in a single PR 9 | groups: 10 | ci-dependencies: 11 | patterns: 12 | - "*" 13 | commit-message: 14 | prefix: "[ci]" 15 | labels: 16 | - maintenance 17 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | ## Benefits of this work 4 | 5 | ## Notes for Reviewers 6 | 7 | ### How I tested this 8 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Continuous Integration 2 | 3 | # always run CI on new commits to any branch 4 | on: push 5 | 6 | jobs: 7 | lint: 8 | name: lint 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 12 | - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 13 | build: 14 | name: build 15 | needs: [lint] 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 20 | - name: Set up Docker Buildx 21 | uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.1.0 22 | with: 23 | # 'driver: docker' allows one build to reuse images from a prior build 24 | # ref: https://github.com/docker/setup-buildx-action/issues/251 25 | driver: docker 26 | install: true 27 | use: true 28 | - name: Build notebook image 29 | run: | 30 | make notebook-image 31 | - name: Build cluster image 32 | run: | 33 | make cluster-image 34 | - name: Build profiling image 35 | run: | 36 | make profiling-image 37 | all-tests-successful: 38 | if: always() 39 | runs-on: ubuntu-latest 40 | needs: 41 | - build 42 | - lint 43 | steps: 44 | - name: Decide whether the needed jobs succeeded or failed 45 | uses: re-actors/alls-green@05ac9388f0aebcb5727afa17fcccfecd6f8ec5fe # v1.2.2 46 | with: 
47 | jobs: ${{ toJSON(needs) }} 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.7z 2 | *.a 3 | *.bin 4 | *.buffer 5 | *.bzip 6 | *.core 7 | *.csv 8 | dask-worker-space/ 9 | *.db 10 | *.dll 11 | *.doc 12 | *.docm 13 | *.docx 14 | .DS_Store 15 | *.egg-info/ 16 | *.env 17 | *.exe 18 | .idea/ 19 | .ipynb_checkpoints/ 20 | *.json 21 | LightGBM/ 22 | .mypy_cache/ 23 | *.npy 24 | *.o 25 | *.parquet 26 | *.pem 27 | *.pkl 28 | *.ppt 29 | *.pptm 30 | *.pptx 31 | profiling-output/ 32 | *.pq 33 | *.pyc 34 | __pycache/ 35 | *.query 36 | *.rsa 37 | .ruff_cache/ 38 | *.so 39 | *.sqlite 40 | *.tar.gz 41 | *.tgz 42 | *.text 43 | *.train 44 | *.txt 45 | Untitled*.ipynb 46 | *.whl 47 | *.xls 48 | *.xlsm 49 | *.xlsx 50 | *.zip 51 | -------------------------------------------------------------------------------- /.hadolint.yaml: -------------------------------------------------------------------------------- 1 | ignored: 2 | - DL3003 # use WORKDIR instead of cd 3 | - DL3007 # do not use latest 4 | - DL3008 # pin versions in apt 5 | - DL3013 # pin versions in pip 6 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | exclude: | 3 | (?x)^( 4 | LightGBM 5 | )$ 6 | 7 | repos: 8 | - repo: https://github.com/pre-commit/pre-commit-hooks 9 | rev: v5.0.0 10 | hooks: 11 | - id: check-toml 12 | - id: end-of-file-fixer 13 | - id: trailing-whitespace 14 | - repo: https://github.com/pre-commit/mirrors-mypy 15 | rev: v1.15.0 16 | hooks: 17 | - id: mypy 18 | args: ["--config-file", "pyproject.toml"] 19 | exclude: "tests" 20 | additional_dependencies: 21 | - types-requests 22 | - repo: https://github.com/astral-sh/ruff-pre-commit 23 | # Ruff version. 24 | rev: v0.11.6 25 | hooks: 26 | # Run the linter. 27 | - id: ruff 28 | args: ["--config", "pyproject.toml", "--fix"] 29 | types_or: [jupyter, python] 30 | # Run the formatter. 31 | - id: ruff-format 32 | args: ["--config", "pyproject.toml"] 33 | types_or: [python, jupyter] 34 | - repo: https://github.com/maxwinterstein/shfmt-py 35 | rev: v3.11.0.2 36 | hooks: 37 | - id: shfmt 38 | args: ["--indent=4", "--space-redirects", "--write"] 39 | - repo: https://github.com/shellcheck-py/shellcheck-py 40 | rev: v0.10.0.1 41 | hooks: 42 | - id: shellcheck 43 | args: ["--exclude=SC2002"] 44 | - repo: https://github.com/adrienverge/yamllint 45 | rev: v1.37.0 46 | hooks: 47 | - id: yamllint 48 | - repo: https://github.com/codespell-project/codespell 49 | rev: v2.4.1 50 | hooks: 51 | - id: codespell 52 | additional_dependencies: [tomli] 53 | args: ["--toml", "pyproject.toml"] 54 | -------------------------------------------------------------------------------- /.yamllint.yaml: -------------------------------------------------------------------------------- 1 | extends: default 2 | 3 | rules: 4 | anchors: 5 | forbid-undeclared-aliases: true 6 | forbid-duplicated-anchors: true 7 | forbid-unused-anchors: true 8 | braces: 9 | forbid: false 10 | min-spaces-inside: 0 11 | # allow 1 space for jinja templating in conda recipes 12 | max-spaces-inside: 1 13 | min-spaces-inside-empty: -1 14 | max-spaces-inside-empty: -1 15 | document-start: disable 16 | line-length: 17 | max: 120 18 | truthy: 19 | allowed-values: ['false', 'true'] 20 | # having problematic value in keys is rare... 
and also 21 | # GitHub Actions' choie of 'on:' triggers this check 22 | # ref: https://github.com/adrienverge/yamllint/issues/430 23 | check-keys: false 24 | -------------------------------------------------------------------------------- /Dockerfile-cluster: -------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE=unset 2 | 3 | # hadolint ignore=DL3006 4 | FROM ${BASE_IMAGE} 5 | 6 | RUN --mount=type=bind,source=LightGBM,target=/tmp/LightGBM,rw \ 7 | <=2024.4.4' \ 31 | "distributed==${DASK_VERSION}" \ 32 | lz4 \ 33 | numpy \ 34 | 'pandas>=2.0.0' \ 35 | scikit-learn 36 | 37 | # remove unnecessary files 38 | find \ 39 | /usr/local/lib/python${PYTHON_VERSION}/site-packages \ 40 | -type f \ 41 | \( \ 42 | -name '*.c' \ 43 | -o -name '*.cc' \ 44 | -o -name '*.cpp' \ 45 | -o -name '*.h' \ 46 | -o -name '*.hpp' \ 47 | -o -wholename '*bokeh/sampledata/*' \ 48 | -o -wholename '*dask/*tests/*' \ 49 | -o -wholename '*joblib/test/*' \ 50 | -o -wholename '*llvmlite/tests/*' \ 51 | -o -wholename '*numba/*tests/*' \ 52 | -o -wholename '*numpy/*tests/*' \ 53 | -o -wholename '*pandas/tests*' \ 54 | -o -wholename '*pandas/*/tests/*' \ 55 | -o -wholename '*psutil/tests/*' \ 56 | -o -wholename 'pyarrow/_pyarrow_cpp_tests*' \ 57 | -o -wholename '*scikit-learn/tests*' \ 58 | -o -wholename '*scikit-learn/*/tests*' \ 59 | -o -wholename '*sklearn/tests*' \ 60 | -o -wholename '*sklearn/*/tests*' \ 61 | -o -wholename '*scipy/*/tests*' \ 62 | -o -wholename '*sparse/*/tests/*' \ 63 | -o -wholename '*toolz/tests/*' \ 64 | -o -wholename '*tornado/test/*' \ 65 | -o -wholename '*zict/tests/*' \ 66 | -o -wholename '*/__pycache__/*' \ 67 | \) \ 68 | -exec rm '{}' '+' 69 | 70 | find \ 71 | /usr/local/lib/python${PYTHON_VERSION}/site-packages \ 72 | -type d \ 73 | -wholename '*__pycache__*' \ 74 | -exec rm -rf '{}' '+' 75 | 76 | # clean apt-get files 77 | apt-get clean 78 | apt-get purge -y --auto-remove 79 | rm -rf /var/lib/apt/lists/* 80 | 81 | # clean other files 82 | rm -rf ~/.cache 83 | EOF 84 | -------------------------------------------------------------------------------- /Dockerfile-notebook: -------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE=unset 2 | 3 | # hadolint ignore=DL3006 4 | FROM ${BASE_IMAGE} 5 | 6 | COPY jupyter_notebook_config.py /root/.jupyter/jupyter_notebook_config.py 7 | 8 | RUN --mount=type=bind,source=LightGBM,target=/tmp/LightGBM,rw \ 9 | <=2.5.0' \ 29 | blosc \ 30 | bokeh \ 31 | "dask==${DASK_VERSION}" \ 32 | 'dask-cloudprovider[aws]>=2022.10.0' \ 33 | 'dask-ml>=2023.3.24' \ 34 | "distributed==${DASK_VERSION}" \ 35 | 'jupyterlab>=4.0.2' \ 36 | lz4 \ 37 | numpy \ 38 | 'pandas>=2.0.0' \ 39 | scikit-learn 40 | 41 | # remove unnecessary files 42 | find \ 43 | /usr/local/lib/python${PYTHON_VERSION}/site-packages \ 44 | -type f \ 45 | \( \ 46 | -name '*.c' \ 47 | -o -name '*.cc' \ 48 | -o -name '*.cpp' \ 49 | -o -name '*.h' \ 50 | -o -name '*.hpp' \ 51 | -o -wholename '*bokeh/sampledata/*' \ 52 | -o -wholename '*dask/*tests/*' \ 53 | -o -wholename '*joblib/test/*' \ 54 | -o -wholename '*llvmlite/tests/*' \ 55 | -o -wholename '*numba/*tests/*' \ 56 | -o -wholename '*numpy/*tests/*' \ 57 | -o -wholename '*pandas/tests*' \ 58 | -o -wholename '*pandas/*/tests/*' \ 59 | -o -wholename '*psutil/tests/*' \ 60 | -o -wholename 'pyarrow/_pyarrow_cpp_tests*' \ 61 | -o -wholename '*scikit-learn/tests*' \ 62 | -o -wholename '*scikit-learn/*/tests*' \ 63 | -o -wholename '*sklearn/tests*' \ 64 | -o -wholename 
'*sklearn/*/tests*' \ 65 | -o -wholename '*scipy/*/tests*' \ 66 | -o -wholename '*sparse/*/tests/*' \ 67 | -o -wholename '*toolz/tests/*' \ 68 | -o -wholename '*tornado/test/*' \ 69 | -o -wholename '*zict/tests/*' \ 70 | -o -wholename '*/__pycache__/*' \ 71 | \) \ 72 | -exec rm '{}' '+' 73 | 74 | find \ 75 | /usr/local/lib/python${PYTHON_VERSION}/site-packages \ 76 | -type d \ 77 | -wholename '*__pycache__*' \ 78 | -exec rm -rf '{}' '+' 79 | 80 | # clean apt-get files 81 | apt-get clean 82 | apt-get purge -y --auto-remove 83 | rm -rf /var/lib/apt/lists/* 84 | 85 | # clean other files 86 | rm -rf ~/.cache 87 | EOF 88 | 89 | ENTRYPOINT ["jupyter", "lab", "--ip=0.0.0.0", "--allow-root", "--port=8888"] 90 | -------------------------------------------------------------------------------- /Dockerfile-profiling: -------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE=unset 2 | 3 | # hadolint ignore=DL3006 4 | FROM ${BASE_IMAGE} 5 | 6 | RUN < /dev/null); then \ 34 | if test ${FORCE_REBUILD} -le 0; then \ 35 | echo "image '${CLUSTER_BASE_IMAGE}' already exists. To force rebuilding, run 'make cluster-base-image -e FORCE_REBUILD=1'."; \ 36 | exit 0; \ 37 | fi; \ 38 | fi; \ 39 | docker buildx build \ 40 | --build-arg DASK_VERSION=${DASK_VERSION} \ 41 | --build-arg PYTHON_VERSION=${PYTHON_VERSION} \ 42 | --load \ 43 | --output type=docker \ 44 | -t ${CLUSTER_BASE_IMAGE} \ 45 | -f ./Dockerfile-cluster-base \ 46 | . 47 | echo "--- docker images ---" 48 | docker images 49 | 50 | .PHONY: cluster-image 51 | cluster-image: cluster-base-image $(LIB_LIGHTGBM) 52 | docker buildx build \ 53 | --build-arg BASE_IMAGE=${CLUSTER_BASE_IMAGE} \ 54 | --load \ 55 | --output type=docker \ 56 | -t ${CLUSTER_IMAGE} \ 57 | -f ./Dockerfile-cluster \ 58 | . 59 | 60 | .PHONY: create-repo 61 | create-repo: ecr-details.json 62 | 63 | .PHONY: delete-repo 64 | delete-repo: 65 | aws --region ${AWS_REGION} \ 66 | ecr-public batch-delete-image \ 67 | --repository-name ${CLUSTER_IMAGE_NAME} \ 68 | --image-ids imageTag=${IMAGE_TAG} 69 | aws --region ${AWS_REGION} \ 70 | ecr-public delete-repository \ 71 | --repository-name ${CLUSTER_IMAGE_NAME} 72 | rm -f ./ecr-details.json 73 | 74 | ecr-details.json: 75 | aws --region ${AWS_REGION} \ 76 | ecr-public create-repository \ 77 | --repository-name ${CLUSTER_IMAGE_NAME} \ 78 | > ./ecr-details.json 79 | 80 | $(LIGHTGBM_REPO): 81 | git clone --recursive https://github.com/microsoft/LightGBM.git 82 | 83 | $(LIB_LIGHTGBM): $(LIGHTGBM_REPO) 84 | make notebook-base-image 85 | docker run \ 86 | --rm \ 87 | -v $$(pwd)/LightGBM:/opt/LightGBM \ 88 | --workdir=/opt/LightGBM \ 89 | --entrypoint="" \ 90 | -i ${NOTEBOOK_BASE_IMAGE} \ 91 | /bin/bash -cex \ 92 | "rm -rf ./build && cmake -B build -S . 
&& cmake --build build --target _lightgbm -j2" 93 | 94 | .PHONY: lightgbm-unit-tests 95 | lightgbm-unit-tests: 96 | docker run \ 97 | --rm \ 98 | -v $$(pwd)/LightGBM:/opt/LightGBM \ 99 | --workdir=/opt/LightGBM \ 100 | --entrypoint="" \ 101 | -i ${CLUSTER_IMAGE} \ 102 | /bin/bash -cex \ 103 | "sh ./build-python.sh install --precompile && pip install pytest && pytest -vv -rA tests/python_package_test/test_dask.py" 104 | 105 | .PHONY: lint-dockerfiles 106 | lint-dockerfiles: 107 | for dockerfile in $$(ls | grep -E '^Dockerfile'); do \ 108 | echo "linting $${dockerfile}" && \ 109 | docker run \ 110 | --rm \ 111 | -v $$(pwd)/.hadolint.yaml:/.config/hadolint.yaml \ 112 | -i \ 113 | hadolint/hadolint \ 114 | < $${dockerfile} || exit 1; \ 115 | done 116 | 117 | .PHONY: notebook-base-image 118 | notebook-base-image: 119 | @if $$(docker image inspect ${NOTEBOOK_BASE_IMAGE} > /dev/null); then \ 120 | if test ${FORCE_REBUILD} -le 0; then \ 121 | echo "image '${NOTEBOOK_BASE_IMAGE}' already exists. To force rebuilding, run 'make notebook-base-image -e FORCE_REBUILD=1'."; \ 122 | exit 0; \ 123 | fi; \ 124 | fi; \ 125 | docker buildx build \ 126 | --build-arg DASK_VERSION=${DASK_VERSION} \ 127 | --build-arg PYTHON_VERSION=${PYTHON_VERSION} \ 128 | --load \ 129 | --output type=docker \ 130 | -t ${NOTEBOOK_BASE_IMAGE} \ 131 | -f ./Dockerfile-notebook-base \ 132 | . 133 | 134 | .PHONY: notebook-image 135 | notebook-image: notebook-base-image $(LIB_LIGHTGBM) 136 | docker buildx build \ 137 | --build-arg BASE_IMAGE=${NOTEBOOK_BASE_IMAGE} \ 138 | --load \ 139 | --output type=docker \ 140 | -t ${NOTEBOOK_IMAGE} \ 141 | -f ./Dockerfile-notebook \ 142 | . 143 | 144 | .PHONY: profile 145 | profile: profiling-image 146 | docker run \ 147 | --rm \ 148 | -p 8080:8080 \ 149 | --env LIGHTGBM_HOME=/opt/LightGBM \ 150 | --env PROFILING_OUTPUT_DIR=/profiling-output \ 151 | -v $$(pwd)/profiling-output:/profiling-output \ 152 | -v $$(pwd)/LightGBM:/opt/LightGBM \ 153 | --workdir=/opt/LightGBM \ 154 | --entrypoint="" \ 155 | -i ${PROFILING_IMAGE} \ 156 | /bin/bash -cex \ 157 | '/bin/bash /usr/local/bin/profile-examples.sh && python -m snakeviz /profiling-output/ --hostname 0.0.0.0 --server' 158 | 159 | .PHONY: profiling-image 160 | profiling-image: cluster-image 161 | @if $$(docker image inspect ${PROFILING_IMAGE} > /dev/null); then \ 162 | if test ${FORCE_REBUILD_PROFILING_IMAGE} -le 0; then \ 163 | echo "image '${PROFILING_IMAGE}' already exists. To force rebuilding, run 'make profiling-image -e FORCE_REBUILD_PROFILING_IMAGE=1'."; \ 164 | exit 0; \ 165 | fi; \ 166 | fi && \ 167 | docker buildx build \ 168 | --build-arg BASE_IMAGE=${CLUSTER_IMAGE} \ 169 | --load \ 170 | --output type=docker \ 171 | -t ${PROFILING_IMAGE} \ 172 | -f ./Dockerfile-profiling \ 173 | . 
174 | 175 | .PHONY: profile-memory-usage 176 | profile-memory-usage: profiling-image 177 | docker run \ 178 | --rm \ 179 | --env LIGHTGBM_HOME=/opt/LightGBM \ 180 | --env PROFILING_OUTPUT_DIR=/profiling-output/memory-usage \ 181 | -v $$(pwd)/profiling-output:/profiling-output \ 182 | -v $$(pwd)/LightGBM:/opt/LightGBM \ 183 | --workdir=/opt/LightGBM \ 184 | --entrypoint="" \ 185 | -i ${PROFILING_IMAGE} \ 186 | /bin/bash -cex \ 187 | '/bin/bash /usr/local/bin/profile-example-memory-usage.sh' 188 | 189 | # https://docs.amazonaws.cn/en_us/AmazonECR/latest/public/docker-push-ecr-image.html 190 | .PHONY: push-image 191 | push-image: create-repo 192 | aws ecr-public get-login-password \ 193 | --region ${AWS_REGION} \ 194 | | docker login \ 195 | --username AWS \ 196 | --password-stdin public.ecr.aws 197 | docker tag \ 198 | ${CLUSTER_IMAGE_NAME}:${IMAGE_TAG} \ 199 | $$(cat ./ecr-details.json | jq .'repository'.'repositoryUri' | tr -d '"'):${IMAGE_TAG} 200 | docker push \ 201 | $$(cat ./ecr-details.json | jq .'repository'.'repositoryUri' | tr -d '"'):${IMAGE_TAG} 202 | 203 | # NOTE: IMAGE_TAG is in the environment here so the AWS notebooks 204 | # know what image to use for the Dask cluster 205 | .PHONY: start-notebook 206 | start-notebook: 207 | docker run \ 208 | --rm \ 209 | -v $$(pwd):/root/testing \ 210 | --env AWS_ACCESS_KEY_ID=$${AWS_ACCESS_KEY_ID:-notset} \ 211 | --env AWS_DEFAULT_REGION=${AWS_REGION} \ 212 | --env AWS_SECRET_ACCESS_KEY=$${AWS_SECRET_ACCESS_KEY:-notset} \ 213 | --env IMAGE_TAG=${IMAGE_TAG} \ 214 | -p 8888:8888 \ 215 | -p 8787:8787 \ 216 | --name ${NOTEBOOK_CONTAINER_NAME} \ 217 | ${NOTEBOOK_IMAGE} 218 | 219 | .PHONY: stop-notebook 220 | stop-notebook: 221 | @docker kill ${NOTEBOOK_CONTAINER_NAME} 222 | @docker rm ${NOTEBOOK_CONTAINER_NAME} 223 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Testing `lightgbm.dask` 2 | 3 | [![GitHub Actions](https://github.com/jameslamb/lightgbm-dask-testing/actions/workflows/main.yml/badge.svg?branch=main)](https://github.com/jameslamb/lightgbm-dask-testing/actions/workflows/main.yml) 4 | 5 | This repository can be used to test and develop changes to LightGBM's Dask integration. 6 | It contains the following useful features: 7 | 8 | * `make` recipes for building a local development image with `lightgbm` installed from a local copy, and Jupyter Lab running for interactive development 9 | * Jupyter notebooks for testing `lightgbm.dask` against a `LocalCluster` (multi-worker, single-machine) and a `dask_cloudprovider.aws.FargateCluster` (multi-worker, multi-machine) 10 | * `make` recipes for publishing a custom container image to ECR Public repository, for use with AWS Fargate 11 | 12 |
13 | 14 | **Contents** 15 | 16 | - [Getting Started](#getting-started) 17 | - [Develop in Jupyter](#develop-in-jupyter) 18 | - [Test with a LocalCluster](#test-with-a-localcluster) 19 | - [Test with a FargateCluster](#test-with-a-fargatecluster) 20 | - [Run LightGBM unit tests](#run-lightgbm-unit-tests) 21 | - [Profile LightGBM code](#profile-lightgbm-code) 22 | - [runtime profiling](#runtime-profiling) 23 | 24 | ## Getting Started 25 | 26 | To begin, clone a copy of LightGBM to a folder `LightGBM` at the root of this repo. 27 | You can do this however you want, for example: 28 | 29 | ```shell 30 | git clone \ 31 | --recursive \ 32 | git@github.com:microsoft/LightGBM.git \ 33 | ./LightGBM 34 | ``` 35 | 36 | If you're developing a reproducible example for [an issue](https://github.com/microsoft/LightGBM/issues) or you're testing a potential [pull request](https://github.com/microsoft/LightGBM/pulls), you probably want to clone LightGBM from your fork instead of from the main repo. 37 | 38 |
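If you go the fork route, a minimal sketch looks like the following. Here `your-username` is a hypothetical placeholder for your GitHub account, and adding the `upstream` remote is optional but convenient for pulling in new changes from the main repo.

```shell
# clone your fork (replace 'your-username' with your GitHub account)
git clone \
    --recursive \
    git@github.com:your-username/LightGBM.git \
    ./LightGBM

# optionally, track the main repo as 'upstream' so you can rebase on its new changes
cd ./LightGBM
git remote add upstream https://github.com/microsoft/LightGBM.git
git fetch upstream
cd ..
```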
39 | 40 | ## Develop in Jupyter 41 | 42 | This section describes how to test a version of LightGBM in Jupyter. 43 | 44 | #### 1. Build the notebook image 45 | 46 | Run the following to build an image that includes `lightgbm`, all its dependencies, and a JupyterLab setup. 47 | 48 | ```shell 49 | make notebook-image 50 | ``` 51 | 52 | The first time you run this, it will take a few minutes as this project needs to build a base image with LightGBM's dependencies and needs to compile the LightGBM C++ library. 53 | 54 | Every time after that, `make notebook-image` should run very quickly. 55 | 56 | #### 2. Run a notebook locally 57 | 58 | Start up Jupyter Lab! 59 | This command will run Jupyter Lab in a container using the image you built with `make notebook-image`. 60 | 61 | ```shell 62 | make start-notebook 63 | ``` 64 | 65 | Navigate to `http://127.0.0.1:8888/lab` in your web browser. 66 | 67 | The command `make start-notebook` mounts your current working directory into the running container. 68 | That means that even though Jupyter Lab is running inside the container, changes that you make in it will be saved on your local filesystem even after you shut the container down. 69 | So you can edit and create notebooks and other code in there with confidence! 70 | 71 | When you're done with the notebook, stop the container by running the following from another shell: 72 | 73 | ```shell 74 | make stop-notebook 75 | ``` 76 | 77 |
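At any point, a quick sanity check that the image's `lightgbm` (installed from your local `LightGBM/` checkout when the image was built) imports cleanly is to run a cell like the following in Jupyter Lab. This is a minimal sketch; it assumes nothing beyond what the image already installs.

```python
# run this in a notebook cell inside the container
import lightgbm
from lightgbm.dask import DaskLGBMClassifier, DaskLGBMRanker, DaskLGBMRegressor

print(f"lightgbm version: {lightgbm.__version__}")

# confirm the Dask estimators are importable from this build
for est in (DaskLGBMClassifier, DaskLGBMRanker, DaskLGBMRegressor):
    print(est.__name__)
```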
78 | 79 | ## Test with a `LocalCluster` 80 | 81 | To test `lightgbm.dask` on a `LocalCluster`, run the steps in ["Develop in Jupyter"](#develop-in-jupyter), then try out [`demo.ipynb`](./notebooks/demo.ipynb) or your own notebooks. 82 | 83 |
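For reference, the core of that notebook condenses to roughly the following sketch of `lightgbm.dask` against a `LocalCluster` (parameters are illustrative, not tuned):

```python
import dask.array as da
from dask.distributed import Client, LocalCluster
from lightgbm.dask import DaskLGBMRegressor
from sklearn.datasets import make_regression

# start a local "cluster" of 3 worker processes
cluster = LocalCluster(n_workers=3)
client = Client(cluster)
client.wait_for_workers(3)

# generate a toy regression dataset and wrap it in Dask Arrays
X, y = make_regression(n_samples=10_000, random_state=42)
dX = da.from_array(X, chunks=(1_000, X.shape[1]))
dy = da.from_array(y, chunks=1_000)

# train a distributed LightGBM regressor, then predict (lazily)
dask_reg = DaskLGBMRegressor(client=client, n_estimators=100, tree_learner="data")
dask_reg.fit(X=dX, y=dy)
preds = dask_reg.predict(dX)
print(preds.compute()[:5])
```

Everything above runs inside the notebook container, so no AWS resources are needed.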
84 | 85 | ## Test with a `FargateCluster` 86 | 87 | There are some problems with Dask code which only arise in a truly distributed, multi-machine setup. 88 | To test for these sorts of issues, I like to use [`dask-cloudprovider`](https://github.com/dask/dask-cloudprovider). 89 | 90 | The steps below describe how to test a local copy of LightGBM on a `FargateCluster` from `dask-cloudprovider`. 91 | 92 | #### 1. Build the cluster image 93 | 94 | Build an image that can be used for the scheduler and works in the Dask cluster you'll create on AWS Fargate. 95 | This image will have your local copy of LightGBM installed in it. 96 | 97 | ```shell 98 | make cluster-image 99 | ``` 100 | 101 | #### 2. Install and configure the AWS CLI 102 | 103 | For the rest of the steps in this section, you'll need access to AWS resources. 104 | To begin, install the AWS CLI if you don't already have it. 105 | 106 | ```shell 107 | pip install --upgrade awscli 108 | ``` 109 | 110 | Next, configure your shell to make authenticated requests to AWS. 111 | If you've never done this, you can see [the AWS CLI docs](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html). 112 | 113 | The rest of this section assumes that the shell variables `AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` have been set. 114 | 115 | I like to set these by keeping them in a file 116 | 117 | ```text 118 | # file: aws.env 119 | AWS_SECRET_ACCESS_KEY=your-key-here 120 | AWS_ACCESS_KEY_ID=your-access-key-id-here 121 | ``` 122 | 123 | and then sourcing that file 124 | 125 | ```shell 126 | set -o allexport 127 | source aws.env 128 | set +o allexport 129 | ``` 130 | 131 | #### 3. Push the cluster image to ECR 132 | 133 | To use the cluster image in the containers you spin up on Fargate, it has to be available in a container registry. 134 | This project uses the free AWS Elastic Container Registry (ECR) Public. 135 | For more information on ECR Public, see [the AWS docs](https://docs.amazonaws.cn/en_us/AmazonECR/latest/public/docker-push-ecr-image.html). 136 | 137 | The command below will create a new repository on ECR Public, store the details of that repository in a file `ecr-details.json`, and push the cluster image to it. 138 | The cluster image will not contain your credentials, notebooks, or other local files. 139 | 140 | ```shell 141 | make push-image 142 | ``` 143 | 144 | This may take a few minutes to complete. 145 | 146 | #### 4. Run the AWS notebook 147 | 148 | Follow the steps in ["Develop in Jupyter"](#develop-in-jupyter) to get a local Jupyter Lab running. 149 | Open [`demo-aws.ipynb`](./notebooks/demo-aws.ipynb). 150 | That notebook contains sample code that uses `dask-cloudprovider` to provision a Dask cluster on AWS Fargate. 151 | 152 | You can view the cluster's current state and its logs by navigating to the Elastic Container Service (ECS) section of the AWS console. 153 | 154 | #### 5. Clean Up 155 | 156 | As you work on whatever experiment you're doing, you'll probably find yourself wanting to repeat these steps multiple times. 157 | 158 | To remove the image you pushed to ECR Public and the repository you created there, run the following 159 | 160 | ```shell 161 | make delete-repo 162 | ``` 163 | 164 | Then, repeat the steps above to rebuild your images and test again. 165 | 166 |
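Between iterations, it can help to double-check which ECR Public repository the notebooks will pull the cluster image from. `make push-image` reads the repository URI out of `ecr-details.json` with `jq`, and `demo-aws.ipynb` reads the same file, so you can inspect it by hand (a small sketch, assuming `jq` is installed on your machine):

```shell
# print the ECR Public repository URI recorded by 'make create-repo' / 'make push-image'
# ('make delete-repo' removes ecr-details.json when it tears the repository down)
jq -r '.repository.repositoryUri' ./ecr-details.json
```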
167 | 168 | ## Run LightGBM unit tests 169 | 170 | This repo makes it easy to run `lightgbm`'s Dask unit tests in a containerized setup. 171 | 172 | ```shell 173 | make lightgbm-unit-tests 174 | ``` 175 | 176 | Pass variable `DASK_VERSION` to use a different version of `dask` / `distributed`. 177 | 178 | ```shell 179 | make lightgbm-unit-tests \ 180 | -e DASK_VERSION=2024.12.0 181 | ``` 182 | 183 | ## Profile LightGBM code 184 | 185 | ### runtime profiling 186 | 187 | To try to identify expensive parts of the code path for `lightgbm`, you can run its examples under `cProfile` ([link](https://docs.python.org/3/library/profile.html)) and then visualize those profiling results with `snakeviz` ([link](https://jiffyclub.github.io/snakeviz/)). 188 | 189 | ```shell 190 | make profile 191 | ``` 192 | 193 | Then navigate to `http://0.0.0.0:8080/snakeviz/%2Fprofiling-output` in your web browser. 194 | 195 | ### memory profiling 196 | 197 | To summarize memory allocations in typical uses of LightGBM, and to attribute those memory allocations to particular codepaths, you can run its examples under `memray` ([link](https://github.com/bloomberg/memray)). 198 | 199 | ```shell 200 | make profile-memory-usage 201 | ``` 202 | 203 | That will generate a bunch of HTML files. 204 | View them in your browser by running the following, then navigating to `localhost:1234`. 205 | 206 | ```shell 207 | python -m http.server \ 208 | --directory ./profiling-output/memory-usage \ 209 | 1234 210 | ``` 211 | 212 | ## Useful Links 213 | 214 | * https://github.com/microsoft/LightGBM/pull/3515 215 | * https://docs.aws.amazon.com/cli/latest/reference/ecr-public/ 216 | * https://docs.amazonaws.cn/en_us/AmazonECR/latest/public/docker-push-ecr-image.html 217 | * https://github.com/dask/dask-docker 218 | * https://docs.aws.amazon.com/AmazonECR/latest/public/public-registries.html 219 | -------------------------------------------------------------------------------- /bin/install-cmake: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e -u -o pipefail 4 | 5 | CMAKE_VERSION=${1} 6 | 7 | install_script="cmake-${CMAKE_VERSION}-linux-$(arch).sh" 8 | 9 | curl -O -L \ 10 | "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${install_script}" 11 | 12 | mkdir /opt/cmake 13 | sh "${install_script}" \ 14 | --skip-license \ 15 | --prefix=/opt/cmake 16 | 17 | rm "./${install_script}" 18 | 19 | ln -sf /opt/cmake/bin/cmake /usr/local/bin/cmake 20 | -------------------------------------------------------------------------------- /bin/profile-example-memory-usage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # [description] 4 | # 5 | # Profile memory usage of all of LightGBM's Python examples, using memray. 
6 | 7 | set -e -u -o pipefail 8 | 9 | echo "profiling examples" 10 | mkdir -p "${PROFILING_OUTPUT_DIR}/bin" 11 | 12 | # shellcheck disable=SC2044 13 | for py_script in $(find "${LIGHTGBM_HOME}/examples/python-guide" -name '*.py'); do 14 | base_filename=$(basename "${py_script}") 15 | prof_file="${base_filename/.py/.bin}" 16 | table_file="${base_filename/.py/-table.html}" 17 | leak_table_file="${base_filename/.py/-leak-table.html}" 18 | flamegraph_file="${base_filename/.py/-flamegraph.html}" 19 | echo " - ${base_filename}" 20 | memray run \ 21 | -o "${PROFILING_OUTPUT_DIR}/bin/${prof_file}" \ 22 | "${py_script}" > /dev/null 2>&1 || 23 | true 24 | memray table \ 25 | -o "${PROFILING_OUTPUT_DIR}/${table_file}" \ 26 | --force \ 27 | "${PROFILING_OUTPUT_DIR}/bin/${prof_file}" 28 | memray table \ 29 | -o "${PROFILING_OUTPUT_DIR}/${leak_table_file}" \ 30 | --force \ 31 | --leaks \ 32 | "${PROFILING_OUTPUT_DIR}/bin/${prof_file}" 33 | memray flamegraph \ 34 | -o "${PROFILING_OUTPUT_DIR}/${flamegraph_file}" \ 35 | --force \ 36 | "${PROFILING_OUTPUT_DIR}/bin/${prof_file}" 37 | done 38 | echo "Done profiling examples. See '${PROFILING_OUTPUT_DIR}' for results." 39 | -------------------------------------------------------------------------------- /bin/profile-examples.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # [description] 4 | # 5 | # Profile all of LightGBM's Python examples, using cProfile. 6 | 7 | set -e -u -o pipefail 8 | 9 | echo "profiling examples" 10 | # shellcheck disable=SC2044 11 | for py_script in $(find "${LIGHTGBM_HOME}/examples/python-guide" -name '*.py'); do 12 | base_filename=$(basename "${py_script}") 13 | prof_file="${base_filename/.py/.prof}" 14 | echo " - ${base_filename}" 15 | python \ 16 | -Wignore \ 17 | -m cProfile \ 18 | -o "${PROFILING_OUTPUT_DIR}/${prof_file}" \ 19 | "${py_script}" > /dev/null 2>&1 || 20 | true 21 | done 22 | echo "Done profiling examples. See '${PROFILING_OUTPUT_DIR}' for results." 23 | -------------------------------------------------------------------------------- /jupyter_notebook_config.py: -------------------------------------------------------------------------------- 1 | # mypy: disable-error-code="name-defined" 2 | c.ServerApp.token = "" 3 | c.ServerApp.password = "" 4 | c.ServerApp.open_browser = False 5 | c.ServerApp.ip = "localhost" 6 | -------------------------------------------------------------------------------- /notebooks/README.md: -------------------------------------------------------------------------------- 1 | # notebooks 2 | 3 | This directory contains notebooks used to test `lightgbm.dask`. 4 | 5 | The following notebooks can be used as tutorials for running machine learning workflows with LightGBM using Dask. 6 | 7 | * [demo.ipynb](./demo.ipynb) - Minimal example of training a regression model on a `LocalCluster`. 8 | * [demo-aws.ipynb](./demo-aws.ipynb) - Minimal example of training a regression model on AWS Fargate, using `dask-cloudprovider` 9 | 10 | ## Other notebooks in this section 11 | 12 | `testing/` contains random notebooks used to test pull requests and issues on LightGBM. Everything in that folder should be considered temporary and experimental. 
13 | -------------------------------------------------------------------------------- /notebooks/_img/aws.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 9 | 10 | 31 | 32 | 34 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /notebooks/_img/dask-horizontal.svg: -------------------------------------------------------------------------------- 1 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | Dask 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /notebooks/_img/lightgbm.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 8 | 9 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /notebooks/demo-aws.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "tribal-xerox", 6 | "metadata": {}, 7 | "source": [ 8 | "# LightGBM + Dask\n", 9 | "\n", 10 | "\n", 11 | " \n", 12 | " \n", 15 | " \n", 18 | " \n", 21 | " \n", 22 | "
\n", 13 | " \n", 14 | " \n", 16 | " \n", 17 | " \n", 19 | " \n", 20 | "
\n", 23 | "\n", 24 | "This notebook shows how to use `lightgbm.dask` to train a LightGBM model on data stored as a [Dask Array](https://docs.dask.org/en/latest/array.html). It uses `FargateCluster` from [`dask-cloudprovider`](https://github.com/dask/dask-cloudprovider) to create a distributed cluster running on [AWS Fargate](https://aws.amazon.com/fargate/).\n", 25 | "\n", 26 | "To explore other topics in greater depth, see the other notebooks." 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "id": "amino-hunger", 32 | "metadata": {}, 33 | "source": [ 34 | "
\n", 35 | "\n", 36 | "## Set up a Dask cluster on AWS Fargate\n", 37 | "\n", 38 | "Before running any of the code in the notebook, follow the instructions in [\"Test with a FargateCluster\"](../README.md##test-with-a-fargatecluster)." 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "id": "expanded-declaration", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "import json\n", 49 | "import os\n", 50 | "\n", 51 | "with open(\"../ecr-details.json\", \"r\") as f:\n", 52 | " ecr_details = json.loads(f.read())\n", 53 | "\n", 54 | "IMAGE_REPO = ecr_details[\"repository\"][\"repositoryUri\"]\n", 55 | "IMAGE_TAG = os.environ[\"IMAGE_TAG\"]\n", 56 | "IMAGE_URI = f\"{IMAGE_REPO}:{IMAGE_TAG}\"\n", 57 | "print(f\"scheduler and worker image: {IMAGE_URI}\")" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "id": "harmful-bosnia", 63 | "metadata": {}, 64 | "source": [ 65 | "Before proceeding, set up your AWS credentials. If you're unsure how to do this, see [the AWS docs](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html).\n", 66 | "\n", 67 | "Next, determine the CPU architecture of the machine you're running on.\n", 68 | "This project builds single-architecture container images matching the host system, so it's important\n", 69 | "to use the same CPU architecture on AWS Fargate." 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "id": "4a10d61c-5251-46a7-9f16-bd6eef606a82", 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "import platform\n", 80 | "\n", 81 | "if platform.machine().lower() in {\"aarch64\", \"arm64\"}:\n", 82 | " cpu_architecture = \"ARM64\"\n", 83 | "else:\n", 84 | " cpu_architecture = \"X86_64\"" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "id": "complicated-little", 90 | "metadata": {}, 91 | "source": [ 92 | "Create a cluster with 3 workers. See https://cloudprovider.dask.org/en/latest/aws.html#dask_cloudprovider.aws.FargateCluster for more options." 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "id": "respective-collect", 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "from dask.distributed import Client\n", 103 | "from dask_cloudprovider.aws import FargateCluster\n", 104 | "\n", 105 | "n_workers = 3\n", 106 | "cluster = FargateCluster(\n", 107 | " image=IMAGE_URI,\n", 108 | " cpu_architecture=cpu_architecture,\n", 109 | " worker_cpu=512,\n", 110 | " worker_mem=4096,\n", 111 | " n_workers=n_workers,\n", 112 | " fargate_use_private_ip=False,\n", 113 | " scheduler_timeout=\"40 minutes\",\n", 114 | ")\n", 115 | "client = Client(cluster)\n", 116 | "client.wait_for_workers(n_workers)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "id": "raising-mauritius", 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "print(f\"View the dashboard: {cluster.dashboard_link}\")" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "id": "radical-composition", 132 | "metadata": {}, 133 | "source": [ 134 | "Click the link above to view a diagnostic dashboard while you run the training code below." 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "id": "modified-lincoln", 140 | "metadata": {}, 141 | "source": [ 142 | "
\n", 143 | "\n", 144 | "## Get some training data\n", 145 | "\n", 146 | "This example uses `sklearn.datasets.make_regression()` to generate a dataset in `numpy` format, then uses `dask.Array.from_array()` to turn that into a Dask Array.\n", 147 | "\n", 148 | "That's just done for convenience. `lightgbm.dask` just expects that your data are Dask Arrays or Dask DataFrames." 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "id": "structural-street", 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "import dask.array as da\n", 159 | "from sklearn.datasets import make_regression\n", 160 | "\n", 161 | "X, y = make_regression(n_samples=10000, random_state=42)\n", 162 | "dX = da.from_array(X, chunks=(1000, X.shape[1]))\n", 163 | "dy = da.from_array(y, chunks=1000)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "id": "unavailable-future", 169 | "metadata": {}, 170 | "source": [ 171 | "Right now, the Dask Arrays `data` and `labels` are lazy. Before training, you can force the cluster to compute them by running `.persist()` and then wait for that computation to finish by `wait()`-ing on them.\n", 172 | "\n", 173 | "Doing this is optional, but it will make data loading a one-time cost so subsequent runs are fast." 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "id": "quiet-nicaragua", 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "from dask.distributed import wait\n", 184 | "\n", 185 | "dX = dX.persist()\n", 186 | "dy = dy.persist()\n", 187 | "_ = wait([dX, dy])" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "id": "acoustic-corner", 193 | "metadata": {}, 194 | "source": [ 195 | "
\n", 196 | "\n", 197 | "## Train a model\n", 198 | "\n", 199 | "With the data set up on the workers, train a model. `lightgbm.dask.DaskLGBMRegressor` has an interface that tries to stay as close as possible to the non-Dask scikit-learn interface to LightGBM (`lightgbm.sklearn.LGBMRegressor`)." 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "id": "pleased-brunei", 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "from lightgbm.dask import DaskLGBMRegressor\n", 210 | "\n", 211 | "dask_reg = DaskLGBMRegressor(\n", 212 | " client=client,\n", 213 | " max_depth=5,\n", 214 | " objective=\"regression_l1\",\n", 215 | " learning_rate=0.1,\n", 216 | " tree_learner=\"data\",\n", 217 | " n_estimators=100,\n", 218 | " min_child_samples=1,\n", 219 | ")\n", 220 | "\n", 221 | "dask_reg.fit(\n", 222 | " X=dX,\n", 223 | " y=dy,\n", 224 | ")" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "id": "designed-kidney", 230 | "metadata": {}, 231 | "source": [ 232 | "
\n", 233 | "\n", 234 | "## Evaluate the model\n", 235 | "\n", 236 | "The `.predict()` method takes in a Dask collection and returns a Dask Array." 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "id": "flexible-constitutional", 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "preds = dask_reg.predict(dX)\n", 247 | "print(str(preds))" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "id": "funny-trademark", 253 | "metadata": {}, 254 | "source": [ 255 | "Before calculating the mean absolute error (MAE) of these predictions, compute some summary statistics on the target variable. This is necessary to understand what \"good\" values of MAE look like." 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "id": "cross-mistake", 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "p = [0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]\n", 266 | "dy_percentiles = da.percentile(dy, p).compute()\n", 267 | "\n", 268 | "for i, percentile in enumerate(p):\n", 269 | " print(f\"{percentile * 100}%: {round(dy_percentiles[i], 2)}\")" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "id": "offensive-switch", 275 | "metadata": {}, 276 | "source": [ 277 | "The metrics functions from `dask-ml` match those from `scikit-learn`, but take in and return Dask collections. You can use these functions to perform model evaluation without the evaluation data or predictions needing to be pulled down to the machine running this notebook. Pretty cool, right?" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "id": "hybrid-greece", 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "from dask_ml.metrics.regression import mean_absolute_error\n", 288 | "\n", 289 | "mean_absolute_error(preds, dy)" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "id": "outer-region", 295 | "metadata": {}, 296 | "source": [ 297 | "## Next Steps\n", 298 | "\n", 299 | "Learn more: https://lightgbm.readthedocs.io/en/latest/Python-API.html#dask-api.\n", 300 | "\n", 301 | "Ask a question, report a bug, or submit a feature request: https://github.com/microsoft/LightGBM/issues.\n", 302 | "\n", 303 | "Contribute: https://github.com/microsoft/LightGBM/issues?q=is%3Aissue+is%3Aopen+label%3Adask." 304 | ] 305 | } 306 | ], 307 | "metadata": { 308 | "kernelspec": { 309 | "display_name": "Python 3 (ipykernel)", 310 | "language": "python", 311 | "name": "python3" 312 | }, 313 | "language_info": { 314 | "codemirror_mode": { 315 | "name": "ipython", 316 | "version": 3 317 | }, 318 | "file_extension": ".py", 319 | "mimetype": "text/x-python", 320 | "name": "python", 321 | "nbconvert_exporter": "python", 322 | "pygments_lexer": "ipython3", 323 | "version": "3.12.8" 324 | } 325 | }, 326 | "nbformat": 4, 327 | "nbformat_minor": 5 328 | } 329 | -------------------------------------------------------------------------------- /notebooks/demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "noticed-account", 6 | "metadata": {}, 7 | "source": [ 8 | "# LightGBM + Dask\n", 9 | "\n", 10 | "\n", 11 | " \n", 12 | " \n", 15 | " \n", 18 | " \n", 19 | "
\n", 13 | " \n", 14 | " \n", 16 | " \n", 17 | "
\n", 20 | "\n", 21 | "This notebook shows how to use `lightgbm.dask` to train a LightGBM model on data stored as a [Dask Array](https://docs.dask.org/en/latest/array.html).\n", 22 | "\n", 23 | "To explore other topics in greater depth, see the other notebooks." 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "id": "surprising-incentive", 29 | "metadata": {}, 30 | "source": [ 31 | "
\n", 32 | "\n", 33 | "## Set up a local Dask cluster\n", 34 | "\n", 35 | "Create a cluster with 3 workers. Since this is a `LocalCluster`, those workers are just 3 local processes." 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "id": "dietary-multimedia", 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "from dask.distributed import Client, LocalCluster\n", 46 | "\n", 47 | "n_workers = 3\n", 48 | "cluster = LocalCluster(n_workers=n_workers)\n", 49 | "\n", 50 | "client = Client(cluster)\n", 51 | "client.wait_for_workers(n_workers)\n", 52 | "\n", 53 | "print(f\"View the dashboard: {cluster.dashboard_link}\")" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "id": "coated-paper", 59 | "metadata": {}, 60 | "source": [ 61 | "Click the link above to view a diagnostic dashboard while you run the training code below." 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "id": "confirmed-recommendation", 67 | "metadata": {}, 68 | "source": [ 69 | "
\n", 70 | "\n", 71 | "## Get some training data\n", 72 | "\n", 73 | "This example uses `sklearn.datasets.make_regression()` to generate a dataset in `numpy` format, then uses `dask.Array.from_array()` to turn that into a Dask Array.\n", 74 | "\n", 75 | "That's just done for convenience. `lightgbm.dask` just expects that your data are Dask Arrays or Dask DataFrames." 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "id": "billion-password", 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "import dask.array as da\n", 86 | "from sklearn.datasets import make_regression\n", 87 | "\n", 88 | "X, y = make_regression(n_samples=10000, random_state=42)\n", 89 | "dX = da.from_array(X, chunks=(1000, X.shape[1]))\n", 90 | "dy = da.from_array(y, chunks=1000)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "id": "temporal-terrace", 96 | "metadata": {}, 97 | "source": [ 98 | "Right now, the Dask Arrays `data` and `labels` are lazy. Before training, you can force the cluster to compute them by running `.persist()` and then wait for that computation to finish by `wait()`-ing on them.\n", 99 | "\n", 100 | "Doing this is optional, but it will make data loading a one-time cost so subsequent runs are fast." 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "id": "metallic-attachment", 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "from dask.distributed import wait\n", 111 | "\n", 112 | "dX = dX.persist()\n", 113 | "dy = dy.persist()\n", 114 | "_ = wait([dX, dy])" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "id": "interpreted-central", 120 | "metadata": {}, 121 | "source": [ 122 | "
\n", 123 | "\n", 124 | "## Train a model\n", 125 | "\n", 126 | "With the data set up on the workers, train a model. `lightgbm.dask.DaskLGBMRegressor` has an interface that tries to stay as close as possible to the non-Dask scikit-learn interface to LightGBM (`lightgbm.sklearn.LGBMRegressor`)." 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "id": "animated-magnitude", 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "from lightgbm.dask import DaskLGBMRegressor\n", 137 | "\n", 138 | "dask_reg = DaskLGBMRegressor(\n", 139 | " client=client,\n", 140 | " max_depth=5,\n", 141 | " objective=\"regression_l1\",\n", 142 | " learning_rate=0.1,\n", 143 | " tree_learner=\"data\",\n", 144 | " n_estimators=100,\n", 145 | " min_child_samples=1,\n", 146 | ")\n", 147 | "\n", 148 | "dask_reg.fit(\n", 149 | " X=dX,\n", 150 | " y=dy,\n", 151 | ")" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "id": "processed-karen", 157 | "metadata": {}, 158 | "source": [ 159 | "
\n", 160 | "\n", 161 | "## Evaluate the model\n", 162 | "\n", 163 | "The `.predict()` method takes in a Dask collection and returns a Dask Array." 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "id": "logical-handbook", 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "preds = dask_reg.predict(dX)\n", 174 | "print(str(preds))" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "id": "prerequisite-symposium", 180 | "metadata": {}, 181 | "source": [ 182 | "Before calculating the mean absolute error (MAE) of these predictions, compute some summary statistics on the target variable. This is necessary to understand what \"good\" values of MAE look like." 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "id": "peaceful-damages", 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "p = [0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]\n", 193 | "dy_percentiles = da.percentile(dy, p).compute()\n", 194 | "\n", 195 | "for i, percentile in enumerate(p):\n", 196 | " print(f\"{percentile * 100}%: {round(dy_percentiles[i], 2)}\")" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "id": "romantic-clone", 202 | "metadata": {}, 203 | "source": [ 204 | "The metrics functions from `dask-ml` match those from `scikit-learn`, but take in and return Dask collections. You can use these functions to perform model evaluation without the evaluation data or predictions needing to be pulled down to the machine running this notebook. Pretty cool, right?" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "id": "considered-holocaust", 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "from dask_ml.metrics.regression import mean_absolute_error\n", 215 | "\n", 216 | "mean_absolute_error(preds, dy)" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "id": "reduced-teddy", 222 | "metadata": {}, 223 | "source": [ 224 | "
\n", 225 | "\n", 226 | "## Next Steps\n", 227 | "\n", 228 | "Learn more: https://lightgbm.readthedocs.io/en/latest/Python-API.html#dask-api.\n", 229 | "\n", 230 | "Ask a question, report a bug, or submit a feature request: https://github.com/microsoft/LightGBM/issues.\n", 231 | "\n", 232 | "Contribute: https://github.com/microsoft/LightGBM/issues?q=is%3Aissue+is%3Aopen+label%3Adask." 233 | ] 234 | } 235 | ], 236 | "metadata": { 237 | "kernelspec": { 238 | "display_name": "Python 3 (ipykernel)", 239 | "language": "python", 240 | "name": "python3" 241 | }, 242 | "language_info": { 243 | "codemirror_mode": { 244 | "name": "ipython", 245 | "version": 3 246 | }, 247 | "file_extension": ".py", 248 | "mimetype": "text/x-python", 249 | "name": "python", 250 | "nbconvert_exporter": "python", 251 | "pygments_lexer": "ipython3", 252 | "version": "3.11.4" 253 | } 254 | }, 255 | "nbformat": 4, 256 | "nbformat_minor": 5 257 | } 258 | -------------------------------------------------------------------------------- /notebooks/testing/ranker-local.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Dask LightGBMRanker\n", 8 | "\n", 9 | "This notebook tests `lightgbm.dask.LGBMRanker`, proposed in https://github.com/microsoft/LightGBM/pull/3708." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import itertools\n", 19 | "\n", 20 | "import dask.array as da\n", 21 | "import dask.dataframe as dd\n", 22 | "import numpy as np\n", 23 | "import pandas as pd\n", 24 | "from dask.distributed import Client, LocalCluster\n", 25 | "from lightgbm.dask import DaskLGBMRanker\n", 26 | "from lightgbm.sklearn import LGBMRanker\n", 27 | "from scipy.stats import spearmanr\n", 28 | "from sklearn.utils import check_random_state" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "n_workers = 4\n", 38 | "cluster = LocalCluster(n_workers=n_workers)\n", 39 | "client = Client(cluster)\n", 40 | "client.wait_for_workers(n_workers)\n", 41 | "\n", 42 | "print(f\"View the dashboard: {cluster.dashboard_link}\")" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "def _make_ranking(\n", 52 | " n_samples=100,\n", 53 | " n_features=20,\n", 54 | " n_informative=5,\n", 55 | " gmax=1,\n", 56 | " random_gs=False,\n", 57 | " avg_gs=10,\n", 58 | " random_state=0,\n", 59 | "):\n", 60 | " \"\"\"\n", 61 | " Generate a learning-to-rank dataset - feature vectors grouped\n", 62 | " together with integer-valued graded relevance scores. 
Replace this\n", 63 | " with a sklearn.datasets function if ranking objective becomes\n", 64 | " supported in sklearn.datasets module.\n", 65 | " \"\"\"\n", 66 | " rnd_generator = check_random_state(random_state)\n", 67 | "\n", 68 | " y_vec, group_vec = np.empty((0,), dtype=int), np.empty((0,), dtype=int)\n", 69 | " gid = 0\n", 70 | "\n", 71 | " # build target, group ID vectors.\n", 72 | " relvalues = range(gmax + 1)\n", 73 | " while len(y_vec) < n_samples:\n", 74 | " gsize = avg_gs if not random_gs else rnd_generator.poisson(avg_gs)\n", 75 | " if not gsize:\n", 76 | " continue\n", 77 | "\n", 78 | " rel = rnd_generator.choice(relvalues, size=gsize, replace=True)\n", 79 | " y_vec = np.append(y_vec, rel)\n", 80 | " group_vec = np.append(group_vec, [gid] * gsize)\n", 81 | " gid += 1\n", 82 | "\n", 83 | " y_vec, group_vec = y_vec[0:n_samples], group_vec[0:n_samples]\n", 84 | "\n", 85 | " # build feature data, X. Transform first few into informative features.\n", 86 | " n_informative = max(min(n_features, n_informative), 0)\n", 87 | " x_grid = np.linspace(0, stop=1, num=gmax + 2)\n", 88 | " X = rnd_generator.uniform(size=(n_samples, n_features))\n", 89 | "\n", 90 | " # make first n_informative features values\n", 91 | " # bucketed according to relevance scores.\n", 92 | " def bucket_fn(z):\n", 93 | " return rnd_generator.uniform(x_grid[z], high=x_grid[z + 1])\n", 94 | "\n", 95 | " for j in range(n_informative):\n", 96 | " bias, coef = rnd_generator.normal(size=2)\n", 97 | " X[:, j] = bias + coef * np.apply_along_axis(bucket_fn, axis=0, arr=y_vec)\n", 98 | "\n", 99 | " return X, y_vec, group_vec\n", 100 | "\n", 101 | "\n", 102 | "def _create_ranking_data(n_samples=100, output=\"array\", chunk_size=50):\n", 103 | " X, y, g = _make_ranking(n_samples=n_samples, random_state=42)\n", 104 | " rnd = np.random.RandomState(42)\n", 105 | " w = rnd.rand(X.shape[0]) * 0.01\n", 106 | " g_rle = np.array([sum([1 for _ in grp]) for _, grp in itertools.groupby(g)])\n", 107 | "\n", 108 | " if output == \"dataframe\":\n", 109 | " # add target, weight, and group to DataFrame so that\n", 110 | " # partitions abide by group boundaries.\n", 111 | " X_df = pd.DataFrame(X, columns=[f\"feature_{i}\" for i in range(X.shape[1])])\n", 112 | " X = X_df.copy()\n", 113 | " X_df = X_df.assign(y=y, g=g, w=w)\n", 114 | "\n", 115 | " # set_index ensures partitions are based on group id.\n", 116 | " # See https://bit.ly/3pAWyNw.\n", 117 | " X_df.set_index(\"g\", inplace=True)\n", 118 | " dX = dd.from_pandas(X_df, chunksize=chunk_size)\n", 119 | "\n", 120 | " # separate target, weight from features.\n", 121 | " dy = dX[\"y\"]\n", 122 | " dw = dX[\"w\"]\n", 123 | " dX = dX.drop(columns=[\"y\", \"w\"])\n", 124 | " dg = dX.index.to_series()\n", 125 | "\n", 126 | " # encode group identifiers into run-length encoding,\n", 127 | " # the format LightGBMRanker is expecting\n", 128 | " # so that within each partition, sum(g) = n_samples.\n", 129 | " dg = dg.map_partitions(\n", 130 | " lambda p: p.groupby(\"g\", sort=False).apply(lambda z: z.shape[0])\n", 131 | " )\n", 132 | "\n", 133 | " elif output == \"array\":\n", 134 | " # ranking arrays: one chunk per group.\n", 135 | " # Each chunk must include all columns.\n", 136 | " p = X.shape[1]\n", 137 | " dX, dy, dw, dg = list(), list(), list(), list()\n", 138 | " for g_idx, rhs in enumerate(np.cumsum(g_rle)):\n", 139 | " lhs = rhs - g_rle[g_idx]\n", 140 | " dX.append(da.from_array(X[lhs:rhs, :], chunks=(rhs - lhs, p)))\n", 141 | " dy.append(da.from_array(y[lhs:rhs]))\n", 142 | " 
dw.append(da.from_array(w[lhs:rhs]))\n", 143 | " dg.append(da.from_array(np.array([g_rle[g_idx]])))\n", 144 | "\n", 145 | " dX = da.concatenate(dX, axis=0)\n", 146 | " dy = da.concatenate(dy, axis=0)\n", 147 | " dw = da.concatenate(dw, axis=0)\n", 148 | " dg = da.concatenate(dg, axis=0)\n", 149 | "\n", 150 | " else:\n", 151 | " raise ValueError(\n", 152 | " \"ranking data creation only supported for Dask arrays and dataframes\"\n", 153 | " )\n", 154 | "\n", 155 | " return X, y, w, g_rle, dX, dy, dw, dg" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "## Test with Dask array" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "X, y, w, g, dX, dy, dw, dg = _create_ranking_data(output=\"array\")" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "dg.compute()" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "dask_ranker = DaskLGBMRanker(time_out=5, seed=42, min_child_samples=1)\n", 190 | "\n", 191 | "dask_ranker = dask_ranker.fit(X=dX, y=dy, sample_weight=dw, group=dg)\n", 192 | "rnkvec_dask = dask_ranker.predict(dX)\n", 193 | "rnkvec_dask = rnkvec_dask.compute()" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "rnkvec_dask" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "local_ranker = LGBMRanker(seed=42, min_child_samples=1)\n", 212 | "local_ranker.fit(X, y, sample_weight=w, group=g)\n", 213 | "rnkvec_local = local_ranker.predict(X)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "# distributed ranker should be able to rank decently well.\n", 223 | "dcor = spearmanr(rnkvec_dask, y).correlation\n", 224 | "assert dcor > 0.6\n", 225 | "dcor" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "# relative difference between distributed ranker\n", 235 | "# and local ranker spearman corr should be small.\n", 236 | "lcor = spearmanr(rnkvec_local, y).correlation\n", 237 | "print(np.abs(dcor - lcor))\n", 238 | "assert np.abs(dcor - lcor) < 0.003" 239 | ] 240 | } 241 | ], 242 | "metadata": { 243 | "kernelspec": { 244 | "display_name": "Python 3", 245 | "language": "python", 246 | "name": "python3" 247 | }, 248 | "language_info": { 249 | "codemirror_mode": { 250 | "name": "ipython", 251 | "version": 3 252 | }, 253 | "file_extension": ".py", 254 | "mimetype": "text/x-python", 255 | "name": "python", 256 | "nbconvert_exporter": "python", 257 | "pygments_lexer": "ipython3", 258 | "version": "3.8.6" 259 | } 260 | }, 261 | "nbformat": 4, 262 | "nbformat_minor": 4 263 | } 264 | -------------------------------------------------------------------------------- /notebooks/testing/sparse-inputs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook tests `lightgbm.dask`'s behavior with sparse inputs to `pred_contrib()`." 
8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import dask.array as da\n", 17 | "import numpy as np\n", 18 | "from dask.distributed import Client, LocalCluster\n", 19 | "from lightgbm.dask import DaskLGBMClassifier\n", 20 | "from lightgbm.sklearn import LGBMClassifier\n", 21 | "from scipy.sparse import csc_matrix\n", 22 | "from sklearn.datasets import make_blobs" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "n_workers = 3\n", 32 | "cluster = LocalCluster(n_workers=n_workers)\n", 33 | "client = Client(cluster)\n", 34 | "client.wait_for_workers(n_workers)\n", 35 | "\n", 36 | "print(f\"View the dashboard: {cluster.dashboard_link}\")" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "chunk_size = 50\n", 46 | "X, y = make_blobs(n_samples=100, centers=3, random_state=42)\n", 47 | "rnd = np.random.RandomState(42)\n", 48 | "dX = da.from_array(X, chunks=(chunk_size, X.shape[1])).map_blocks(csc_matrix)\n", 49 | "dy = da.from_array(y, chunks=chunk_size)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "dask_clf = DaskLGBMClassifier(n_estimators=5, num_leaves=2, tree_learner=\"data\")\n", 59 | "dask_clf.fit(dX, dy)\n", 60 | "\n", 61 | "preds = dask_clf.predict(dX, pred_contrib=True)\n", 62 | "preds_computed = preds.compute()\n", 63 | "\n", 64 | "print(\n", 65 | " type(preds),\n", 66 | " type(preds.partitions[0].compute()),\n", 67 | " type(preds_computed),\n", 68 | " f\"{dask_clf.n_classes_} classes, {dX.shape[1]} features\",\n", 69 | ")\n", 70 | "print(\"---\")\n", 71 | "print(dX.partitions[0].compute())\n", 72 | "print(\"---\")\n", 73 | "preds.compute().shape" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "preds.partitions[0].compute()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "X = dX.compute()\n", 92 | "y = dy.compute()\n", 93 | "\n", 94 | "local_clf = LGBMClassifier()\n", 95 | "local_clf.fit(X=dX.compute(), y=y)\n", 96 | "local_preds = local_clf.predict(dX.compute().tocsc(), pred_contrib=True)\n", 97 | "\n", 98 | "print(local_clf.n_classes_, type(local_preds))\n", 99 | "print(\"---\")\n", 100 | "print(local_preds)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "local_preds[0]" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [] 118 | } 119 | ], 120 | "metadata": { 121 | "kernelspec": { 122 | "display_name": "Python 3", 123 | "language": "python", 124 | "name": "python3" 125 | }, 126 | "language_info": { 127 | "codemirror_mode": { 128 | "name": "ipython", 129 | "version": 3 130 | }, 131 | "file_extension": ".py", 132 | "mimetype": "text/x-python", 133 | "name": "python", 134 | "nbconvert_exporter": "python", 135 | "pygments_lexer": "ipython3", 136 | "version": "3.8.6" 137 | } 138 | }, 139 | "nbformat": 4, 140 | "nbformat_minor": 4 141 | } 142 | -------------------------------------------------------------------------------- /pyproject.toml: 
-------------------------------------------------------------------------------- 1 | [tool.ruff.lint] 2 | select = [ 3 | # flake8-bugbear 4 | "B", 5 | # flake8-comprehensions 6 | "C4", 7 | # pycodestyle 8 | "E", 9 | # pyflakes 10 | "F", 11 | # isort 12 | "I", 13 | # NumPy-specific rules 14 | "NPY", 15 | # pylint 16 | "PL", 17 | # flake8-return: unnecessary assignment before return 18 | "RET504", 19 | # flake8-simplify: use dict.get() instead of an if-else block 20 | "SIM401", 21 | ] 22 | 23 | [tool.ruff.lint.isort] 24 | 25 | # prevent ruff from thinking that 'lightgbm.dask' imports should 26 | # come after all others 27 | known-third-party = [ 28 | "dask", 29 | "dask_cloudprovider", 30 | "lightgbm", 31 | "pandas", 32 | "scipy", 33 | "sklearn", 34 | ] 35 | 36 | [tool.ruff.lint.per-file-ignores] 37 | "*.ipynb" = [ 38 | # (pylint) Unnecessary list() call 39 | "C408", 40 | # (pylint) too many arguments in function definition 41 | "PLR0913", 42 | # (pylint) Magic value used in comparison 43 | "PLR2004", 44 | ] 45 | "jupyter_notebook_config.py" = [ 46 | # (flake8) undefined name 47 | "F821", 48 | ] 49 | --------------------------------------------------------------------------------