├── .github └── workflows │ ├── CI.yml │ ├── cargo-build-publish.yml │ ├── docs.yml │ └── release.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── Cargo.toml ├── LICENSE ├── README.md ├── benches └── perpetual_benchmarks.rs ├── examples ├── cal_housing.rs ├── cover_types.rs └── titanic.rs ├── python-package ├── .gitignore ├── Cargo.toml ├── docs │ └── index.md ├── examples │ ├── benchmark_lgbm.py │ ├── benchmark_perpetual.py │ ├── categorical_data.ipynb │ ├── categorical_data_diamonds.ipynb │ ├── categorical_data_titanic.ipynb │ ├── fetch_openml.ipynb │ ├── lgbm_openml_sensory.ipynb │ ├── openml.ipynb │ ├── openml_mnist.ipynb │ ├── performance_benchmark.ipynb │ ├── santander.ipynb │ └── toy_datasets.ipynb ├── mkdocs.yml ├── pyproject.toml ├── python │ └── perpetual │ │ ├── __init__.py │ │ ├── booster.py │ │ ├── data.py │ │ ├── serialize.py │ │ ├── types.py │ │ └── utils.py ├── src │ ├── booster.rs │ ├── lib.rs │ ├── multi_output.rs │ └── utils.rs ├── tests │ ├── test_booster.py │ ├── test_multi_output.py │ ├── test_save_load.py │ └── test_serialize.py └── uv.lock ├── resources └── perp_logo.png ├── rust-toolchain ├── rustfmt.toml ├── scripts ├── make_resources.py ├── remove-optional-deps.py ├── run-python-tests.ps1 ├── run-python-tests.sh ├── run-single-python-test.ps1 ├── uv_script.ps1 └── uv_script.sh └── src ├── bin.rs ├── binning.rs ├── booster ├── booster.rs ├── mod.rs ├── multi_output.rs ├── predict.rs └── setters.rs ├── conformal ├── cqr.rs └── mod.rs ├── constants.rs ├── constraints.rs ├── data.rs ├── errors.rs ├── grower.rs ├── histogram.rs ├── lib.rs ├── metrics ├── classification │ ├── metrics.rs │ └── mod.rs ├── mod.rs └── regression │ ├── metrics.rs │ └── mod.rs ├── node.rs ├── objective_functions ├── adaptive_huber_loss.rs ├── huber_loss.rs ├── log_loss.rs ├── mod.rs ├── quantile_loss.rs └── squared_loss.rs ├── partial_dependence.rs ├── prune.rs ├── sampler.rs ├── shapley.rs ├── splitter.rs ├── tree ├── mod.rs ├── predict.rs └── tree.rs └── utils.rs /.github/workflows/CI.yml: -------------------------------------------------------------------------------- 1 | name: Test and Deploy 2 | on: [pull_request] 3 | 4 | jobs: 5 | windows-build-test: 6 | strategy: 7 | matrix: 8 | pyversion: ["3.9", "3.10", "3.11", "3.12", "3.13"] 9 | runs-on: "windows-latest" 10 | steps: 11 | - uses: actions/checkout@v4 12 | - name: Install latests stable Rust 13 | uses: dtolnay/rust-toolchain@stable 14 | with: 15 | toolchain: stable 16 | - uses: actions/setup-python@v5 17 | with: 18 | python-version: ${{ matrix.pyversion }} 19 | architecture: x64 20 | - name: Install deps 21 | run: pip install numpy pandas seaborn scikit-learn toml 22 | - run: | 23 | cp README.md python-package/README.md 24 | cp LICENSE python-package/LICENSE 25 | - name: Build test data 26 | run: | 27 | cd python-package 28 | python -m pip install -e .[dev] 29 | cd .. 30 | python scripts/make_resources.py 31 | - name: Build wheels with maturin 32 | uses: PyO3/maturin-action@v1 33 | with: 34 | target: x86_64 35 | command: build 36 | args: --release --strip --interpreter python --manifest-path python-package/Cargo.toml --out dist --sdist 37 | - name: Install wheel 38 | run: pip install perpetual --no-index --find-links dist --no-deps --force-reinstall 39 | - name: Run Package Tests 40 | run: | 41 | pip install pytest pytest-cov black ruff setuptools --upgrade 42 | cd python-package 43 | ruff check . 44 | black --check . 45 | pytest --cov-fail-under=90 tests 46 | cd .. 
47 | - name: Save Artifacts 48 | uses: actions/upload-artifact@v4 49 | with: 50 | name: dist-windows-${{ matrix.pyversion }} 51 | path: dist 52 | 53 | macos-build-test: 54 | strategy: 55 | matrix: 56 | pyversion: ["3.11", "3.12", "3.13"] 57 | os: [macos-latest, macos-latest-large] 58 | runs-on: ${{ matrix.os }} 59 | steps: 60 | - uses: actions/checkout@v4 61 | - name: Install latest stable Rust 62 | uses: dtolnay/rust-toolchain@stable 63 | with: 64 | toolchain: stable 65 | - uses: actions/setup-python@v5 66 | with: 67 | python-version: ${{ matrix.pyversion }} 68 | - name: Install deps 69 | run: pip install numpy pandas seaborn scikit-learn toml 70 | - run: | 71 | cp README.md python-package/README.md 72 | cp LICENSE python-package/LICENSE 73 | - name: Build test data 74 | run: | 75 | cd python-package 76 | python -m pip install -e .[dev] 77 | cd .. 78 | python scripts/make_resources.py 79 | - name: Build wheels with maturin 80 | uses: PyO3/maturin-action@v1 81 | with: 82 | command: build 83 | args: --release --strip --interpreter python --manifest-path python-package/Cargo.toml --out dist --sdist 84 | - name: Install wheel 85 | run: pip install perpetual --no-index --find-links dist --no-deps --force-reinstall 86 | - name: Run Package Tests 87 | run: | 88 | pip install pytest pytest-cov black ruff setuptools --upgrade 89 | cd python-package 90 | ruff check . 91 | black --check . 92 | pytest --cov-fail-under=90 tests 93 | cd .. 94 | - name: Save Artifacts 95 | uses: actions/upload-artifact@v4 96 | with: 97 | name: dist-${{ matrix.os }}-${{ matrix.pyversion }} 98 | path: dist 99 | 100 | linux-build-test: 101 | runs-on: ubuntu-latest 102 | strategy: 103 | matrix: 104 | pyversion: ["3.9", "3.10", "3.11", "3.12", "3.13"] 105 | steps: 106 | - uses: actions/checkout@v4 107 | - name: Install latests stable Rust 108 | uses: dtolnay/rust-toolchain@stable 109 | with: 110 | toolchain: stable 111 | - uses: actions/setup-python@v5 112 | with: 113 | python-version: ${{ matrix.pyversion }} 114 | architecture: x64 115 | - name: Install deps 116 | run: pip install numpy pandas seaborn scikit-learn toml 117 | - run: | 118 | cp README.md python-package/README.md 119 | cp LICENSE python-package/LICENSE 120 | - name: Build test data 121 | run: | 122 | cd python-package 123 | python -m pip install -e .[dev] 124 | cd .. 125 | python scripts/make_resources.py 126 | - name: Build wheels with maturin 127 | uses: PyO3/maturin-action@v1 128 | with: 129 | target: x86_64 130 | manylinux: auto 131 | command: build 132 | args: --release --strip --interpreter python${{ matrix.pyversion }} --manifest-path python-package/Cargo.toml --out dist --sdist 133 | - name: Install wheel 134 | run: pip install perpetual --no-index --find-links dist --no-deps --force-reinstall 135 | - name: Run Package Tests 136 | run: | 137 | pip install pytest pytest-cov black ruff setuptools --upgrade 138 | cd python-package 139 | ruff check . 140 | black --check . 141 | pytest --cov-fail-under=90 tests 142 | cd .. 
143 | - name: Save Artifacts 144 | uses: actions/upload-artifact@v4 145 | with: 146 | name: dist-linux-${{ matrix.pyversion }} 147 | path: dist 148 | 149 | linux-arm-build-test: 150 | runs-on: ubuntu-24.04-arm 151 | strategy: 152 | matrix: 153 | pyversion: ["3.9", "3.10", "3.11", "3.12", "3.13"] 154 | steps: 155 | - uses: actions/checkout@v4 156 | - name: Install latests stable Rust 157 | uses: dtolnay/rust-toolchain@stable 158 | with: 159 | toolchain: stable 160 | - uses: actions/setup-python@v5 161 | with: 162 | python-version: ${{ matrix.pyversion }} 163 | architecture: arm64 164 | - name: Install deps 165 | run: pip install numpy pandas seaborn scikit-learn toml 166 | - run: | 167 | cp README.md python-package/README.md 168 | cp LICENSE python-package/LICENSE 169 | - name: Build test data 170 | run: | 171 | cd python-package 172 | python -m pip install -e .[dev] 173 | cd .. 174 | python scripts/make_resources.py 175 | - name: Build wheels with maturin 176 | uses: PyO3/maturin-action@v1 177 | with: 178 | manylinux: auto 179 | command: build 180 | args: --release --strip --interpreter python${{ matrix.pyversion }} --manifest-path python-package/Cargo.toml --out dist --sdist 181 | - name: Install wheel 182 | run: pip install perpetual --no-index --find-links dist --no-deps --force-reinstall 183 | - name: Run Package Tests 184 | run: | 185 | pip install pytest pytest-cov black ruff setuptools --upgrade 186 | cd python-package 187 | ruff check . 188 | black --check . 189 | pytest --cov-fail-under=90 tests 190 | cd .. 191 | - name: Save Artifacts 192 | uses: actions/upload-artifact@v4 193 | with: 194 | name: dist-linux-arm-${{ matrix.pyversion }} 195 | path: dist 196 | 197 | cargo-build-test: 198 | runs-on: ubuntu-latest 199 | steps: 200 | - uses: actions/checkout@v4 201 | - name: Install latest stable Rust 202 | uses: dtolnay/rust-toolchain@stable 203 | with: 204 | toolchain: stable 205 | - uses: actions/setup-python@v5 206 | with: 207 | python-version: "3.11" 208 | architecture: x64 209 | - name: Install deps 210 | run: pip install numpy pandas seaborn scikit-learn toml 211 | - run: | 212 | cp README.md python-package/README.md 213 | cp LICENSE python-package/LICENSE 214 | - name: Build test data 215 | run: | 216 | cd python-package 217 | python -m pip install -e .[dev] 218 | cd .. 
219 | python scripts/make_resources.py 220 | - name: Run tests 221 | run: cargo test --verbose 222 | -------------------------------------------------------------------------------- /.github/workflows/cargo-build-publish.yml: -------------------------------------------------------------------------------- 1 | name: Cargo Build Publish 2 | on: [workflow_dispatch] 3 | 4 | jobs: 5 | cargo-build-test: 6 | runs-on: ubuntu-latest 7 | steps: 8 | - uses: actions/checkout@v3 9 | - name: Install latests stable Rust 10 | uses: dtolnay/rust-toolchain@stable 11 | with: 12 | toolchain: stable 13 | - uses: actions/setup-python@v4 14 | with: 15 | python-version: "3.10" 16 | architecture: x64 17 | - name: Install deps 18 | run: pip install numpy pandas seaborn scikit-learn toml 19 | - run: | 20 | cp README.md python-package/README.md 21 | cp LICENSE python-package/LICENSE 22 | - name: Update TOML 23 | run: python scripts/remove-optional-deps.py 24 | - name: Build test data 25 | run: python scripts/make_resources.py 26 | - name: Run tests 27 | run: cargo test --verbose 28 | - name: Publish Crate 29 | run: cargo publish --token ${CRATES_TOKEN} --allow-dirty 30 | env: 31 | CRATES_TOKEN: ${{ secrets.CRATES_TOKEN }} 32 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: docs 2 | on: 3 | release: 4 | types: [published] 5 | 6 | permissions: 7 | contents: write 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v3 13 | - uses: actions/setup-python@v4 14 | with: 15 | python-version: 3.x 16 | - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV 17 | - uses: actions/cache@v3 18 | with: 19 | key: mkdocs-material-${{ env.cache_id }} 20 | path: .cache 21 | restore-keys: | 22 | mkdocs-material- 23 | - run: | 24 | cp README.md python-package/README.md 25 | cp LICENSE python-package/LICENSE 26 | - run: pip install mkdocs-material 27 | - run: | 28 | cd python-package 29 | pip install .[dev] 30 | mkdocs gh-deploy --force 31 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | on: 3 | release: 4 | types: [published] 5 | 6 | jobs: 7 | windows-build: 8 | strategy: 9 | matrix: 10 | pyversion: ["3.9", "3.10", "3.11", "3.12", "3.13"] 11 | runs-on: "windows-latest" 12 | steps: 13 | - uses: actions/checkout@v4 14 | - name: Install latests stable Rust 15 | uses: dtolnay/rust-toolchain@stable 16 | with: 17 | toolchain: stable 18 | - uses: actions/setup-python@v5 19 | with: 20 | python-version: ${{ matrix.pyversion }} 21 | architecture: x64 22 | - name: Install deps 23 | run: pip install numpy pandas seaborn scikit-learn toml 24 | - run: | 25 | cp README.md python-package/README.md 26 | cp LICENSE python-package/LICENSE 27 | - name: Build wheels with maturin 28 | uses: PyO3/maturin-action@v1 29 | with: 30 | target: x86_64 31 | command: build 32 | args: --release --strip --interpreter python --manifest-path python-package/Cargo.toml --out dist --sdist 33 | - name: Save Artifacts 34 | uses: actions/upload-artifact@v4 35 | with: 36 | name: dist-windows-${{ matrix.pyversion }} 37 | path: dist 38 | 39 | macos-build: 40 | strategy: 41 | matrix: 42 | pyversion: ["3.11", "3.12", "3.13"] 43 | os: [macos-latest, macos-latest-large] 44 | runs-on: ${{ matrix.os }} 45 | steps: 46 | - uses: 
actions/checkout@v4 47 | - name: Install latest stable Rust 48 | uses: dtolnay/rust-toolchain@stable 49 | with: 50 | toolchain: stable 51 | - uses: actions/setup-python@v5 52 | with: 53 | python-version: ${{ matrix.pyversion }} 54 | - name: Install deps 55 | run: pip install numpy pandas seaborn scikit-learn toml 56 | - run: | 57 | cp README.md python-package/README.md 58 | cp LICENSE python-package/LICENSE 59 | - name: Build wheels with maturin 60 | uses: PyO3/maturin-action@v1 61 | with: 62 | command: build 63 | args: --release --strip --interpreter python --manifest-path python-package/Cargo.toml --out dist --sdist 64 | - name: Save Artifacts 65 | uses: actions/upload-artifact@v4 66 | with: 67 | name: dist-${{ matrix.os }}-${{ matrix.pyversion }} 68 | path: dist 69 | 70 | linux-build: 71 | runs-on: ubuntu-latest 72 | strategy: 73 | matrix: 74 | pyversion: ["3.9", "3.10", "3.11", "3.12", "3.13"] 75 | steps: 76 | - uses: actions/checkout@v4 77 | - name: Install latests stable Rust 78 | uses: dtolnay/rust-toolchain@stable 79 | with: 80 | toolchain: stable 81 | - uses: actions/setup-python@v5 82 | with: 83 | python-version: ${{ matrix.pyversion }} 84 | architecture: x64 85 | - name: Install deps 86 | run: pip install numpy pandas seaborn scikit-learn toml 87 | - run: | 88 | cp README.md python-package/README.md 89 | cp LICENSE python-package/LICENSE 90 | - name: Build wheels with maturin 91 | uses: PyO3/maturin-action@v1 92 | with: 93 | target: x86_64 94 | manylinux: auto 95 | command: build 96 | args: --release --strip --interpreter python${{ matrix.pyversion }} --manifest-path python-package/Cargo.toml --out dist --sdist 97 | - name: Save Artifacts 98 | uses: actions/upload-artifact@v4 99 | with: 100 | name: dist-linux-${{ matrix.pyversion }} 101 | path: dist 102 | 103 | linux-arm-build: 104 | runs-on: ubuntu-24.04-arm 105 | strategy: 106 | matrix: 107 | pyversion: ["3.9", "3.10", "3.11", "3.12", "3.13"] 108 | steps: 109 | - uses: actions/checkout@v4 110 | - name: Install latests stable Rust 111 | uses: dtolnay/rust-toolchain@stable 112 | with: 113 | toolchain: stable 114 | - uses: actions/setup-python@v5 115 | with: 116 | python-version: ${{ matrix.pyversion }} 117 | architecture: arm64 118 | - name: Install deps 119 | run: pip install numpy pandas seaborn scikit-learn toml 120 | - run: | 121 | cp README.md python-package/README.md 122 | cp LICENSE python-package/LICENSE 123 | - name: Build wheels with maturin 124 | uses: PyO3/maturin-action@v1 125 | with: 126 | manylinux: auto 127 | command: build 128 | args: --release --strip --interpreter python${{ matrix.pyversion }} --manifest-path python-package/Cargo.toml --out dist --sdist 129 | - name: Save Artifacts 130 | uses: actions/upload-artifact@v4 131 | with: 132 | name: dist-linux-arm-${{ matrix.pyversion }} 133 | path: dist 134 | 135 | cargo-publish: 136 | runs-on: ubuntu-latest 137 | steps: 138 | - uses: actions/checkout@v4 139 | - name: Install latest stable Rust 140 | uses: dtolnay/rust-toolchain@stable 141 | with: 142 | toolchain: stable 143 | - uses: actions/setup-python@v5 144 | with: 145 | python-version: "3.11" 146 | architecture: x64 147 | - name: Install deps 148 | run: pip install numpy pandas seaborn scikit-learn toml 149 | - run: | 150 | cp README.md python-package/README.md 151 | cp LICENSE python-package/LICENSE 152 | - name: Publish Crate 153 | run: cargo publish --token ${CRATES_TOKEN} --allow-dirty 154 | env: 155 | CRATES_TOKEN: ${{ secrets.CRATES_TOKEN }} 156 | 157 | pypi-publish: 158 | runs-on: ubuntu-latest 159 | 
needs: ["windows-build", "macos-build", "linux-build", "linux-arm-build"] 160 | environment: 161 | name: Test and Deploy 162 | url: https://pypi.org/p/perpetual 163 | permissions: 164 | id-token: write 165 | steps: 166 | - name: Retrieve release distributions 167 | uses: actions/download-artifact@v4 168 | with: 169 | pattern: dist-* 170 | merge-multiple: true 171 | path: dist 172 | - name: Publish release distributions to PyPI 173 | uses: pypa/gh-action-pypi-publish@release/v1 174 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | Cargo.lock 3 | perpetual/__pycache__ 4 | .vscode/ 5 | .venv 6 | resources/* 7 | !resources/perp_logo.png 8 | python-package/Cargo.lock 9 | python-package/LICENSE 10 | python-package/README.md 11 | python-package/target 12 | python-package/python/perpetual/__pycache__ -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/doublify/pre-commit-rust 3 | rev: "v1.0" 4 | hooks: 5 | - id: fmt 6 | - id: cargo-check 7 | - id: clippy 8 | - repo: https://github.com/pycqa/isort 9 | rev: "5.12.0" 10 | hooks: 11 | - id: isort 12 | - repo: https://github.com/psf/black 13 | rev: "22.6.0" 14 | hooks: 15 | - id: black 16 | - repo: https://github.com/astral-sh/ruff-pre-commit 17 | # Ruff version. 18 | rev: v0.0.277 19 | hooks: 20 | - id: ruff -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to `perpetual` 2 | 3 | ## Development Setup 4 | 5 | For development, it is assumed you have stable rust installed, and at least python 3.9. Additionally, in your python environment, you will need to install [`maturin`](https://github.com/PyO3/maturin). 6 | 7 | ### To run Rust tests 8 | 9 | The rust tests depend on some artifacts that are generated from a python script. You can either follow the steps in the python tests section, or run the following in an environment that is running python. 10 | 11 | ```sh 12 | cd python-package 13 | # Install the project in editable mode and all development dependencies 14 | python -m pip install -e .[dev] 15 | # You can now return to the root directory and run the tests... 16 | cd .. 17 | python -m pip install pandas seaborn 18 | python scripts/make_resources.py 19 | ``` 20 | 21 | If you have rust and the cargo package manager installed, all you need to do to run the rust tests is run the following command in the root of the repository. 22 | 23 | ```sh 24 | cargo test 25 | ``` 26 | 27 | ### To run the python tests 28 | 29 | Prior to running the tests, you should install `python-package` in editable mode. To do this, from the project root directory you can run the following. 30 | 31 | ```sh 32 | cd python-package 33 | # Install the project in editable mode and all development dependencies 34 | python -m pip install -e .[dev] 35 | # You can now return to the root directory and run the tests... 36 | cd .. 37 | 38 | # Prior to running the tests, build all required test artifacts 39 | python scripts/make_resources.py 40 | 41 | # Now you can run the tests. 42 | # on Linux... 43 | source scripts/run-python-tests.sh 44 | ``` 45 | 46 | The test script can also be run from powershell.
47 | 48 | ```powershell 49 | # on Windows (powershell) 50 | .\scripts\run-python-tests.ps1 51 | ``` 52 | 53 | This script builds the package in release mode, installs it, and then runs the tests. Because of this, it is useful to run this whenever you want to test out a change in the python package. 54 | 55 | ## Benchmarking 56 | 57 | Benchmarking is run using the [`criterion`](https://github.com/bheisler/criterion.rs) Rust crate. 58 | To run the benchmarks, you can run the following command from your terminal. 59 | 60 | ```sh 61 | cargo bench 62 | ``` 63 | 64 | Specific benchmarks can be targeted by referring to them by name. 65 | 66 | ```sh 67 | cargo bench "fast sum" 68 | ``` 69 | 70 | ## Pre-commit 71 | 72 | The [`pre-commit`](https://pre-commit.com/) framework should be installed and used to ensure all commits meet the required formatting and linting checks prior to a commit being made to the repository. 73 | 74 | ```sh 75 | # Install pre-commit, either right in your default python install 76 | # or using a tool such as pipx (https://pypa.github.io/pipx/) 77 | python -m pip install pre-commit 78 | 79 | # In the root of the repository 80 | pre-commit install 81 | ``` 82 | 83 | ## Serialization 84 | 85 | The saving and loading of the model is all handled by the [`serde`](https://docs.rs/serde/1.0.163/serde/) and [`serde_json`](https://docs.rs/serde_json/latest/serde_json/) crates. 86 | 87 | Because of this, you will see the following attribute sprinkled throughout the package. 88 | 89 | ```rust 90 | #[derive(Deserialize, Serialize)] 91 | ``` 92 | 93 | Additionally, in order to not break backwards compatibility with models saved in previous versions, any new fields added to the `Tree` or `PerpetualBooster` structs should have a default value defined. This way models can be loaded, even if they were saved before the new field was added. 94 | A default value can be added for a field using the `#[serde(default = "default_sample_method")]` attribute, where the string that `default` refers to must be the name of a valid function. The following is a complete example of this. 95 | 96 | ```rust 97 | use crate::sampler::{SampleMethod, Sampler}; 98 | 99 | #[derive(Deserialize, Serialize)] 100 | pub struct PerpetualBooster { 101 | // ... 102 | #[serde(default = "default_sample_method")] 103 | pub sample_method: SampleMethod, 104 | // ...
105 | } 106 | 107 | fn default_sample_method() -> SampleMethod { 108 | SampleMethod::None 109 | } 110 | ``` 111 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "perpetual" 3 | version = "0.9.3" 4 | edition = "2021" 5 | authors = ["Mutlu Simsek "] 6 | homepage = "https://perpetual-ml.com" 7 | description = "A self-generalizing gradient boosting machine that doesn't need hyperparameter optimization" 8 | license-file = "LICENSE" 9 | readme = "README.md" 10 | repository = "https://github.com/perpetual-ml/perpetual" 11 | 12 | keywords = ["machine-learning", "perpetual", "ai", "ml"] 13 | categories = ["algorithms", "mathematics", "science"] 14 | 15 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 16 | [profile.release] 17 | lto = 'fat' 18 | codegen-units = 1 19 | #debug = true # due to flamegraph 20 | #strip = false # due to flamegraph 21 | 22 | [dependencies] 23 | rayon = "1.10.0" 24 | thiserror = "2.0.12" 25 | serde_json = { version = "1.0.140", features = ["float_roundtrip"] } 26 | serde = { version = "1.0.219", features = ["derive"] } 27 | approx = "0.5.1" 28 | log = "0.4.27" 29 | rand = "0.9.0" 30 | sysinfo = "0.33.1" 31 | 32 | [dev-dependencies] 33 | criterion = "0.5.1" 34 | polars = "0.41" 35 | reqwest = { version = "0.12.14", features = ["blocking"] } 36 | csv = "1.3.1" 37 | chrono = "0.4.40" 38 | 39 | [[bench]] 40 | name = "perpetual_benchmarks" 41 | harness = false 42 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |

4 | 5 |
6 | 7 | [![Python Versions](https://img.shields.io/pypi/pyversions/perpetual.svg?logo=python&logoColor=white)](https://pypi.org/project/perpetual) 8 | [![PyPI Version](https://img.shields.io/pypi/v/perpetual.svg?logo=pypi&logoColor=white)](https://pypi.org/project/perpetual) 9 | [![Crates.io Version](https://img.shields.io/crates/v/perpetual?logo=rust&logoColor=white)](https://crates.io/crates/perpetual) 10 | [![Static Badge](https://img.shields.io/badge/join-discord-blue?logo=discord)](https://discord.gg/AyUK7rr6wy) 11 | ![PyPI - Downloads](https://img.shields.io/pypi/dm/perpetual) 12 | 13 | 14 |
15 | 16 | # Perpetual 17 | 18 | PerpetualBooster is a gradient boosting machine (GBM) algorithm that doesn't need hyperparameter optimization unlike other GBM algorithms. Similar to AutoML libraries, it has a `budget` parameter. Increasing the `budget` parameter increases the predictive power of the algorithm and gives better results on unseen data. Start with a small budget (e.g. 0.5) and increase it (e.g. 1.0) once you are confident with your features. If you don't see any improvement with further increasing the `budget`, it means that you are already extracting the most predictive power out of your data. 19 | 20 | ## Usage 21 | 22 | You can use the algorithm like in the example below. Check examples folders for both Rust and Python. 23 | 24 | ```python 25 | from perpetual import PerpetualBooster 26 | 27 | model = PerpetualBooster(objective="SquaredLoss", budget=0.5) 28 | model.fit(X, y) 29 | ``` 30 | 31 | ## Documentation 32 | 33 | Documentation for the Python API can be found [here](https://perpetual-ml.github.io/perpetual) and for the Rust API [here](https://docs.rs/perpetual/latest/perpetual/). 34 | 35 | 36 | ## Benchmark 37 | 38 | ### PerpetualBooster vs. Optuna + LightGBM 39 | 40 | Hyperparameter optimization usually takes 100 iterations with plain GBM algorithms. PerpetualBooster achieves the same accuracy in a single run. Thus, it achieves up to 100x speed-up at the same accuracy with different `budget` levels and with different datasets. 41 | 42 | The following table summarizes the results for the [California Housing](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html) dataset (regression): 43 | 44 | | Perpetual budget | LightGBM n_estimators | Perpetual mse | LightGBM mse | Speed-up wall time | Speed-up cpu time | 45 | | ---------------- | --------------------- | ------------- | ------------ | ------------------ | ----------------- | 46 | | 1.0 | 100 | 0.192 | 0.192 | 54x | 56x | 47 | | 1.5 | 300 | 0.188 | 0.188 | 59x | 58x | 48 | | 2.1 | 1000 | 0.185 | 0.186 | 42x | 41x | 49 | 50 | The following table summarizes the results for the [Cover Types](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_covtype.html) dataset (classification): 51 | 52 | | Perpetual budget | LightGBM n_estimators | Perpetual log loss | LightGBM log loss | Speed-up wall time | Speed-up cpu time | 53 | | ---------------- | --------------------- | ------------------ | ----------------- | ------------------ | ----------------- | 54 | | 0.9 | 100 | 0.091 | 0.084 | 72x | 78x | 55 | 56 | The results can be reproduced using the scripts in the [examples](./python-package/examples) folder. 57 | 58 | ### PerpetualBooster vs. AutoGluon 59 | 60 | PerpetualBooster is a GBM but behaves like AutoML so it is benchmarked also against AutoGluon (v1.2, best quality preset), the current leader in [AutoML benchmark](https://automlbenchmark.streamlit.app/cd_diagram). Top 10 datasets with the most number of rows are selected from [OpenML datasets](https://www.openml.org/) for both regression and classification tasks. 
61 | 62 | The results are summarized in the following table for regression tasks: 63 | 64 | | OpenML Task | Perpetual Training Duration | Perpetual Inference Duration | Perpetual RMSE | AutoGluon Training Duration | AutoGluon Inference Duration | AutoGluon RMSE | 65 | | -------------------------------------------------------- | ----- | ----- | ------------------- | -------- | ------ | ------------------ | 66 | | [Airlines_DepDelay_10M](https://www.openml.org/t/359929) | 518 | 11.3 | 29.0 | 520 | 30.9 | 28.8 | 67 | | [bates_regr_100](https://www.openml.org/t/361940) | 3421 | 15.1 | 1.084 | OOM | OOM | OOM | 68 | | [BNG(libras_move)](https://www.openml.org/t/7327) | 1956 | 4.2 | 2.51 | 1922 | 97.6 | 2.53 | 69 | | [BNG(satellite_image)](https://www.openml.org/t/7326) | 334 | 1.6 | 0.731 | 337 | 10.0 | 0.721 | 70 | | [COMET_MC](https://www.openml.org/t/14949) | 44 | 1.0 | 0.0615 | 47 | 5.0 | 0.0662 | 71 | | [friedman1](https://www.openml.org/t/361939) | 275 | 4.2 | 1.047 | 278 | 5.1 | 1.487 | 72 | | [poker](https://www.openml.org/t/10102) | 38 | 0.6 | 0.256 | 41 | 1.2 | 0.722 | 73 | | [subset_higgs](https://www.openml.org/t/361955) | 868 | 10.6 | 0.420 | 870 | 24.5 | 0.421 | 74 | | [BNG(autoHorse)](https://www.openml.org/t/7319) | 107 | 1.1 | 19.0 | 107 | 3.2 | 20.5 | 75 | | [BNG(pbc)](https://www.openml.org/t/7318) | 48 | 0.6 | 836.5 | 51 | 0.2 | 957.1 | 76 | | average | 465 | 3.9 | - | 464 | 19.7 | - | 77 | 78 | PerpetualBooster outperformed AutoGluon on 8 out of 10 regression tasks, training equally fast and inferring 5.1x faster. 79 | 80 | The results are summarized in the following table for classification tasks: 81 | 82 | | OpenML Task | Perpetual Training Duration | Perpetual Inference Duration | Perpetual AUC | AutoGluon Training Duration | AutoGluon Inference Duration | AutoGluon AUC | 83 | | -------------------------------------------------------- | ------- | ------ | ------------------- | -------- | ------ | ------------------ | 84 | | [BNG(spambase)](https://www.openml.org/t/146163) | 70.1 | 2.1 | 0.671 | 73.1 | 3.7 | 0.669 | 85 | | [BNG(trains)](https://www.openml.org/t/208) | 89.5 | 1.7 | 0.996 | 106.4 | 2.4 | 0.994 | 86 | | [breast](https://www.openml.org/t/361942) | 13699.3 | 97.7 | 0.991 | 13330.7 | 79.7 | 0.949 | 87 | | [Click_prediction_small](https://www.openml.org/t/7291) | 89.1 | 1.0 | 0.749 | 101.0 | 2.8 | 0.703 | 88 | | [colon](https://www.openml.org/t/361938) | 12435.2 | 126.7 | 0.997 | 12356.2 | 152.3 | 0.997 | 89 | | [Higgs](https://www.openml.org/t/362113) | 3485.3 | 40.9 | 0.843 | 3501.4 | 67.9 | 0.816 | 90 | | [SEA(50000)](https://www.openml.org/t/230) | 21.9 | 0.2 | 0.936 | 25.6 | 0.5 | 0.935 | 91 | | [sf-police-incidents](https://www.openml.org/t/359994) | 85.8 | 1.5 | 0.687 | 99.4 | 2.8 | 0.659 | 92 | | [bates_classif_100](https://www.openml.org/t/361941) | 11152.8 | 50.0 | 0.864 | OOM | OOM | OOM | 93 | | [prostate](https://www.openml.org/t/361945) | 13699.9 | 79.8 | 0.987 | OOM | OOM | OOM | 94 | | average | 3747.0 | 34.0 | - | 3699.2 | 39.0 | - | 95 | 96 | PerpetualBooster outperformed AutoGluon on 10 out of 10 classification tasks, training equally fast and inferring 1.1x faster. 97 | 98 | PerpetualBooster demonstrates greater robustness compared to AutoGluon, successfully training on all 20 tasks, whereas AutoGluon encountered out-of-memory errors on 3 of those tasks. 99 | 100 | The results can be reproduced using the automlbenchmark fork [here](https://github.com/deadsoul44/automlbenchmark). 
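For the Rust API, the snippet below is a minimal sketch that mirrors the Python example in the Usage section above, adapted from `examples/titanic.rs`. The inline data and the `budget` value are illustrative placeholders only, used to keep the sketch self-contained; they are not a recommended configuration.

```rust
use perpetual::{objective_functions::Objective, Matrix, PerpetualBooster};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Tiny stand-in data: two feature columns stored column-major, four rows.
    let data: Vec<f64> = vec![1.0, 2.0, 3.0, 4.0, 10.0, 20.0, 30.0, 40.0];
    let y: Vec<f64> = vec![0.0, 1.0, 0.0, 1.0];

    // Matrix::new takes the flat column-major values, the row count, and the column count.
    let matrix = Matrix::new(&data, y.len(), 2);

    // Configure the booster through the `set_` methods and fit with the chosen budget.
    let mut model = PerpetualBooster::default()
        .set_objective(Objective::LogLoss)
        .set_budget(0.5);
    model.fit(&matrix, &y, None)?;

    println!("{:?}", model.predict(&matrix, true));
    Ok(())
}
```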
101 | 102 | 103 | 104 | ## Installation 105 | 106 | The package can be installed directly from [pypi](https://pypi.org/project/perpetual): 107 | 108 | ```shell 109 | pip install perpetual 110 | ``` 111 | 112 | Using [conda-forge](https://anaconda.org/conda-forge/perpetual): 113 | 114 | ```shell 115 | conda install conda-forge::perpetual 116 | ``` 117 | 118 | To use in a Rust project and to get the package from [crates.io](https://crates.io/crates/perpetual): 119 | 120 | ```shell 121 | cargo add perpetual 122 | ``` 123 | 124 | ## Contribution 125 | 126 | Contributions are welcome. Check CONTRIBUTING.md for the guideline. 127 | 128 | ## Paper 129 | 130 | PerpetualBooster prevents overfitting with a generalization algorithm. The paper is work-in-progress to explain how the algorithm works. Check our [blog post](https://perpetual-ml.com/blog/how-perpetual-works) for a high level introduction to the algorithm. 131 | -------------------------------------------------------------------------------- /benches/perpetual_benchmarks.rs: -------------------------------------------------------------------------------- 1 | use criterion::{black_box, criterion_group, criterion_main, Criterion}; 2 | use perpetual::binning::bin_matrix; 3 | use perpetual::constraints::ConstraintMap; 4 | use perpetual::data::Matrix; 5 | use perpetual::histogram::{NodeHistogram, NodeHistogramOwned}; 6 | use perpetual::objective_functions::{LogLoss, ObjectiveFunction}; 7 | use perpetual::splitter::{MissingImputerSplitter, SplitInfo, SplitInfoSlice}; 8 | use perpetual::tree::tree::Tree; 9 | use perpetual::utils::{fast_f64_sum, fast_sum, naive_sum}; 10 | use perpetual::PerpetualBooster; 11 | use std::fs; 12 | use std::time::Duration; 13 | 14 | pub fn tree_benchmarks(c: &mut Criterion) { 15 | let file = fs::read_to_string("resources/contiguous_no_missing_100k_samp_seed0.csv") 16 | .expect("Something went wrong reading the file"); 17 | let data_vec: Vec = file.lines().map(|x| x.parse::().unwrap()).collect(); 18 | let file = 19 | fs::read_to_string("resources/performance_100k_samp_seed0.csv").expect("Something went wrong reading the file"); 20 | let y: Vec = file.lines().map(|x| x.parse::().unwrap()).collect(); 21 | let yhat = vec![0.5; y.len()]; 22 | let (mut g, mut h) = LogLoss::calc_grad_hess(&y, &yhat, None, None); 23 | let loss = LogLoss::calc_loss(&y, &yhat, None, None); 24 | 25 | let v: Vec = vec![10.; 300000]; 26 | c.bench_function("Niave Sum", |b| b.iter(|| naive_sum(black_box(&v)))); 27 | c.bench_function("fast sum", |b| b.iter(|| fast_sum(black_box(&v)))); 28 | c.bench_function("fast f64 sum", |b| b.iter(|| fast_f64_sum(black_box(&v)))); 29 | 30 | c.bench_function("calc_grad_hess", |b| { 31 | b.iter(|| LogLoss::calc_grad_hess(black_box(&y), black_box(&yhat), black_box(None), black_box(None))) 32 | }); 33 | 34 | let data = Matrix::new(&data_vec, y.len(), 5); 35 | let splitter = MissingImputerSplitter::new(0.3, true, ConstraintMap::new()); 36 | let mut tree = Tree::new(); 37 | 38 | let bindata = bin_matrix(&data, None, 300, f64::NAN, None).unwrap(); 39 | let bdata = Matrix::new(&bindata.binned_data, data.rows, data.cols); 40 | let col_index: Vec = (0..data.cols).collect(); 41 | 42 | let n_nodes_alloc = 100; 43 | 44 | let mut hist_tree_owned: Vec = (0..n_nodes_alloc) 45 | .map(|_| NodeHistogramOwned::empty_from_cuts(&bindata.cuts, &col_index, false, true)) 46 | .collect(); 47 | 48 | let mut hist_tree: Vec = hist_tree_owned 49 | .iter_mut() 50 | .map(|node_hist| NodeHistogram::from_owned(node_hist)) 51 | .collect(); 52 | 53 | 
let pool = rayon::ThreadPoolBuilder::new().num_threads(2).build().unwrap(); 54 | 55 | let mut split_info_vec: Vec = (0..col_index.len()).map(|_| SplitInfo::default()).collect(); 56 | let split_info_slice = SplitInfoSlice::new(&mut split_info_vec); 57 | 58 | tree.fit( 59 | &bdata, 60 | data.index.to_owned(), 61 | &col_index, 62 | &mut g, 63 | h.as_deref_mut(), 64 | &splitter, 65 | &pool, 66 | Some(f32::MAX), 67 | &loss, 68 | &y, 69 | LogLoss::calc_loss, 70 | &yhat, 71 | None, 72 | None, 73 | false, 74 | &mut hist_tree, 75 | None, 76 | &split_info_slice, 77 | n_nodes_alloc, 78 | ); 79 | 80 | println!("{}", tree.nodes.len()); 81 | c.bench_function("Train Tree", |b| { 82 | b.iter(|| { 83 | let mut train_tree: Tree = Tree::new(); 84 | 85 | train_tree.fit( 86 | black_box(&bdata), 87 | black_box(data.index.to_owned()), 88 | black_box(&col_index), 89 | black_box(&mut g), 90 | black_box(h.as_deref_mut()), 91 | black_box(&splitter), 92 | black_box(&pool), 93 | Some(f32::MAX), 94 | black_box(&loss), 95 | black_box(&y), 96 | black_box(LogLoss::calc_loss), 97 | black_box(&yhat), 98 | None, 99 | None, 100 | false, 101 | black_box(&mut hist_tree), 102 | None, 103 | black_box(&split_info_slice), 104 | n_nodes_alloc, 105 | ); 106 | }) 107 | }); 108 | c.bench_function("Train Tree - column subset", |b| { 109 | b.iter(|| { 110 | let mut train_tree: Tree = Tree::new(); 111 | 112 | train_tree.fit( 113 | black_box(&bdata), 114 | black_box(data.index.to_owned()), 115 | black_box(&[1, 3, 4]), 116 | black_box(&mut g), 117 | black_box(h.as_deref_mut()), 118 | black_box(&splitter), 119 | black_box(&pool), 120 | Some(f32::MAX), 121 | black_box(&loss), 122 | black_box(&y), 123 | black_box(LogLoss::calc_loss), 124 | black_box(&yhat), 125 | None, 126 | None, 127 | false, 128 | black_box(&mut hist_tree), 129 | None, 130 | black_box(&split_info_slice), 131 | n_nodes_alloc, 132 | ); 133 | }) 134 | }); 135 | c.bench_function("Tree Predict (Single Threaded)", |b| { 136 | b.iter(|| tree.predict(black_box(&data), black_box(false), black_box(&f64::NAN))) 137 | }); 138 | c.bench_function("Tree Predict (Multi Threaded)", |b| { 139 | b.iter(|| tree.predict(black_box(&data), black_box(true), black_box(&f64::NAN))) 140 | }); 141 | 142 | // Gradient Booster 143 | // Bench building 144 | let mut booster_train = c.benchmark_group("train_booster"); 145 | booster_train.warm_up_time(Duration::from_secs(10)); 146 | booster_train.sample_size(50); 147 | // booster_train.sampling_mode(SamplingMode::Linear); 148 | booster_train.bench_function("train_booster_default", |b| { 149 | b.iter(|| { 150 | let mut booster = PerpetualBooster::default().set_budget(0.3); 151 | booster.fit(black_box(&data), black_box(&y), black_box(None)).unwrap(); 152 | }) 153 | }); 154 | booster_train.bench_function("train_booster_with_column_sampling", |b| { 155 | b.iter(|| { 156 | let mut booster = PerpetualBooster::default().set_budget(0.3); 157 | booster.fit(black_box(&data), black_box(&y), black_box(None)).unwrap(); 158 | }) 159 | }); 160 | let mut booster = PerpetualBooster::default().set_budget(0.1); 161 | booster.fit(&data, &y, None).unwrap(); 162 | booster_train.bench_function("Predict Booster", |b| { 163 | b.iter(|| booster.predict(black_box(&data), false)) 164 | }); 165 | } 166 | 167 | criterion_group!(benches, tree_benchmarks); 168 | criterion_main!(benches); 169 | -------------------------------------------------------------------------------- /examples/cal_housing.rs: -------------------------------------------------------------------------------- 1 | //! 
An example using the `california housing` dataset 2 | 3 | // cargo run --release --example cal_housing 1.0 1 4 | 5 | // cargo build --release --example cal_housing 6 | // hyperfine --runs 3 ./target/release/examples/cal_housing 7 | // hyperfine --runs 3 .\target\release\examples\cal_housing 8 | // hyperfine --runs 11 'cargo run --release --example cal_housing 0.1 0.3 2' 9 | // hyperfine --runs 11 'cargo run --release --example cal_housing 2.0' 10 | 11 | // cargo flamegraph --example cal_housing 12 | 13 | use perpetual::{objective_functions::Objective, Matrix, PerpetualBooster}; 14 | use polars::prelude::*; 15 | use std::env; 16 | use std::error::Error; 17 | use std::time::SystemTime; 18 | 19 | pub fn mse(y_test: &[f64], y_pred: &[f64]) -> f32 { 20 | let mut error = 0.0; 21 | for i in 0..y_test.len() { 22 | error += (y_test[i] - y_pred[i]) * (y_test[i] - y_pred[i]); 23 | } 24 | let e = error / y_test.len() as f64; 25 | e as f32 26 | } 27 | 28 | fn main() -> Result<(), Box> { 29 | let args: Vec = env::args().collect(); 30 | let budget = &args[1].parse::().unwrap_or(1.0); 31 | let num_threads = &args[2].parse::().unwrap_or(1); 32 | 33 | let all_names = [ 34 | "MedInc".to_string(), 35 | "HouseAge".to_string(), 36 | "AveRooms".to_string(), 37 | "AveBedrms".to_string(), 38 | "Population".to_string(), 39 | "AveOccup".to_string(), 40 | "Latitude".to_string(), 41 | "Longitude".to_string(), 42 | "MedHouseVal".to_string(), 43 | ]; 44 | 45 | let feature_names = [ 46 | "MedInc".to_string(), 47 | "HouseAge".to_string(), 48 | "AveRooms".to_string(), 49 | "AveBedrms".to_string(), 50 | "Population".to_string(), 51 | "AveOccup".to_string(), 52 | "Latitude".to_string(), 53 | "Longitude".to_string(), 54 | ]; 55 | 56 | let column_names_train = Arc::new(all_names.clone()); 57 | let column_names_test = Arc::new(all_names.clone()); 58 | 59 | let df_train = CsvReadOptions::default() 60 | .with_has_header(true) 61 | .with_columns(Some(column_names_train)) 62 | .try_into_reader_with_file_path(Some("resources/cal_housing_train.csv".into()))? 63 | .finish() 64 | .unwrap(); 65 | 66 | let df_test = CsvReadOptions::default() 67 | .with_has_header(true) 68 | .with_columns(Some(column_names_test)) 69 | .try_into_reader_with_file_path(Some("resources/cal_housing_test.csv".into()))? 70 | .finish() 71 | .unwrap(); 72 | 73 | // Get data in column major format... 74 | let id_vars_train: Vec<&str> = Vec::new(); 75 | let mdf_train = df_train.unpivot(feature_names.clone(), &id_vars_train)?; 76 | let id_vars_test: Vec<&str> = Vec::new(); 77 | let mdf_test = df_test.unpivot(feature_names, &id_vars_test)?; 78 | 79 | let data_train = Vec::from_iter( 80 | mdf_train 81 | .select_at_idx(1) 82 | .expect("Invalid column") 83 | .f64()? 84 | .into_iter() 85 | .map(|v| v.unwrap_or(f64::NAN)), 86 | ); 87 | let data_test = Vec::from_iter( 88 | mdf_test 89 | .select_at_idx(1) 90 | .expect("Invalid column") 91 | .f64()? 92 | .into_iter() 93 | .map(|v| v.unwrap_or(f64::NAN)), 94 | ); 95 | 96 | let y_train = Vec::from_iter( 97 | df_train 98 | .column("MedHouseVal")? 99 | .cast(&DataType::Float64)? 100 | .f64()? 101 | .into_iter() 102 | .map(|v| v.unwrap_or(f64::NAN)), 103 | ); 104 | let y_test = Vec::from_iter( 105 | df_test 106 | .column("MedHouseVal")? 107 | .cast(&DataType::Float64)? 108 | .f64()? 109 | .into_iter() 110 | .map(|v| v.unwrap_or(f64::NAN)), 111 | ); 112 | 113 | // Create Matrix from ndarray. 
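// Note: the flattened vectors built above are column-major (one feature column after another), so Matrix::new receives the values, the row count (length of the target vector), and the column count (8 features here).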
114 | let matrix_train = Matrix::new(&data_train, y_train.len(), 8); 115 | let matrix_test = Matrix::new(&data_test, y_test.len(), 8); 116 | 117 | // Create booster. 118 | // To provide parameters generate a default booster, and then use 119 | // the relevant `set_` methods for any parameters you would like to 120 | // adjust. 121 | let mut model = PerpetualBooster::default() 122 | .set_objective(Objective::SquaredLoss) 123 | .set_num_threads(Some(*num_threads)) 124 | .set_budget(*budget); 125 | 126 | let now = SystemTime::now(); 127 | model.fit(&matrix_train, &y_train, None)?; 128 | println!("now.elapsed: {:?}", now.elapsed().unwrap().as_secs_f32()); 129 | 130 | let trees = model.get_prediction_trees(); 131 | println!("n_rounds: {:?}", trees.len()); 132 | 133 | let n_leaves: usize = trees.iter().map(|t| (t.nodes.len() + 1) / 2).sum(); 134 | println!("n_leaves: {:?}", n_leaves); 135 | 136 | let y_pred = model.predict(&matrix_train, true); 137 | let error = mse(&y_train, &y_pred); 138 | println!("mse_train: {:?}", error); 139 | 140 | let y_pred = model.predict(&matrix_test, true); 141 | let error = mse(&y_test, &y_pred); 142 | println!("mse_test: {:?}", error); 143 | 144 | println!("tree:"); 145 | for t in trees { 146 | println!("{}", t); 147 | } 148 | 149 | Ok(()) 150 | } 151 | -------------------------------------------------------------------------------- /examples/cover_types.rs: -------------------------------------------------------------------------------- 1 | //! An example using the `cover types` dataset 2 | 3 | // cargo run --release --example cover_types 1.0 4 | 5 | // cargo build --release --example cover_types 6 | // hyperfine --runs 3 ./target/release/examples/cover_types 7 | // hyperfine --runs 3 .\target\release\examples\cover_types 1.0 8 | // hyperfine --runs 3 'cargo run --release --example cover_types 1.0' 9 | 10 | // cargo flamegraph --example cover_types 11 | 12 | use perpetual::{objective_functions::Objective, Matrix, PerpetualBooster}; 13 | use polars::prelude::*; 14 | use std::env; 15 | use std::error::Error; 16 | 17 | pub fn mse(y_test: &[f64], y_pred: &[f64]) -> f32 { 18 | let mut error = 0.0; 19 | for i in 0..y_test.len() { 20 | error += (y_test[i] - y_pred[i]) * (y_test[i] - y_pred[i]); 21 | } 22 | let e = error / y_test.len() as f64; 23 | e as f32 24 | } 25 | 26 | pub fn multiclass_log_loss(y_true: &[f64], y_pred: &[Vec]) -> f64 { 27 | let mut losses = vec![0.0; y_true.len()]; 28 | let eps = 1e-11; 29 | for (i, y_p) in y_pred.iter().enumerate() { 30 | let y_p_exp = y_p.iter().map(|e| e.exp()).collect::>(); 31 | let y_p_exp_sum = y_p_exp.iter().sum::(); 32 | let probabilities = y_p_exp.iter().map(|e| e / y_p_exp_sum).collect::>(); 33 | let cls_idx = (y_true[i] - 1.0) as usize; 34 | let p = f64::max(eps, f64::min(1.0 - eps, probabilities[cls_idx])); 35 | losses[i] = -1.0 * p.ln(); 36 | } 37 | losses.iter().sum::() / losses.len() as f64 38 | } 39 | 40 | fn main() -> Result<(), Box> { 41 | let args: Vec = env::args().collect(); 42 | let budget = &args[1].parse::().unwrap_or(1.0); 43 | 44 | let mut features: Vec<&str> = [ 45 | "Elevation", 46 | "Aspect", 47 | "Slope", 48 | "Horizontal_Distance_To_Hydrology", 49 | "Vertical_Distance_To_Hydrology", 50 | "Horizontal_Distance_To_Roadways", 51 | "Hillshade_9am", 52 | "Hillshade_Noon", 53 | "Hillshade_3pm", 54 | "Horizontal_Distance_To_Fire_Points", 55 | "Wilderness_Area_0", 56 | "Wilderness_Area_1", 57 | "Wilderness_Area_2", 58 | "Wilderness_Area_3", 59 | ] 60 | .to_vec(); 61 | 62 | let soil_types = (0..40).map(|i| 
format!("{}_{}", "Soil_Type", i)).collect::>(); 63 | let s_types = soil_types.iter().map(|s| s.as_str()).collect::>(); 64 | features.extend(s_types); 65 | 66 | let mut features_and_target = features.clone(); 67 | features_and_target.push("Cover_Type"); 68 | 69 | let features_and_target_arc1 = features_and_target 70 | .iter() 71 | .map(|s| String::from(s.to_owned())) 72 | .collect::>() 73 | .into(); 74 | 75 | let features_and_target_arc2 = features_and_target 76 | .iter() 77 | .map(|s| String::from(s.to_owned())) 78 | .collect::>() 79 | .into(); 80 | 81 | let df_train = CsvReadOptions::default() 82 | .with_has_header(true) 83 | .with_columns(Some(features_and_target_arc1)) 84 | .try_into_reader_with_file_path(Some("resources/cover_types_train.csv".into()))? 85 | .finish() 86 | .unwrap(); 87 | 88 | let df_test = CsvReadOptions::default() 89 | .with_has_header(true) 90 | .with_columns(Some(features_and_target_arc2)) 91 | .try_into_reader_with_file_path(Some("resources/cover_types_test.csv".into()))? 92 | .finish() 93 | .unwrap(); 94 | 95 | // Get data in column major format... 96 | let id_vars_train: Vec<&str> = Vec::new(); 97 | let mdf_train = df_train.unpivot(&features, &id_vars_train)?; 98 | let id_vars_test: Vec<&str> = Vec::new(); 99 | let mdf_test = df_test.unpivot(&features, &id_vars_test)?; 100 | 101 | let data_train = Vec::from_iter( 102 | mdf_train 103 | .select_at_idx(1) 104 | .expect("Invalid column") 105 | .f64()? 106 | .into_iter() 107 | .map(|v| v.unwrap_or(f64::NAN)), 108 | ); 109 | let data_test = Vec::from_iter( 110 | mdf_test 111 | .select_at_idx(1) 112 | .expect("Invalid column") 113 | .f64()? 114 | .into_iter() 115 | .map(|v| v.unwrap_or(f64::NAN)), 116 | ); 117 | 118 | let y_train = Vec::from_iter( 119 | df_train 120 | .column("Cover_Type")? 121 | .cast(&DataType::Float64)? 122 | .f64()? 123 | .into_iter() 124 | .map(|v| v.unwrap_or(f64::NAN)), 125 | ); 126 | let y_test = Vec::from_iter( 127 | df_test 128 | .column("Cover_Type")? 129 | .cast(&DataType::Float64)? 130 | .f64()? 131 | .into_iter() 132 | .map(|v| v.unwrap_or(f64::NAN)), 133 | ); 134 | 135 | // Create Matrix from ndarray. 
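// 54 columns = 10 numeric features + 4 wilderness-area indicators + 40 one-hot soil types.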
136 | let matrix_train = Matrix::new(&data_train, y_train.len(), 54); 137 | let matrix_test = Matrix::new(&data_test, y_test.len(), 54); 138 | 139 | let mut raw_train_array = vec![vec![0.0; 7]; y_train.len()]; 140 | let mut raw_test_array = vec![vec![0.0; 7]; y_test.len()]; 141 | for i in 1..8 { 142 | println!(); 143 | 144 | let mut model = PerpetualBooster::default() 145 | .set_objective(Objective::LogLoss) 146 | .set_budget(*budget); 147 | 148 | let y_tr: Vec = y_train 149 | .iter() 150 | .map(|y| if (*y as i32) == i { 1.0 } else { 0.0 }) 151 | .collect(); 152 | 153 | model.fit(&matrix_train, &y_tr, None)?; 154 | println!("Completed fitting model number: {}", i); 155 | 156 | let trees = model.get_prediction_trees(); 157 | println!("n_rounds: {:?}", trees.len()); 158 | 159 | let n_leaves: usize = trees.iter().map(|t| (t.nodes.len() + 1) / 2).sum(); 160 | println!("n_leaves: {:?}", n_leaves); 161 | 162 | let y_pred_train = model.predict(&matrix_train, true); 163 | let y_pred_test = model.predict(&matrix_test, true); 164 | 165 | raw_train_array 166 | .iter_mut() 167 | .enumerate() 168 | .for_each(|(idx, raw)| raw[(i - 1) as usize] = y_pred_train[idx]); 169 | raw_test_array 170 | .iter_mut() 171 | .enumerate() 172 | .for_each(|(idx, raw)| raw[(i - 1) as usize] = y_pred_test[idx]); 173 | } 174 | 175 | let loss_train = multiclass_log_loss(&y_train, &raw_train_array); 176 | let loss_test = multiclass_log_loss(&y_test, &raw_test_array); 177 | 178 | println!("loss_train: {}", loss_train); 179 | println!("loss_test: {}", loss_test); 180 | 181 | Ok(()) 182 | } 183 | -------------------------------------------------------------------------------- /examples/titanic.rs: -------------------------------------------------------------------------------- 1 | //! An example using the `titanic` dataset 2 | use perpetual::objective_functions::Objective; 3 | use perpetual::{Matrix, PerpetualBooster}; 4 | use polars::prelude::*; 5 | use std::env; 6 | use std::error::Error; 7 | 8 | fn main() -> Result<(), Box> { 9 | let args: Vec = env::args().collect(); 10 | let budget = &args[1].parse::().unwrap(); 11 | 12 | let features_and_target = ["survived", "pclass", "age", "sibsp", "parch", "fare"]; 13 | 14 | let features_and_target_arc = features_and_target 15 | .iter() 16 | .map(|s| String::from(s.to_owned())) 17 | .collect::>() 18 | .into(); 19 | 20 | let df = CsvReadOptions::default() 21 | .with_has_header(true) 22 | .with_columns(Some(features_and_target_arc)) 23 | .try_into_reader_with_file_path(Some("resources/titanic.csv".into()))? 24 | .finish() 25 | .unwrap(); 26 | 27 | // Get data in column major format... 28 | let id_vars: Vec<&str> = Vec::new(); 29 | let mdf = df.unpivot(["pclass", "age", "sibsp", "parch", "fare"], id_vars)?; 30 | 31 | let data = Vec::from_iter( 32 | mdf.select_at_idx(1) 33 | .expect("Invalid column") 34 | .f64()? 35 | .into_iter() 36 | .map(|v| v.unwrap_or(f64::NAN)), 37 | ); 38 | let y = Vec::from_iter( 39 | df.column("survived")? 40 | .cast(&DataType::Float64)? 41 | .f64()? 42 | .into_iter() 43 | .map(|v| v.unwrap_or(f64::NAN)), 44 | ); 45 | 46 | // Create Matrix from ndarray. 47 | let matrix = Matrix::new(&data, y.len(), 5); 48 | 49 | // Create booster. 50 | // To provide parameters generate a default booster, and then use 51 | // the relevant `set_` methods for any parameters you would like to 52 | // adjust. 
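// For example, `.set_num_threads(...)` is chained this way in examples/cal_housing.rs.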
53 | let mut model = PerpetualBooster::default() 54 | .set_objective(Objective::LogLoss) 55 | .set_budget(*budget); 56 | model.fit(&matrix, &y, None)?; 57 | 58 | println!("Model prediction: {:?} ...", &model.predict(&matrix, true)[0..10]); 59 | 60 | Ok(()) 61 | } 62 | -------------------------------------------------------------------------------- /python-package/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | .pytest_cache/ 6 | *.py[cod] 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | .venv/ 14 | env/ 15 | bin/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | include/ 26 | man/ 27 | venv/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | pip-selfcheck.json 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | 45 | # Translations 46 | *.mo 47 | 48 | # Mr Developer 49 | .mr.developer.cfg 50 | .project 51 | .pydevproject 52 | 53 | # Rope 54 | .ropeproject 55 | 56 | # Django stuff: 57 | *.log 58 | *.pot 59 | 60 | .DS_Store 61 | 62 | # Sphinx documentation 63 | docs/_build/ 64 | 65 | # PyCharm 66 | .idea/ 67 | 68 | # VSCode 69 | .vscode/ 70 | 71 | # Pyenv 72 | .python-version -------------------------------------------------------------------------------- /python-package/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "py-perpetual" 3 | version = "0.9.3" 4 | edition = "2021" 5 | authors = ["Mutlu Simsek "] 6 | homepage = "https://perpetual-ml.com" 7 | description = "A self-generalizing gradient boosting machine that doesn't need hyperparameter optimization" 8 | license-file = "LICENSE" 9 | readme = "README.md" 10 | repository = "https://github.com/perpetual-ml/perpetual" 11 | 12 | keywords = ["machine-learning", "perpetual", "ai", "ml"] 13 | categories = ["algorithms", "mathematics", "science"] 14 | 15 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 16 | [lib] 17 | name = "perpetual" 18 | crate-type = ["cdylib", "rlib"] 19 | 20 | [dependencies] 21 | pyo3 = { version = "0.24.1", features = ["extension-module"] } 22 | perpetual_rs = {package="perpetual", version = "0.9.3", path = "../" } 23 | numpy = "0.24.0" 24 | ndarray = "0.16.1" 25 | serde_plain = { version = "1.0.2" } 26 | serde = { version = "1.0.219" } 27 | pyo3-log = "0.12.3" 28 | -------------------------------------------------------------------------------- /python-package/docs/index.md: -------------------------------------------------------------------------------- 1 | # Perpetual 2 | 3 | ## Python API Reference 4 | 5 | PyPI - Version 6 | 7 | Crates.io Version 8 | 9 | The `PerpetualBooster` class is currently the only public facing class in the package, and can be used to train gradient boosted decision tree ensembles with multiple objective functions. 10 | 11 | ::: perpetual.PerpetualBooster 12 | 13 | ## Logging output 14 | 15 | Info is logged while the model is being trained if the `log_iterations` parameter is set to a value greater than `0` while fitting the booster. The logs can be printed to stdout while training like so. 
16 | 17 | ```python 18 | import logging 19 | logging.basicConfig() 20 | logging.getLogger().setLevel(logging.INFO) 21 | 22 | model = PerpetualBooster(log_iterations=1) 23 | model.fit(X, y) 24 | 25 | # INFO:perpetual.perpetualbooster:Completed iteration 0 of 10 26 | # INFO:perpetual.perpetualbooster:Completed iteration 1 of 10 27 | # INFO:perpetual.perpetualbooster:Completed iteration 2 of 10 28 | ``` 29 | 30 | The log output can also be captured in a file also using the `logging.basicConfig()` `filename` option. 31 | 32 | ```python 33 | import logging 34 | logging.basicConfig(filename="training-info.log") 35 | logging.getLogger().setLevel(logging.INFO) 36 | 37 | model = PerpetualBooster(log_iterations=10) 38 | model.fit(X, y) 39 | ``` 40 | -------------------------------------------------------------------------------- /python-package/examples/benchmark_lgbm.py: -------------------------------------------------------------------------------- 1 | import optuna 2 | import numpy as np 3 | from time import process_time, time 4 | from functools import partial 5 | from lightgbm import LGBMRegressor, LGBMClassifier 6 | from sklearn.metrics import mean_squared_error, log_loss 7 | from sklearn.datasets import fetch_covtype, fetch_california_housing 8 | from sklearn.model_selection import train_test_split, cross_validate 9 | 10 | 11 | def prepare_data(cal_housing, seed): 12 | if cal_housing: 13 | data, target = fetch_california_housing(return_X_y=True, as_frame=True) 14 | scoring = "neg_mean_squared_error" 15 | metric_function = mean_squared_error 16 | metric_name = "mse" 17 | LGBMBooster = LGBMRegressor 18 | else: 19 | data, target = fetch_covtype(return_X_y=True, as_frame=True) 20 | scoring = "neg_log_loss" 21 | metric_function = log_loss 22 | metric_name = "log_loss" 23 | LGBMBooster = LGBMClassifier 24 | X_train, X_test, y_train, y_test = train_test_split( 25 | data, target, test_size=0.2248, random_state=seed 26 | ) 27 | return ( 28 | X_train, 29 | X_test, 30 | y_train, 31 | y_test, 32 | scoring, 33 | metric_function, 34 | metric_name, 35 | LGBMBooster, 36 | ) 37 | 38 | 39 | best_cv_results = None 40 | cv_results = None 41 | 42 | 43 | def save_best_cv_results(study, trial): 44 | global best_cv_results 45 | if study.best_trial.number == trial.number: 46 | best_cv_results = cv_results 47 | 48 | 49 | def objective_function( 50 | trial, seed, n_estimators, LGBMBooster, X_train, y_train, scoring 51 | ): 52 | global cv_results 53 | params = { 54 | "seed": seed, 55 | "verbosity": -1, 56 | "n_estimators": n_estimators, 57 | "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.5, log=True), 58 | "min_split_gain": trial.suggest_float("min_split_gain", 1e-6, 1.0, log=True), 59 | "reg_alpha": trial.suggest_float("reg_alpha", 1e-6, 1.0, log=True), 60 | "reg_lambda": trial.suggest_float("reg_lambda", 1e-6, 1.0, log=True), 61 | "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0), 62 | "subsample": trial.suggest_float("subsample", 0.2, 1.0), 63 | "subsample_freq": trial.suggest_int("subsample_freq", 1, 10), 64 | "max_depth": trial.suggest_int("max_depth", 3, 33), 65 | "num_leaves": trial.suggest_int("num_leaves", 2, 1024), 66 | "min_child_samples": trial.suggest_int("min_child_samples", 1, 100), 67 | } 68 | model = LGBMBooster(**params) 69 | cv_results = cross_validate( 70 | model, 71 | X_train, 72 | y_train, 73 | cv=5, 74 | scoring=scoring, 75 | return_train_score=True, 76 | return_estimator=True, 77 | ) 78 | return -1 * np.mean(cv_results["test_score"]) 79 | 80 | 81 | if __name__ 
== "__main__": 82 | optuna.logging.set_verbosity(optuna.logging.WARNING) 83 | cal_housing = True # True -> California Housing, False -> Cover Types 84 | n_estimators = 100 85 | n_trials = 100 86 | cpu_times = [] 87 | wall_times = [] 88 | metrics = [] 89 | 90 | for seed in range(5): 91 | ( 92 | X_train, 93 | X_test, 94 | y_train, 95 | y_test, 96 | scoring, 97 | metric_function, 98 | metric_name, 99 | LGBMBooster, 100 | ) = prepare_data(cal_housing, seed) 101 | 102 | sampler = optuna.samplers.TPESampler(seed=seed) 103 | study = optuna.create_study(direction="minimize", sampler=sampler) 104 | 105 | obj = partial( 106 | objective_function, 107 | seed=seed, 108 | n_estimators=n_estimators, 109 | LGBMBooster=LGBMBooster, 110 | X_train=X_train, 111 | y_train=y_train, 112 | scoring=scoring, 113 | ) 114 | 115 | start = process_time() 116 | tick = time() 117 | study.optimize(obj, n_trials=n_trials, callbacks=[save_best_cv_results]) 118 | stop = process_time() 119 | cpu_times.append(stop - start) 120 | wall_times.append(time() - tick) 121 | 122 | models = best_cv_results["estimator"] 123 | if metric_name == "log_loss": 124 | y_pred = np.mean([model.predict_proba(X_test) for model in models], axis=0) 125 | else: 126 | y_pred = np.mean([model.predict(X_test) for model in models], axis=0) 127 | metric = metric_function(y_test, y_pred) 128 | metrics.append(metric) 129 | 130 | print(f"seed: {seed}, cpu time: {stop - start}, {metric_name}: {metric}") 131 | 132 | print(f"avg cpu time: {np.mean(cpu_times)}, avg {metric_name}: {np.mean(metrics)}") 133 | print(f"avg wall time: {np.mean(wall_times)}") 134 | print(f"cpu time / wall time: {(np.mean(cpu_times)/np.mean(wall_times)):.1f}") 135 | -------------------------------------------------------------------------------- /python-package/examples/benchmark_perpetual.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from time import process_time, time 3 | from perpetual import PerpetualBooster 4 | from sklearn.model_selection import train_test_split 5 | from sklearn.metrics import mean_squared_error, log_loss 6 | from sklearn.datasets import fetch_covtype, fetch_california_housing 7 | from importlib.metadata import version 8 | 9 | 10 | def prepare_data(cal_housing, seed): 11 | if cal_housing: 12 | data, target = fetch_california_housing(return_X_y=True, as_frame=True) 13 | metric_function = mean_squared_error 14 | metric_name = "mse" 15 | objective = "SquaredLoss" 16 | else: 17 | data, target = fetch_covtype(return_X_y=True, as_frame=True) 18 | metric_function = log_loss 19 | metric_name = "log_loss" 20 | objective = "LogLoss" 21 | X_train, X_test, y_train, y_test = train_test_split( 22 | data, target, test_size=0.2248, random_state=seed 23 | ) 24 | return X_train, X_test, y_train, y_test, metric_function, metric_name, objective 25 | 26 | 27 | if __name__ == "__main__": 28 | print(f"perpetual: {version('perpetual')}") 29 | budget = 1.0 30 | num_threads = 2 31 | cal_housing = True # True -> California Housing, False -> Cover Types 32 | cpu_times = [] 33 | wall_times = [] 34 | metrics = [] 35 | 36 | for seed in range(5): 37 | X_train, X_test, y_train, y_test, metric_function, metric_name, objective = ( 38 | prepare_data(cal_housing, seed) 39 | ) 40 | 41 | model = PerpetualBooster( 42 | objective=objective, num_threads=num_threads, log_iterations=0 43 | ) 44 | 45 | start = process_time() 46 | tick = time() 47 | model.fit(X_train, y_train, budget=budget) 48 | stop = process_time() 49 | cpu_times.append(stop 
- start) 50 | wall_times.append(time() - tick) 51 | 52 | if metric_name == "log_loss": 53 | y_pred = model.predict_proba(X_test) 54 | else: 55 | y_pred = model.predict(X_test) 56 | metric = metric_function(y_test, y_pred) 57 | metrics.append(metric) 58 | 59 | print(f"seed: {seed}, cpu time: {stop - start}, {metric_name}: {metric}") 60 | 61 | print(f"avg cpu time: {np.mean(cpu_times)}, avg {metric_name}: {np.mean(metrics)}") 62 | print(f"avg wall time: {np.mean(wall_times)}") 63 | print(f"cpu time / wall time: {(np.mean(cpu_times)/np.mean(wall_times)):.1f}") 64 | -------------------------------------------------------------------------------- /python-package/examples/categorical_data_titanic.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import seaborn as sns\n", 12 | "from scipy.special import expit\n", 13 | "from lightgbm import LGBMClassifier\n", 14 | "from sklearn.metrics import log_loss, accuracy_score\n", 15 | "from sklearn.model_selection import train_test_split\n", 16 | "from perpetual import PerpetualBooster" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "pd.set_option('display.max_rows', 1000)" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "!python --version" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "from importlib.metadata import version\n", 44 | "\n", 45 | "print(f\"numpy: {version('numpy')}\")\n", 46 | "print(f\"optuna: {version('optuna')}\")\n", 47 | "print(f\"lightgbm: {version('lightgbm')}\")\n", 48 | "print(f\"scikit-learn: {version('scikit-learn')}\")\n", 49 | "print(f\"perpetual: {version('perpetual')}\")" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 5, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "df = sns.load_dataset(\"titanic\")" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 6, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "df.drop(columns=[\"alive\"], inplace=True)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 7, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "X = df.drop(columns=[\"survived\"])\n", 77 | "y = df[\"survived\"]" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "X.shape" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "X.dtypes" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "X.nunique()" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "X.head()" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "X[\"sex\"] = pd.get_dummies(X[\"sex\"], drop_first=True, dtype=float).to_numpy()\n", 123 | "X[\"adult_male\"] = 
pd.get_dummies(X[\"adult_male\"], drop_first=True, dtype=float).to_numpy()\n", 124 | "# X[\"alive\"] = pd.get_dummies(X[\"alive\"], drop_first=True, dtype=float).to_numpy()\n", 125 | "X[\"alone\"] = pd.get_dummies(X[\"alone\"], drop_first=True, dtype=float).to_numpy()\n", 126 | "cols = ['pclass', 'sibsp', 'parch', 'embarked', 'class', 'who', 'deck', 'embark_town', 'age', 'fare']\n", 127 | "X[cols] = X[cols].astype('category')\n", 128 | "X.head()" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 13, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "seed = 42\n", 138 | "n_estimators = 100\n", 139 | "n_trials = 1" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 14, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "scoring = \"neg_log_loss\"\n", 149 | "metric_function = log_loss\n", 150 | "metric_name = \"log_loss\"\n", 151 | "objective_type = \"LogLoss\"" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)\n", 161 | "\n", 162 | "print(f\"X_train.shape: {X_train.shape}\")\n", 163 | "print(f\"X_test.shape: {X_test.shape}\")" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "X_train.head()" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "set(X_train[\"who\"])" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "model_lgbm = LGBMClassifier(objective=\"binary\")\n", 191 | "model_lgbm.fit(X_train, y_train)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "model = PerpetualBooster(objective=\"LogLoss\")\n", 201 | "model.fit(X_train, y_train, budget=0.1)" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "y_pred = np.round(expit(model.predict(X_test)))\n", 211 | "print(accuracy_score(y_test, y_pred))" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "y_pred = np.round(expit(model.predict(X_train)))\n", 221 | "print(accuracy_score(y_train, y_pred))" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "if metric_name == \"log_loss\":\n", 231 | " y_pred = expit(model.predict(X_test))\n", 232 | "else:\n", 233 | " y_pred = np.round(expit(model.predict(X_test)))\n", 234 | "print(f\"Test {metric_name}: {round(metric_function(y_test, y_pred), 6)}\")" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 23, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "df_trees = model.trees_to_dataframe()" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "df_trees.head(10)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | 
"source": [ 261 | "model_lgbm.booster_.trees_to_dataframe().head(10)" 262 | ] 263 | } 264 | ], 265 | "metadata": { 266 | "kernelspec": { 267 | "display_name": "py311", 268 | "language": "python", 269 | "name": "python3" 270 | }, 271 | "language_info": { 272 | "codemirror_mode": { 273 | "name": "ipython", 274 | "version": 3 275 | }, 276 | "file_extension": ".py", 277 | "mimetype": "text/x-python", 278 | "name": "python", 279 | "nbconvert_exporter": "python", 280 | "pygments_lexer": "ipython3", 281 | "version": "3.11.9" 282 | } 283 | }, 284 | "nbformat": 4, 285 | "nbformat_minor": 2 286 | } 287 | -------------------------------------------------------------------------------- /python-package/examples/fetch_openml.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import logging\n", 10 | "from perpetual import PerpetualBooster\n", 11 | "from sklearn.datasets import fetch_openml\n", 12 | "from importlib.metadata import version" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "logging.basicConfig(level=logging.INFO)" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "print(f\"perpetual: {version('perpetual')}\")" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "data, target = fetch_openml(data_id=45667, return_X_y=True, as_frame=True)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "data.shape" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "data.dtypes" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "model = PerpetualBooster(objective=\"SquaredLoss\", log_iterations=1)\n", 67 | "model.fit(data, target, budget=0.5)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "model.number_of_trees" 77 | ] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "py311", 83 | "language": "python", 84 | "name": "python3" 85 | }, 86 | "language_info": { 87 | "codemirror_mode": { 88 | "name": "ipython", 89 | "version": 3 90 | }, 91 | "file_extension": ".py", 92 | "mimetype": "text/x-python", 93 | "name": "python", 94 | "nbconvert_exporter": "python", 95 | "pygments_lexer": "ipython3", 96 | "version": "3.11.9" 97 | } 98 | }, 99 | "nbformat": 4, 100 | "nbformat_minor": 2 101 | } 102 | -------------------------------------------------------------------------------- /python-package/examples/lgbm_openml_sensory.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from sklearn.datasets import fetch_openml\n", 11 | "from lightgbm import LGBMRegressor" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": 
[ 20 | "pd.set_option('display.max_rows', 500)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "df = fetch_openml(data_id=546)\n", 30 | "X = df.data\n", 31 | "y = df.target" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "model = LGBMRegressor(n_estimators=10, max_depth=1, learning_rate=0.1, max_cat_to_onehot=1, cat_l2=0.0, cat_smooth=0.0, min_data_per_group=1, max_cat_threshold=1000)\n", 41 | "model.fit(X, y)" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "X.dtypes" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "list(X.columns)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "df = model.booster_.trees_to_dataframe()\n", 69 | "df_mod = df.loc[df[\"weight\"] == 0]\n", 70 | "df_mod.head(10)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "list(df_mod[\"split_gain\"])" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "model.booster_.trees_to_dataframe()" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "from xgboost import XGBRegressor" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "xgb = XGBRegressor(n_estimators=10, learning_rate=0.1, max_depth=1, enable_categorical=True, max_cat_to_onehot=1, max_cat_threshold=100, reg_alpha=0.0, reg_lambda=0.0)\n", 107 | "xgb.fit(X, y)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "xgb.get_booster().trees_to_dataframe()" 117 | ] 118 | } 119 | ], 120 | "metadata": { 121 | "kernelspec": { 122 | "display_name": "py311", 123 | "language": "python", 124 | "name": "python3" 125 | }, 126 | "language_info": { 127 | "codemirror_mode": { 128 | "name": "ipython", 129 | "version": 3 130 | }, 131 | "file_extension": ".py", 132 | "mimetype": "text/x-python", 133 | "name": "python", 134 | "nbconvert_exporter": "python", 135 | "pygments_lexer": "ipython3", 136 | "version": "3.11.9" 137 | } 138 | }, 139 | "nbformat": 4, 140 | "nbformat_minor": 2 141 | } 142 | -------------------------------------------------------------------------------- /python-package/examples/openml.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from scipy.io import arff\n", 12 | "from perpetual import PerpetualBooster" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "from importlib.metadata import version\n", 22 | "\n", 23 | "print(f\"numpy: {version('numpy')}\")\n", 24 | "print(f\"pandas: {version('pandas')}\")\n", 25 | "print(f\"scipy: 
{version('scipy')}\")\n", 26 | "print(f\"perpetual: {version('perpetual')}\")" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "arff_file = arff.loadarff('../../resources/christine.arff')\n", 36 | "df = pd.DataFrame(arff_file[0])" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "df.head()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "df.shape" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "y = df.pop('class')\n", 64 | "y" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "y = np.array(y).astype(int)\n", 74 | "y" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "df.shape" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "df.dtypes" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "idx = [i for i, e in enumerate(list(df.dtypes)) if e==np.dtype('O')]\n", 102 | "cat_features = np.array(df.columns)[idx]\n", 103 | "print(cat_features)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 11, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "df[cat_features] = df[cat_features].astype(\"category\")" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "from perpetual.utils import convert_input_frame\n", 122 | "\n", 123 | "features_, df_flat, rows, cols, categorical_features_, cat_mapping = convert_input_frame(df, \"auto\", 1000)\n", 124 | "\n", 125 | "pd.Series(df_flat).to_csv(\"../../resources/christine_flat.csv\", index=False, header=False)\n", 126 | "pd.Series(y).to_csv(\"../../resources/christine_y.csv\", index=False, header=False)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 13, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "model = PerpetualBooster(log_iterations=1)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "model.fit(df, y, budget=0.1)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "model.number_of_trees" 154 | ] 155 | } 156 | ], 157 | "metadata": { 158 | "kernelspec": { 159 | "display_name": "py311", 160 | "language": "python", 161 | "name": "python3" 162 | }, 163 | "language_info": { 164 | "codemirror_mode": { 165 | "name": "ipython", 166 | "version": 3 167 | }, 168 | "file_extension": ".py", 169 | "mimetype": "text/x-python", 170 | "name": "python", 171 | "nbconvert_exporter": "python", 172 | "pygments_lexer": "ipython3", 173 | "version": "3.11.9" 174 | } 175 | }, 176 | "nbformat": 4, 177 | "nbformat_minor": 2 178 | } 179 | -------------------------------------------------------------------------------- /python-package/examples/openml_mnist.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from perpetual import PerpetualBooster" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import logging\n", 20 | "\n", 21 | "logging.basicConfig()\n", 22 | "logging.getLogger().setLevel(logging.DEBUG)" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "X_train = pd.read_csv(\"../../resources/fashion_train_flat.csv\", index_col=False, header=None).to_numpy().reshape(63000, -1)\n", 32 | "X_train.shape" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "y_train = pd.read_csv(\"../../resources/fashion_train_y.csv\", index_col=False, header=None).to_numpy().flatten()\n", 42 | "y_train.shape" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "model = PerpetualBooster(log_iterations=1)\n", 52 | "model.fit(X_train, y_train, budget=1.0, timeout=360)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "model.number_of_trees" 62 | ] 63 | } 64 | ], 65 | "metadata": { 66 | "kernelspec": { 67 | "display_name": "py311", 68 | "language": "python", 69 | "name": "python3" 70 | }, 71 | "language_info": { 72 | "codemirror_mode": { 73 | "name": "ipython", 74 | "version": 3 75 | }, 76 | "file_extension": ".py", 77 | "mimetype": "text/x-python", 78 | "name": "python", 79 | "nbconvert_exporter": "python", 80 | "pygments_lexer": "ipython3", 81 | "version": "3.11.9" 82 | } 83 | }, 84 | "nbformat": 4, 85 | "nbformat_minor": 2 86 | } 87 | -------------------------------------------------------------------------------- /python-package/examples/santander.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "967af9d9", 7 | "metadata": { 8 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 9 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", 10 | "execution": { 11 | "iopub.execute_input": "2024-10-21T07:01:07.130508Z", 12 | "iopub.status.busy": "2024-10-21T07:01:07.130061Z", 13 | "iopub.status.idle": "2024-10-21T07:01:08.048111Z", 14 | "shell.execute_reply": "2024-10-21T07:01:08.046970Z" 15 | }, 16 | "papermill": { 17 | "duration": 0.926499, 18 | "end_time": "2024-10-21T07:01:08.050965", 19 | "exception": false, 20 | "start_time": "2024-10-21T07:01:07.124466", 21 | "status": "completed" 22 | }, 23 | "tags": [] 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "import numpy as np\n", 28 | "import pandas as pd\n", 29 | "from perpetual import PerpetualBooster\n", 30 | "from sklearn.metrics import roc_auc_score\n", 31 | "from sklearn.model_selection import KFold\n", 32 | "from autogluon.tabular import TabularPredictor" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "id": "c257f8fc", 39 | "metadata": { 40 | "execution": { 41 | "iopub.execute_input": "2024-10-21T07:01:28.223537Z", 42 | "iopub.status.busy": "2024-10-21T07:01:28.222764Z", 43 | "iopub.status.idle": 
"2024-10-21T07:01:34.667262Z", 44 | "shell.execute_reply": "2024-10-21T07:01:34.666013Z" 45 | }, 46 | "papermill": { 47 | "duration": 6.453134, 48 | "end_time": "2024-10-21T07:01:34.670004", 49 | "exception": false, 50 | "start_time": "2024-10-21T07:01:28.216870", 51 | "status": "completed" 52 | }, 53 | "tags": [] 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "X_train = pd.read_csv('../../resources/santander-train.csv', index_col=0)\n", 58 | "y_train = X_train.pop('TARGET')" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "id": "e0590d0a", 65 | "metadata": { 66 | "execution": { 67 | "iopub.execute_input": "2024-10-21T07:01:34.680697Z", 68 | "iopub.status.busy": "2024-10-21T07:01:34.680290Z", 69 | "iopub.status.idle": "2024-10-21T07:01:34.689412Z", 70 | "shell.execute_reply": "2024-10-21T07:01:34.688210Z" 71 | }, 72 | "papermill": { 73 | "duration": 0.017414, 74 | "end_time": "2024-10-21T07:01:34.691792", 75 | "exception": false, 76 | "start_time": "2024-10-21T07:01:34.674378", 77 | "status": "completed" 78 | }, 79 | "tags": [] 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "X_train.shape" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 4, 89 | "id": "22eba1d7", 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "cv = KFold(shuffle=True, random_state=42)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "id": "921f491f", 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "scores = []\n", 104 | "for train, test in cv.split(X_train, y_train):\n", 105 | " model = PerpetualBooster(objective=\"LogLoss\")\n", 106 | " model.fit(X_train.iloc[train], y_train.iloc[train], budget=1.0)\n", 107 | " probabilities = model.predict_proba(X_train.iloc[test])\n", 108 | " score = roc_auc_score(y_train.iloc[test], probabilities[:, 1])\n", 109 | " scores.append(score)\n", 110 | " print(model.number_of_trees)\n", 111 | "print(np.mean(scores))" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "id": "0a04e569", 118 | "metadata": { 119 | "execution": { 120 | "iopub.execute_input": "2024-10-21T07:01:34.702317Z", 121 | "iopub.status.busy": "2024-10-21T07:01:34.701880Z", 122 | "iopub.status.idle": "2024-10-21T07:02:04.983918Z", 123 | "shell.execute_reply": "2024-10-21T07:02:04.982720Z" 124 | }, 125 | "papermill": { 126 | "duration": 30.294535, 127 | "end_time": "2024-10-21T07:02:04.990727", 128 | "exception": false, 129 | "start_time": "2024-10-21T07:01:34.696192", 130 | "status": "completed" 131 | }, 132 | "tags": [] 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "scores = []\n", 137 | "X_train['TARGET'] = y_train\n", 138 | "for train, test in cv.split(X_train, y_train):\n", 139 | " model = TabularPredictor(label=\"TARGET\", verbosity=0)\n", 140 | " model.fit(X_train.iloc[train])\n", 141 | " probabilities = model.predict_proba(X_train.iloc[test])\n", 142 | " score = roc_auc_score(y_train.iloc[test], probabilities.to_numpy()[:, 1])\n", 143 | " print(score)\n", 144 | " scores.append(score)\n", 145 | "print(np.mean(scores))" 146 | ] 147 | } 148 | ], 149 | "metadata": { 150 | "kaggle": { 151 | "accelerator": "none", 152 | "dataSources": [ 153 | { 154 | "databundleVersionId": 860641, 155 | "sourceId": 4986, 156 | "sourceType": "competition" 157 | } 158 | ], 159 | "dockerImageVersionId": 30786, 160 | "isGpuEnabled": false, 161 | "isInternetEnabled": true, 162 | "language": "python", 163 | "sourceType": "notebook" 164 | }, 165 | "kernelspec": { 166 | 
"display_name": "py311", 167 | "language": "python", 168 | "name": "python3" 169 | }, 170 | "language_info": { 171 | "codemirror_mode": { 172 | "name": "ipython", 173 | "version": 3 174 | }, 175 | "file_extension": ".py", 176 | "mimetype": "text/x-python", 177 | "name": "python", 178 | "nbconvert_exporter": "python", 179 | "pygments_lexer": "ipython3", 180 | "version": "3.11.9" 181 | }, 182 | "papermill": { 183 | "default_parameters": {}, 184 | "duration": 62.454609, 185 | "end_time": "2024-10-21T07:02:06.520206", 186 | "environment_variables": {}, 187 | "exception": null, 188 | "input_path": "__notebook__.ipynb", 189 | "output_path": "__notebook__.ipynb", 190 | "parameters": {}, 191 | "start_time": "2024-10-21T07:01:04.065597", 192 | "version": "2.6.0" 193 | } 194 | }, 195 | "nbformat": 4, 196 | "nbformat_minor": 5 197 | } 198 | -------------------------------------------------------------------------------- /python-package/examples/toy_datasets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import time\n", 10 | "import logging\n", 11 | "import pandas as pd\n", 12 | "from sklearn.datasets import load_breast_cancer, load_iris\n", 13 | "from sklearn.ensemble import RandomForestClassifier\n", 14 | "from sklearn.model_selection import train_test_split\n", 15 | "from sklearn.metrics import accuracy_score, log_loss\n", 16 | "from perpetual import PerpetualBooster" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "from importlib.metadata import version\n", 26 | "\n", 27 | "print(f\"scikit-learn: {version('scikit-learn')}\")\n", 28 | "print(f\"perpetual: {version('perpetual')}\")" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "logging.basicConfig(level=logging.INFO)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "def evaluate(model, X_train, y_train, X_test, y_test, budget=None):\n", 47 | " start = time.time()\n", 48 | " model.fit(X_train, y_train, budget=budget) if budget else model.fit(X_train, y_train)\n", 49 | " if budget:\n", 50 | " print(model.number_of_trees)\n", 51 | " duration = time.time() - start\n", 52 | " return duration, accuracy_score(y_test, model.predict(X_test)), log_loss(y_test, model.predict_proba(X_test))\n", 53 | "\n", 54 | "datasets = {\"Breast Cancer\": load_breast_cancer(return_X_y=True), \"Binary Iris\": (load_iris(return_X_y=True)[0][load_iris().target!=2], load_iris(return_X_y=True)[1][load_iris().target!=2])}\n", 55 | "results = pd.DataFrame(columns=[\"Dataset\", \"Model\", \"Budget\", \"Time\", \"Accuracy\", \"Log Loss\"])\n", 56 | "\n", 57 | "for name, (X, y) in datasets.items():\n", 58 | " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", 59 | " pb = PerpetualBooster(objective=\"LogLoss\", log_iterations=1, stopping_rounds=1, iteration_limit=1000)\n", 60 | " rf = RandomForestClassifier()\n", 61 | " results = pd.concat([results,\n", 62 | " pd.DataFrame([[name, \"Perpetual\", \"0.1\", *evaluate(pb, X_train, y_train, X_test, y_test, budget=0.1)]], columns=results.columns),\n", 63 | " #pd.DataFrame([[name, \"Perpetual\", \"1.0\", *evaluate(pb, X_train, y_train, X_test, 
y_test, budget=1.0)]], columns=results.columns),\n", 64 | " #pd.DataFrame([[name, \"Perpetual\", \"2.0\", *evaluate(pb, X_train, y_train, X_test, y_test, budget=2.0)]], columns=results.columns),\n", 65 | " #pd.DataFrame([[name, \"RF\", \"-\", *evaluate(rf, X_train, y_train, X_test, y_test)]], columns=results.columns),\n", 66 | " ],\n", 67 | " ignore_index=True)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "results" 77 | ] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "py311", 83 | "language": "python", 84 | "name": "python3" 85 | }, 86 | "language_info": { 87 | "codemirror_mode": { 88 | "name": "ipython", 89 | "version": 3 90 | }, 91 | "file_extension": ".py", 92 | "mimetype": "text/x-python", 93 | "name": "python", 94 | "nbconvert_exporter": "python", 95 | "pygments_lexer": "ipython3", 96 | "version": "3.11.9" 97 | } 98 | }, 99 | "nbformat": 4, 100 | "nbformat_minor": 2 101 | } 102 | -------------------------------------------------------------------------------- /python-package/mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: perpetual 2 | repo_name: perpetual 3 | repo_url: https://github.com/perpetual-ml/perpetual/ 4 | theme: 5 | name: material 6 | icon: 7 | repo: fontawesome/brands/github 8 | logo: material/forest-outline 9 | palette: 10 | # Palette toggle for light mode 11 | - scheme: default 12 | primary: deep purple 13 | toggle: 14 | icon: material/toggle-switch 15 | name: Switch to dark mode 16 | # Palette toggle for dark mode 17 | - scheme: slate 18 | primary: deep purple 19 | toggle: 20 | icon: material/toggle-switch-off-outline 21 | name: Switch to light mode 22 | 23 | markdown_extensions: 24 | - pymdownx.highlight: 25 | anchor_linenums: true 26 | line_spans: __span 27 | pygments_lang_class: true 28 | - pymdownx.inlinehilite 29 | - pymdownx.snippets 30 | - pymdownx.superfences 31 | 32 | plugins: 33 | - search 34 | - autorefs 35 | - mkdocstrings: 36 | handlers: 37 | python: 38 | options: 39 | heading_level: 2 40 | docstring_section_style: list 41 | members_order: source 42 | show_root_heading: true 43 | show_root_full_path: false 44 | separate_signature: true 45 | show_source: false 46 | show_signature_annotations: true 47 | merge_init_into_class: true 48 | -------------------------------------------------------------------------------- /python-package/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin>=1.0,<2.0"] 3 | build-backend = "maturin" 4 | 5 | [project] 6 | name = "perpetual" 7 | version = "0.9.3" 8 | description = "A self-generalizing gradient boosting machine that doesn't need hyperparameter optimization" 9 | keywords = [ 10 | "rust", 11 | "perpetual", 12 | "machine learning", 13 | "tree model", 14 | "decision tree", 15 | "gradient boosted decision tree", 16 | "gradient boosting machine" 17 | ] 18 | authors = [{ name = "Mutlu Simsek" }] 19 | dependencies = ["numpy", "typing-extensions"] 20 | requires-python = ">=3.9" 21 | classifiers = [ 22 | "Programming Language :: Rust", 23 | "Programming Language :: Python :: 3", 24 | "Programming Language :: Python :: 3.9", 25 | "Programming Language :: Python :: 3.10", 26 | "Programming Language :: Python :: 3.11", 27 | "Programming Language :: Python :: 3.12", 28 | "Programming Language :: Python :: 3.13", 29 | ] 30 | 31 | [project.optional-dependencies] 32 | 
dev = ["black", "pandas", "polars", "pyarrow", "maturin", "pytest", "seaborn", "scikit-learn", "mkdocs-material", "mkdocstrings[python]", "mkdocs-autorefs", "ruff"] 33 | 34 | [tool.maturin] 35 | sdist-include = ["LICENSE", "README.md"] 36 | python-source = "python" 37 | module-name = "perpetual.perpetual" 38 | 39 | [tool.ruff] 40 | # Never enforce `E501` (line length violations). 41 | ignore = ["E501"] 42 | 43 | [tool.isort] 44 | profile = "black" 45 | -------------------------------------------------------------------------------- /python-package/python/perpetual/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from perpetual.booster import PerpetualBooster 4 | 5 | 6 | __all__ = ["PerpetualBooster"] 7 | -------------------------------------------------------------------------------- /python-package/python/perpetual/data.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Iterable, Optional, Union 3 | 4 | 5 | @dataclass 6 | class Node: 7 | """Dataclass representation of a node, this represents all of the fields present in a tree node.""" 8 | 9 | num: int 10 | weight_value: float 11 | hessian_sum: float 12 | depth: int 13 | split_value: float 14 | split_feature: Union[str, int] 15 | split_gain: float 16 | missing_node: int 17 | left_child: int 18 | right_child: int 19 | is_leaf: bool 20 | node_type: str 21 | parent_node: int 22 | generalization: Optional[float] 23 | left_cats: Optional[Iterable] 24 | right_cats: Optional[Iterable] 25 | count: int 26 | -------------------------------------------------------------------------------- /python-package/python/perpetual/serialize.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | from abc import ABC, abstractmethod 5 | from ast import literal_eval 6 | from dataclasses import dataclass 7 | from typing import Dict, Generic, List, Tuple, TypeVar, Union 8 | 9 | import numpy as np 10 | import numpy.typing as npt 11 | 12 | T = TypeVar("T") 13 | 14 | 15 | class BaseSerializer(ABC, Generic[T]): 16 | @abstractmethod 17 | def serialize(self, obj: T) -> str: 18 | """serialize method - should take an object and return a string""" 19 | 20 | @abstractmethod 21 | def deserialize(self, obj_repr: str) -> T: 22 | """deserialize method - should take a string and return original object""" 23 | 24 | 25 | Scaler = Union[int, float, str] 26 | 27 | 28 | class ScalerSerializer(BaseSerializer[Scaler]): 29 | def serialize(self, obj: Scaler) -> str: 30 | if isinstance(obj, str): 31 | obj_ = f"'{obj}'" 32 | else: 33 | obj_ = str(obj) 34 | return obj_ 35 | 36 | def deserialize(self, obj_repr: str) -> Scaler: 37 | return literal_eval(node_or_string=obj_repr) 38 | 39 | 40 | ObjectItem = Union[ 41 | List[Scaler], 42 | Dict[str, Scaler], 43 | Scaler, 44 | ] 45 | 46 | 47 | class ObjectSerializer(BaseSerializer[ObjectItem]): 48 | def serialize(self, obj: ObjectItem) -> str: 49 | return json.dumps(obj) 50 | 51 | def deserialize(self, obj_repr: str) -> ObjectItem: 52 | return json.loads(obj_repr) 53 | 54 | 55 | @dataclass 56 | class NumpyData: 57 | array: Union[List[float], List[int]] 58 | dtype: str 59 | shape: Tuple[int, ...] 
60 | 61 | 62 | class NumpySerializer(BaseSerializer[npt.NDArray]): 63 | def serialize(self, obj: npt.NDArray) -> str: 64 | return json.dumps( 65 | {"array": obj.tolist(), "dtype": str(obj.dtype), "shape": obj.shape} 66 | ) 67 | 68 | def deserialize(self, obj_repr: str) -> npt.NDArray: 69 | data = NumpyData(**json.loads(obj_repr)) 70 | a = np.array(data.array, dtype=data.dtype) # type: ignore 71 | if len(data.shape) == 1: 72 | return a 73 | else: 74 | return a.reshape(data.shape) 75 | -------------------------------------------------------------------------------- /python-package/python/perpetual/types.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing_extensions import Self 3 | from typing import Any, Dict, Iterable, Protocol, Set 4 | 5 | 6 | class BoosterType(Protocol): 7 | monotone_constraints: Dict[int, int] 8 | terminate_missing_features: Set[int] 9 | number_of_trees: int 10 | base_score: float 11 | 12 | def fit( 13 | self, 14 | flat_data: np.ndarray, 15 | rows: int, 16 | cols: int, 17 | y: np.ndarray, 18 | budget: float, 19 | sample_weight: np.ndarray, 20 | parallel: bool = False, 21 | ): 22 | """Fit method""" 23 | 24 | def predict( 25 | self, 26 | flat_data: np.ndarray, 27 | rows: int, 28 | cols: int, 29 | parallel: bool = True, 30 | ) -> np.ndarray: 31 | """predict method""" 32 | 33 | def predict_proba( 34 | self, 35 | flat_data: np.ndarray, 36 | rows: int, 37 | cols: int, 38 | parallel: bool = True, 39 | ) -> np.ndarray: 40 | """predict probabilities method""" 41 | 42 | def predict_contributions( 43 | self, 44 | flat_data: np.ndarray, 45 | rows: int, 46 | cols: int, 47 | method: str, 48 | parallel: bool = True, 49 | ) -> np.ndarray: 50 | """method""" 51 | 52 | def value_partial_dependence( 53 | self, 54 | feature: int, 55 | value: float, 56 | ) -> float: 57 | """pass""" 58 | 59 | def calculate_feature_importance( 60 | self, 61 | method: str, 62 | normalize: bool, 63 | ) -> Dict[int, float]: 64 | """pass""" 65 | 66 | def text_dump(self) -> Iterable[str]: 67 | """pass""" 68 | 69 | @classmethod 70 | def load_booster(cls, path: str) -> Self: 71 | """pass""" 72 | 73 | def save_booster(self, path: str): 74 | """pass""" 75 | 76 | @classmethod 77 | def from_json(cls, json_str: str) -> Self: 78 | """pass""" 79 | 80 | def json_dump(self) -> str: 81 | """pass""" 82 | 83 | def get_params(self) -> Dict[str, Any]: 84 | """pass""" 85 | 86 | def insert_metadata(self, key: str, value: str) -> None: 87 | """pass""" 88 | 89 | def get_metadata(self, key: str) -> str: 90 | """pass""" 91 | 92 | 93 | class MultiOutputBoosterType(Protocol): 94 | monotone_constraints: Dict[int, int] 95 | terminate_missing_features: Set[int] 96 | number_of_trees: Iterable[int] 97 | base_score: Iterable[float] 98 | 99 | def fit( 100 | self, 101 | flat_data: np.ndarray, 102 | rows: int, 103 | cols: int, 104 | y: np.ndarray, 105 | budget: float, 106 | sample_weight: np.ndarray, 107 | parallel: bool = False, 108 | ): 109 | """Fit method""" 110 | 111 | def predict( 112 | self, 113 | flat_data: np.ndarray, 114 | rows: int, 115 | cols: int, 116 | parallel: bool = True, 117 | ) -> np.ndarray: 118 | """predict method""" 119 | 120 | def predict_proba( 121 | self, 122 | flat_data: np.ndarray, 123 | rows: int, 124 | cols: int, 125 | parallel: bool = True, 126 | ) -> np.ndarray: 127 | """predict probabilities method""" 128 | 129 | @classmethod 130 | def load_booster(cls, path: str) -> Self: 131 | """pass""" 132 | 133 | def save_booster(self, path: str): 134 | 
"""pass""" 135 | 136 | @classmethod 137 | def from_json(cls, json_str: str) -> Self: 138 | """pass""" 139 | 140 | def json_dump(self) -> str: 141 | """pass""" 142 | 143 | def get_params(self) -> Dict[str, Any]: 144 | """pass""" 145 | 146 | def insert_metadata(self, key: str, value: str) -> None: 147 | """pass""" 148 | 149 | def get_metadata(self, key: str) -> str: 150 | """pass""" 151 | -------------------------------------------------------------------------------- /python-package/python/perpetual/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import numpy as np 3 | from typing import Dict, Iterable, List, Optional, Tuple 4 | 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | def type_df(df): 10 | library_name = type(df).__module__.split(".")[0] 11 | if type(df).__name__ == "DataFrame": 12 | if library_name == "pandas": 13 | return "pandas_df" 14 | elif library_name == "polars": 15 | return "polars_df" 16 | elif library_name == "numpy": 17 | return "numpy" 18 | else: 19 | return "" 20 | 21 | 22 | def type_series(y): 23 | library_name = type(y).__module__.split(".")[0] 24 | if type(y).__name__ == "Series": 25 | if library_name == "pandas": 26 | return "pandas_series" 27 | elif library_name == "polars": 28 | return "polars_series" 29 | elif library_name == "numpy": 30 | return "numpy" 31 | else: 32 | return "" 33 | 34 | 35 | def convert_input_array(x, objective, is_target=False) -> np.ndarray: 36 | classes_ = [] 37 | 38 | if type(x).__module__.split(".")[0] == "numpy": 39 | if len(x.shape) == 2: 40 | classes_, x_, *_ = convert_input_frame(x, None, 1000) 41 | else: 42 | x_ = x 43 | elif type_series(x) == "pandas_series": 44 | x_ = x.to_numpy() 45 | elif type_series(x) == "polars_series": 46 | x_ = x.to_numpy(allow_copy=False) 47 | elif type_df(x) == "polars_df" or type_df(x) == "pandas_df": 48 | classes_, x_, *_ = convert_input_frame(x, None, 1000) 49 | else: 50 | x_ = x.to_numpy() 51 | 52 | if is_target and objective == "LogLoss" and len(x_.shape) == 1: 53 | classes_ = np.unique(x_) 54 | x_index = np.array([np.where(classes_ == i) for i in x_]) 55 | if len(classes_) > 2: 56 | x_ = np.squeeze(np.eye(len(classes_))[x_index]) 57 | 58 | if not np.issubdtype(x_.dtype, "float64"): 59 | x_ = x_.astype(dtype="float64", copy=False) 60 | 61 | if len(x_.shape) == 2: 62 | x_ = x_.ravel(order="F") 63 | 64 | return x_, classes_ 65 | 66 | 67 | def convert_input_frame( 68 | X, 69 | categorical_features, 70 | max_cat, 71 | ) -> Tuple[List[str], np.ndarray, int, int, Optional[Iterable[int]], Optional[Dict]]: 72 | """Convert data to format needed by booster. 
73 | 74 | Returns: 75 | Tuple[List[str], np.ndarray, int, int, Optional[Iterable[int]], Optional[Dict]]: Return column names, the flat data, number of rows, the number of columns, cat_index, cat_mapping 76 | """ 77 | categorical_features_ = None 78 | if type_df(X) == "pandas_df": 79 | X_ = X.to_numpy() 80 | features_ = X.columns.to_list() 81 | if categorical_features == "auto": 82 | categorical_columns = X.select_dtypes(include=["category"]).columns.tolist() 83 | categorical_features_ = [ 84 | features_.index(c) for c in categorical_columns 85 | ] or None 86 | elif type_df(X) == "polars_df": 87 | import polars.selectors as cs 88 | 89 | try: 90 | X_ = X.to_numpy(allow_copy=False) 91 | except RuntimeError: 92 | X_ = X.to_numpy(allow_copy=True) 93 | 94 | features_ = X.columns 95 | if categorical_features == "auto": 96 | categorical_columns = X.select(cs.categorical()).columns 97 | categorical_features_ = [ 98 | features_.index(c) for c in categorical_columns 99 | ] or None 100 | else: 101 | # Assume it's a numpy array. 102 | X_ = X 103 | features_ = list(map(str, range(X_.shape[1]))) 104 | 105 | if ( 106 | categorical_features 107 | and all(isinstance(s, int) for s in categorical_features) 108 | and isinstance(categorical_features, list) 109 | ): 110 | categorical_features_ = categorical_features 111 | elif ( 112 | categorical_features 113 | and all(isinstance(s, str) for s in categorical_features) 114 | and isinstance(categorical_features, list) 115 | ): 116 | categorical_features_ = [features_.index(c) for c in categorical_features] 117 | 118 | cat_mapping = {} # key: feature_name, value: ordered category names 119 | cat_to_num = [] 120 | if categorical_features_: 121 | for i in categorical_features_: 122 | categories, inversed = np.unique(X_[:, i].astype(str), return_inverse=True) 123 | 124 | categories = list(categories) 125 | if "nan" in categories: 126 | categories.remove("nan") 127 | categories.insert(0, "nan") 128 | 129 | inversed = inversed + 1.0 130 | 131 | if len(categories) > max_cat: 132 | cat_to_num.append(i) 133 | logger.warning( 134 | f"Feature {features_[i]} will be treated as numerical since the number of categories ({len(categories)}) exceeds max_cat ({max_cat}) threshold." 135 | ) 136 | 137 | feature_name = features_[i] 138 | cat_mapping[feature_name] = categories 139 | ind_nan = len(categories) 140 | inversed[inversed == ind_nan] = np.nan 141 | X_[:, i] = inversed 142 | 143 | categorical_features_ = [ 144 | x for x in categorical_features_ if x not in cat_to_num 145 | ] 146 | 147 | logger.info(f"Categorical features: {categorical_features_}") 148 | logger.info(f"Mapping of categories: {cat_mapping}") 149 | 150 | if not np.issubdtype(X_.dtype, "float64"): 151 | X_ = X_.astype(dtype="float64", copy=False) 152 | flat_data = X_.ravel(order="F") 153 | rows, cols = X_.shape 154 | 155 | if isinstance(categorical_features_, list): 156 | categorical_features_ = set(categorical_features_) 157 | 158 | return features_, flat_data, rows, cols, categorical_features_, cat_mapping 159 | 160 | 161 | def transform_input_frame(X, cat_mapping) -> Tuple[List[str], np.ndarray, int, int]: 162 | """Convert data to format needed by booster. 
163 | 164 | Returns: 165 | Tuple[List[str], np.ndarray, int, int]: Return column names, the flat data, number of rows, the number of columns 166 | """ 167 | if type_df(X) == "pandas_df": 168 | X_ = X.to_numpy() 169 | features_ = X.columns.to_list() 170 | elif type_df(X) == "polars_df": 171 | try: 172 | X_ = X.to_numpy(allow_copy=False) 173 | except RuntimeError: 174 | X_ = X.to_numpy(allow_copy=True) 175 | features_ = X.columns 176 | else: 177 | # Assume it's a numpy array. 178 | X_ = X 179 | features_ = list(map(str, range(X_.shape[1]))) 180 | 181 | if cat_mapping: 182 | for feature_name, categories in cat_mapping.items(): 183 | feature_index = features_.index(feature_name) 184 | cats = categories.copy() 185 | cats.remove("nan") 186 | x_enc = np.searchsorted(cats, X_[:, feature_index].astype(str)) 187 | x_enc = x_enc + 1.0 188 | ind_nan = len(categories) 189 | x_enc[x_enc == ind_nan] = np.nan 190 | X_[:, feature_index] = x_enc 191 | 192 | if not np.issubdtype(X_.dtype, "float64"): 193 | X_ = X_.astype(dtype="float64", copy=False) 194 | flat_data = X_.ravel(order="F") 195 | rows, cols = X_.shape 196 | 197 | return features_, flat_data, rows, cols 198 | 199 | 200 | CONTRIBUTION_METHODS = { 201 | "weight": "Weight", 202 | "Weight": "Weight", 203 | "average": "Average", 204 | "Average": "Average", 205 | "branch-difference": "BranchDifference", 206 | "branchdifference": "BranchDifference", 207 | "BranchDifference": "BranchDifference", 208 | "midpoint-difference": "MidpointDifference", 209 | "midpointdifference": "MidpointDifference", 210 | "MidpointDifference": "MidpointDifference", 211 | "mode-difference": "ModeDifference", 212 | "modedifference": "ModeDifference", 213 | "ModeDifference": "ModeDifference", 214 | "ProbabilityChange": "ProbabilityChange", 215 | "probabilitychange": "ProbabilityChange", 216 | "probability-change": "ProbabilityChange", 217 | } 218 | -------------------------------------------------------------------------------- /python-package/src/lib.rs: -------------------------------------------------------------------------------- 1 | mod booster; 2 | mod multi_output; 3 | mod utils; 4 | 5 | use crate::booster::PerpetualBooster; 6 | use crate::multi_output::MultiOutputBooster; 7 | use crate::utils::percentiles; 8 | use crate::utils::print_matrix; 9 | use pyo3::prelude::*; 10 | 11 | #[pymodule] 12 | fn perpetual(_py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { 13 | pyo3_log::init(); 14 | 15 | m.add_function(wrap_pyfunction!(print_matrix, m)?)?; 16 | m.add_function(wrap_pyfunction!(percentiles, m)?)?; 17 | 18 | m.add_class::()?; 19 | m.add_class::()?; 20 | 21 | Ok(()) 22 | } 23 | -------------------------------------------------------------------------------- /python-package/src/utils.rs: -------------------------------------------------------------------------------- 1 | use numpy::IntoPyArray; 2 | use numpy::PyArray1; 3 | use numpy::PyReadonlyArray1; 4 | use perpetual_rs::constraints::{Constraint, ConstraintMap}; 5 | use perpetual_rs::data::Matrix; 6 | use perpetual_rs::utils::percentiles as crate_percentiles; 7 | use pyo3::exceptions::PyValueError; 8 | use pyo3::prelude::*; 9 | use std::collections::HashMap; 10 | 11 | pub fn int_map_to_constraint_map(int_map: HashMap) -> PyResult { 12 | let mut constraints: ConstraintMap = HashMap::new(); 13 | for (f, c) in int_map.iter() { 14 | let c_ = match c { 15 | -1 => Ok(Constraint::Negative), 16 | 1 => Ok(Constraint::Positive), 17 | 0 => Ok(Constraint::Unconstrained), 18 | _ => Err(PyValueError::new_err(format!( 19 | 
"Valid monotone constraints are -1, 1 or 0, but '{}' was provided for feature number {}.", 20 | c, f 21 | ))), 22 | }?; 23 | constraints.insert(*f, c_); 24 | } 25 | Ok(constraints) 26 | } 27 | 28 | pub fn to_value_error(value: Result) -> Result { 29 | match value { 30 | Ok(v) => Ok(v), 31 | Err(e) => Err(PyValueError::new_err(e.to_string())), 32 | } 33 | } 34 | #[pyfunction] 35 | pub fn print_matrix(x: PyReadonlyArray1, rows: usize, cols: usize) -> PyResult<()> { 36 | let m = Matrix::new(x.as_slice()?, rows, cols); 37 | println!("{}", m); 38 | Ok(()) 39 | } 40 | 41 | #[pyfunction] 42 | pub fn percentiles<'py>( 43 | py: Python<'py>, 44 | v: PyReadonlyArray1, 45 | sample_weight: PyReadonlyArray1, 46 | percentiles: PyReadonlyArray1, 47 | ) -> PyResult>> { 48 | let v_ = v.as_slice()?; 49 | let sample_weight_ = sample_weight.as_slice()?; 50 | let percentiles_ = percentiles.as_slice()?; 51 | let p = crate_percentiles(v_, sample_weight_, percentiles_); 52 | Ok(p.into_pyarray_bound(py)) 53 | } 54 | -------------------------------------------------------------------------------- /python-package/tests/test_multi_output.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from perpetual import PerpetualBooster 4 | 5 | 6 | def test_multi_output(): 7 | X = pd.read_csv("../resources/cover_types_train.csv", index_col=False) 8 | X = X.sample(n=10000, random_state=0) 9 | y = np.array(X.pop("Cover_Type")) 10 | X_test = pd.read_csv("../resources/cover_types_test.csv", index_col=False) 11 | y_test = np.array(X_test.pop("Cover_Type")) 12 | model = PerpetualBooster(iteration_limit=40, memory_limit=1.0) 13 | model.fit(X, y) 14 | pred_test = model.predict(X_test) 15 | proba_test = model.predict_proba(X_test) 16 | log_odds_test = model.predict_log_proba(X_test) 17 | assert not np.isnan(pred_test).any() 18 | assert not np.isnan(proba_test).any() 19 | assert not np.isnan(log_odds_test).any() 20 | assert np.allclose(np.sum(proba_test, axis=1), np.ones(proba_test.shape[0])) 21 | assert np.allclose(proba_test.shape, (len(X_test), len(np.unique(y_test)))) 22 | assert set(y_test) == set(pred_test) 23 | -------------------------------------------------------------------------------- /python-package/tests/test_save_load.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | import pandas as pd 4 | import pytest 5 | import perpetual 6 | from perpetual import PerpetualBooster 7 | 8 | 9 | def X_y_so(): 10 | df = pd.read_csv("../resources/titanic.csv") 11 | X = df.select_dtypes("number").drop(columns="survived").reset_index(drop=True) 12 | y = df["survived"] 13 | return X, y 14 | 15 | 16 | def X_y_mo(): 17 | X = pd.read_csv("../resources/cover_types_train.csv", index_col=False) 18 | X = X.sample(n=1000, random_state=0) 19 | X.dropna(inplace=True) 20 | X = X.loc[:, (X != X.iloc[0]).any()] 21 | y = X.pop("Cover_Type") 22 | return X, y 23 | 24 | 25 | def pickle_booster(model: PerpetualBooster, path: str) -> None: 26 | with open(path, "wb") as file: 27 | pickle.dump(model, file) 28 | 29 | 30 | def unpickle_booster(path: str) -> PerpetualBooster: 31 | with open(path, "rb") as file: 32 | return pickle.load(file) 33 | 34 | 35 | def save_booster(model: PerpetualBooster, path: str) -> None: 36 | model.save_booster(path) 37 | 38 | 39 | def load_booster(path: str) -> PerpetualBooster: 40 | return PerpetualBooster.load_booster(path) 41 | 42 | 43 | @pytest.mark.parametrize("X_y", 
[X_y_mo, X_y_so]) 44 | @pytest.mark.parametrize( 45 | "load_func,save_func", 46 | [(unpickle_booster, pickle_booster), (load_booster, save_booster)], 47 | ) 48 | class TestSaveLoadFunctions: 49 | def test_booster_metadata(self, X_y, tmp_path, load_func, save_func): 50 | f64_model_path = tmp_path / "modelf64_sl.json" 51 | X, y = X_y() 52 | model = PerpetualBooster( 53 | objective="SquaredLoss", iteration_limit=10, memory_limit=1.0 54 | ) 55 | save_func(model, f64_model_path) 56 | model.json_dump() 57 | model.fit(X, y) 58 | preds = model.predict(X) 59 | save_func(model, f64_model_path) 60 | model.insert_metadata("test-info", "some-info") 61 | assert model.get_metadata("test-info") == "some-info" 62 | save_func(model, f64_model_path) 63 | 64 | loaded = load_func(f64_model_path) 65 | assert loaded.get_metadata("test-info") == "some-info" 66 | 67 | with pytest.raises(KeyError): 68 | loaded.get_metadata("No-key") 69 | 70 | loaded_dict = loaded.__dict__ 71 | model_dict = model.__dict__ 72 | 73 | assert sorted(loaded_dict.keys()) == sorted(model_dict.keys()) 74 | for k, v in loaded_dict.items(): 75 | c_v = model_dict[k] 76 | if isinstance(v, float): 77 | if np.isnan(v): 78 | assert np.isnan(c_v) 79 | else: 80 | assert np.allclose(v, c_v) 81 | elif isinstance(v, perpetual.booster.CratePerpetualBooster) or isinstance( 82 | v, perpetual.booster.CrateMultiOutputBooster 83 | ): 84 | assert isinstance( 85 | c_v, perpetual.booster.CratePerpetualBooster 86 | ) or isinstance(v, perpetual.booster.CrateMultiOutputBooster) 87 | else: 88 | print("else_block:") 89 | print(k) 90 | print(v) 91 | print(c_v) 92 | assert v == c_v, k 93 | loaded_preds = loaded.predict(X) 94 | assert np.allclose(preds, loaded_preds) 95 | 96 | def test_booster_saving(self, X_y, tmp_path, load_func, save_func): 97 | # SquaredLoss 98 | f64_model_path = tmp_path / "modelf64_sl.json" 99 | X, y = X_y() 100 | X = X 101 | model = PerpetualBooster( 102 | objective="SquaredLoss", iteration_limit=10, memory_limit=1.0 103 | ) 104 | model.fit(X, y) 105 | preds = model.predict(X) 106 | save_func(model, f64_model_path) 107 | model_loaded = load_func(f64_model_path) 108 | assert all(preds == model_loaded.predict(X)) 109 | 110 | # LogLoss 111 | f64_model_path = tmp_path / "modelf64_ll.json" 112 | X, y = X_y() 113 | model = PerpetualBooster( 114 | objective="LogLoss", iteration_limit=10, memory_limit=1.0 115 | ) 116 | model.fit(X, y) 117 | preds = model.predict(X) 118 | save_func(model, f64_model_path) 119 | model_loaded = load_func(f64_model_path) 120 | assert model_loaded.feature_names_in_ == model.feature_names_in_ 121 | assert model_loaded.feature_names_in_ == X.columns.to_list() 122 | assert all(preds == model_loaded.predict(X)) 123 | 124 | def test_booster_saving_with_monotone_constraints( 125 | self, X_y, tmp_path, load_func, save_func 126 | ): 127 | # squared loss 128 | f64_model_path = tmp_path / "modelf64_sl.json" 129 | X, y = X_y() 130 | 131 | def calculate_monotonicity(x, y): 132 | correlation = x.corr(y) 133 | if np.isnan(correlation): 134 | return 0 # Or another appropriate default value 135 | else: 136 | return int(np.sign(correlation)) 137 | 138 | mono_ = X.apply(lambda x: calculate_monotonicity(x, y)).to_dict() 139 | 140 | model = PerpetualBooster( 141 | objective="SquaredLoss", 142 | monotone_constraints=mono_, 143 | iteration_limit=10, 144 | memory_limit=1.0, 145 | ) 146 | model.fit(X, y) 147 | preds = model.predict(X) 148 | save_func(model, f64_model_path) 149 | model_loaded = load_func(f64_model_path) 150 | assert 
model_loaded.feature_names_in_ == model.feature_names_in_ 151 | assert model_loaded.feature_names_in_ == X.columns.to_list() 152 | assert all(preds == model_loaded.predict(X)) 153 | assert all( 154 | [ 155 | model.monotone_constraints[ft] == model_loaded.monotone_constraints[ft] 156 | for ft in model_loaded.feature_names_in_ 157 | ] 158 | ) 159 | assert all( 160 | [ 161 | model.monotone_constraints[ft] == model_loaded.monotone_constraints[ft] 162 | for ft in model.feature_names_in_ 163 | ] 164 | ) 165 | assert all( 166 | [ 167 | model.monotone_constraints[ft] == model_loaded.monotone_constraints[ft] 168 | for ft in mono_.keys() 169 | ] 170 | ) 171 | 172 | # LogLoss 173 | f64_model_path = tmp_path / "modelf64_ll.json" 174 | X, y = X_y() 175 | X = X 176 | model = PerpetualBooster( 177 | objective="LogLoss", 178 | monotone_constraints=mono_, 179 | iteration_limit=10, 180 | memory_limit=1.0, 181 | ) 182 | model.fit(X, y) 183 | preds = model.predict(X) 184 | save_func(model, f64_model_path) 185 | model_loaded = load_func(f64_model_path) 186 | assert all(preds == model_loaded.predict(X)) 187 | -------------------------------------------------------------------------------- /python-package/tests/test_serialize.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import numpy as np 4 | import pytest 5 | 6 | from perpetual.serialize import ( 7 | NumpySerializer, 8 | ObjectItem, 9 | ObjectSerializer, 10 | Scaler, 11 | ScalerSerializer, 12 | ) 13 | 14 | scaler_values = [ 15 | 1, 16 | 1.0, 17 | 1.00101, 18 | "a string", 19 | True, 20 | False, 21 | None, 22 | ] 23 | 24 | 25 | @pytest.mark.parametrize("value", scaler_values) 26 | def test_scaler(value: Scaler): 27 | serializer = ScalerSerializer() 28 | r = serializer.serialize(value) 29 | assert isinstance(r, str) 30 | assert value == serializer.deserialize(r) 31 | 32 | 33 | object_values = [ 34 | [1, 2, 3], 35 | [1.0, 4.0], 36 | ["a", "b", "c"], 37 | {"a": 1.0, "b": 2.0}, 38 | {"a": "test", "b": "what"}, 39 | *scaler_values, 40 | ] 41 | 42 | 43 | @pytest.mark.parametrize("value", object_values) 44 | def test_object(value: ObjectItem): 45 | serializer = ObjectSerializer() 46 | r = serializer.serialize(value) 47 | assert isinstance(r, str) 48 | assert value == serializer.deserialize(r) 49 | 50 | 51 | numpy_values = [ 52 | np.array([1.0, 2.23]), 53 | np.array([1, 2, 3, 4, 5, 6]).reshape((2, 3)), 54 | np.array([1, 2, 3, 4, 5, 6], dtype="int").reshape((2, 3)), 55 | ] 56 | 57 | 58 | @pytest.mark.parametrize("value", numpy_values) 59 | def test_numpy(value: np.ndarray): 60 | serializer = NumpySerializer() 61 | r = serializer.serialize(value) 62 | assert isinstance(r, str) 63 | assert np.array_equal(value, serializer.deserialize(r)) 64 | -------------------------------------------------------------------------------- /resources/perp_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/perpetual-ml/perpetual/581262534205b6bc8fd85694359a33c8983e8918/resources/perp_logo.png -------------------------------------------------------------------------------- /rust-toolchain: -------------------------------------------------------------------------------- 1 | nightly 2 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | max_width = 120 
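A note on the constraint helper shown in python-package/src/utils.rs near the top of this section: it converts the per-feature integers passed from Python (-1, 0, 1) into the crate's Constraint enum. Below is a minimal standalone sketch of that mapping, assuming the enum and map type defined later in src/constraints.rs; the helper name and plain-String error handling here are illustrative, not the actual PyO3 wrapper:

use std::collections::HashMap;

// Mirrors src/constraints.rs: the direction a feature is allowed to push the prediction.
#[derive(Debug, Clone, Copy)]
enum Constraint {
    Positive,
    Negative,
    Unconstrained,
}

type ConstraintMap = HashMap<usize, Constraint>;

// Hypothetical helper: turn {feature_index: -1 | 0 | 1} into a ConstraintMap,
// rejecting anything else (the real binding raises a PyValueError instead).
fn to_constraint_map(raw: &HashMap<usize, i8>) -> Result<ConstraintMap, String> {
    let mut constraints = ConstraintMap::new();
    for (feature, value) in raw {
        let c = match *value {
            -1 => Constraint::Negative,
            0 => Constraint::Unconstrained,
            1 => Constraint::Positive,
            other => {
                return Err(format!(
                    "Valid monotone constraints are -1, 1 or 0, but '{}' was provided for feature number {}.",
                    other, feature
                ))
            }
        };
        constraints.insert(*feature, c);
    }
    Ok(constraints)
}

fn main() {
    let raw = HashMap::from([(0usize, 1i8), (3, -1), (5, 0)]);
    println!("{:?}", to_constraint_map(&raw).unwrap());
}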
-------------------------------------------------------------------------------- /scripts/make_resources.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import seaborn as sns 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.datasets import fetch_california_housing, fetch_covtype, fetch_openml 5 | from perpetual.utils import convert_input_frame, transform_input_frame 6 | 7 | if __name__ == "__main__": 8 | df = sns.load_dataset("titanic") 9 | df.to_csv("resources/titanic.csv", index=False) 10 | 11 | X = df.select_dtypes("number").drop(columns=["survived"]).astype(float) 12 | y = df["survived"].astype(float) 13 | 14 | pd.Series(X.fillna(0).to_numpy().ravel(order="F")).to_csv( 15 | "resources/contiguous_no_missing.csv", 16 | index=False, 17 | header=False, 18 | ) 19 | 20 | pd.Series(X.to_numpy().ravel(order="F")).to_csv( 21 | "resources/contiguous_with_missing.csv", 22 | index=False, 23 | header=False, 24 | ) 25 | 26 | y.to_csv( 27 | "resources/performance.csv", 28 | index=False, 29 | header=False, 30 | ) 31 | 32 | X.fare.to_csv( 33 | "resources/performance-fare.csv", 34 | index=False, 35 | header=False, 36 | ) 37 | 38 | dfb = df.sample( 39 | 100_000, 40 | random_state=0, 41 | replace=True, 42 | ).reset_index(drop=True) 43 | 44 | Xb = dfb.select_dtypes("number").drop(columns=["survived"]).astype(float) 45 | yb = dfb["survived"].astype(float) 46 | 47 | pd.Series(Xb.fillna(0).to_numpy().ravel(order="F")).to_csv( 48 | "resources/contiguous_no_missing_100k_samp_seed0.csv", 49 | index=False, 50 | header=False, 51 | ) 52 | 53 | yb.to_csv( 54 | "resources/performance_100k_samp_seed0.csv", 55 | index=False, 56 | header=False, 57 | ) 58 | 59 | data = fetch_california_housing(as_frame=True) 60 | data_train, data_test = train_test_split(data.frame, test_size=0.2, random_state=42) 61 | data_train.to_csv("resources/cal_housing_train.csv", index=False) 62 | data_test.to_csv("resources/cal_housing_test.csv", index=False) 63 | 64 | data = fetch_covtype(as_frame=True) 65 | data_train, data_test = train_test_split(data.frame, test_size=0.2, random_state=42) 66 | data_train.to_csv("resources/cover_types_train.csv", index=False) 67 | data_test.to_csv("resources/cover_types_test.csv", index=False) 68 | 69 | 70 | 71 | 72 | 73 | X = df.drop(columns=["survived"]) 74 | y = df["survived"] 75 | 76 | X["sex"] = pd.get_dummies(X["sex"], drop_first=True, dtype=float).to_numpy() 77 | X["adult_male"] = pd.get_dummies(X["adult_male"], drop_first=True, dtype=float).to_numpy() 78 | X.drop(columns=["alive"], inplace=True) 79 | X["alone"] = pd.get_dummies(X["alone"], drop_first=True, dtype=float).to_numpy() 80 | cols = ['pclass', 'sibsp', 'parch', 'embarked', 'class', 'who', 'deck', 'embark_town'] 81 | X[cols] = X[cols].astype('category') 82 | 83 | data_train, data_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 84 | 85 | features_, titanic_train_flat, rows, cols, categorical_features_, cat_mapping = convert_input_frame(data_train, "auto", 1000) 86 | features_, titanic_test_flat, rows, cols = transform_input_frame(data_test, cat_mapping) 87 | 88 | data_test.to_csv("resources/titanic_test_df.csv", index=False) 89 | 90 | pd.Series(titanic_train_flat).to_csv("resources/titanic_train_flat.csv", index=False, header=False) 91 | pd.Series(titanic_test_flat).to_csv("resources/titanic_test_flat.csv", index=False, header=False) 92 | pd.Series(y_train).to_csv("resources/titanic_train_y.csv", index=False, header=False) 93 
| pd.Series(y_test).to_csv("resources/titanic_test_y.csv", index=False, header=False) 94 | 95 | 96 | # https://www.openml.org/search?type=data&id=546&sort=runs&status=active 97 | df = fetch_openml(data_id=546) 98 | X = df.data 99 | y = df.target 100 | features_, sensory_flat, rows, cols, categorical_features_, cat_mapping = convert_input_frame(X, "auto", 1000) 101 | pd.Series(sensory_flat).to_csv("resources/sensory_flat.csv", index=False, header=False) 102 | pd.Series(y).to_csv("resources/sensory_y.csv", index=False, header=False) 103 | -------------------------------------------------------------------------------- /scripts/remove-optional-deps.py: -------------------------------------------------------------------------------- 1 | # Building with polars is sooo slow. 2 | # It's only there for the example, so let's remove it 3 | # in the regular build process. 4 | # Requires toml package 5 | import shutil 6 | 7 | import toml 8 | 9 | ct = toml.load("Cargo.toml") 10 | 11 | del ct["dev-dependencies"] 12 | del ct["bench"] 13 | 14 | with open("Cargo.toml", "w") as file: 15 | toml.dump(ct, file) 16 | 17 | # Also delete the rust example. 18 | shutil.rmtree("examples") 19 | -------------------------------------------------------------------------------- /scripts/run-python-tests.ps1: -------------------------------------------------------------------------------- 1 | Set-Location python-package 2 | python -m black python/perpetual/ 3 | python -m black tests/ 4 | python -m black examples/ 5 | maturin develop --release 6 | pytest . 7 | Set-Location .. -------------------------------------------------------------------------------- /scripts/run-python-tests.sh: -------------------------------------------------------------------------------- 1 | cd python-package 2 | python -m black python/perpetual/ 3 | python -m black tests/ 4 | python -m black examples/ 5 | maturin develop --release 6 | pytest . 7 | cd .. -------------------------------------------------------------------------------- /scripts/run-single-python-test.ps1: -------------------------------------------------------------------------------- 1 | Set-Location python-package 2 | python -m black python/perpetual/ 3 | python -m black tests/ 4 | python -m black examples/ 5 | maturin develop --release 6 | pytest tests/test_booster.py::test_predict_nodes -s 7 | Set-Location .. -------------------------------------------------------------------------------- /scripts/uv_script.ps1: -------------------------------------------------------------------------------- 1 | Set-Location python-package 2 | uv sync 3 | .venv\Scripts\activate 4 | uv pip install pip 5 | uv pip install -r pyproject.toml --extra dev 6 | Set-Location .. 7 | -------------------------------------------------------------------------------- /scripts/uv_script.sh: -------------------------------------------------------------------------------- 1 | cd python-package 2 | uv sync 3 | source .venv\Scripts\activate 4 | uv pip install pip 5 | uv pip install -r pyproject.toml --extra dev 6 | cd .. 7 | -------------------------------------------------------------------------------- /src/bin.rs: -------------------------------------------------------------------------------- 1 | use std::{cell::UnsafeCell, cmp::Ordering}; 2 | 3 | use crate::data::FloatData; 4 | 5 | use serde::{Deserialize, Serialize}; 6 | 7 | /// Struct to hold the information of a given bin. 
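// Aside: scripts/make_resources.py above flattens each feature frame in column-major
// ("F") order, which is exactly the layout Matrix::new expects. A minimal usage sketch
// of reading one of those flat files back and binning it, mirroring the binning test
// further below; it assumes the perpetual crate is available and the program is run
// from the repository root, and the 891 rows x 5 columns come from the titanic resource:

use std::fs;

use perpetual::binning::bin_matrix;
use perpetual::Matrix;

fn main() {
    // one value per line, column after column, as written by the script above
    let file = fs::read_to_string("resources/contiguous_no_missing.csv").unwrap();
    let data_vec: Vec<f64> = file.lines().map(|x| x.parse::<f64>().unwrap()).collect();

    // 891 rows and 5 numeric titanic features, stored contiguously column by column
    let data = Matrix::new(&data_vec, 891, 5);

    // bin every column into at most 10 buckets; NaN marks missing values and
    // no columns are treated as categorical here
    let binned = bin_matrix(&data, None, 10, f64::NAN, None).unwrap();
    println!("cut values per column: {:?}", binned.nunique);
}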
8 | #[derive(Debug, Deserialize, Serialize, Clone)] 9 | pub struct Bin { 10 | pub num: u16, 11 | pub cut_value: f64, 12 | pub g_folded: [f32; 5], 13 | pub h_folded: Option<[f32; 5]>, 14 | pub counts: [usize; 5], 15 | } 16 | 17 | impl Bin { 18 | pub fn empty_const_hess(num: u16, cut_value: f64) -> Self { 19 | Bin { 20 | num, 21 | cut_value, 22 | g_folded: [f32::ZERO; 5], 23 | h_folded: None, 24 | counts: [0; 5], 25 | } 26 | } 27 | pub fn empty(num: u16, cut_value: f64) -> Self { 28 | Bin { 29 | num, 30 | cut_value, 31 | g_folded: [f32::ZERO; 5], 32 | h_folded: Some([f32::ZERO; 5]), 33 | counts: [0; 5], 34 | } 35 | } 36 | 37 | pub fn from_parent_child(root_bin: *mut Bin, child_bin: *mut Bin, update_bin: *mut Bin) { 38 | let rb = unsafe { root_bin.as_ref().unwrap() }; 39 | let cb = unsafe { child_bin.as_ref().unwrap() }; 40 | let ub = unsafe { update_bin.as_mut().unwrap() }; 41 | for ((z, a), b) in ub.g_folded.iter_mut().zip(rb.g_folded).zip(cb.g_folded) { 42 | *z = a - b; 43 | } 44 | for ((z, a), b) in ub.counts.iter_mut().zip(rb.counts).zip(cb.counts) { 45 | *z = a - b; 46 | } 47 | 48 | match rb.h_folded { 49 | Some(_h_folded) => { 50 | let h_f_iter = ub.h_folded.as_mut().unwrap().iter_mut(); 51 | for ((zval, aval), bval) in h_f_iter.zip(rb.h_folded.unwrap()).zip(cb.h_folded.unwrap()) { 52 | *zval = aval - bval; 53 | } 54 | } 55 | None => { 56 | ub.h_folded = None; 57 | } 58 | }; 59 | } 60 | 61 | pub fn from_parent_two_children( 62 | root_bin: *mut Bin, 63 | first_bin: *mut Bin, 64 | second_bin: *mut Bin, 65 | update_bin: *mut Bin, 66 | ) { 67 | let rb = unsafe { root_bin.as_ref().unwrap() }; 68 | let fb = unsafe { first_bin.as_ref().unwrap() }; 69 | let sb = unsafe { second_bin.as_ref().unwrap() }; 70 | let ub = unsafe { update_bin.as_mut().unwrap() }; 71 | for (((z, a), b), c) in ub 72 | .g_folded 73 | .iter_mut() 74 | .zip(rb.g_folded) 75 | .zip(fb.g_folded) 76 | .zip(sb.g_folded) 77 | { 78 | *z = a - b - c; 79 | } 80 | for (((z, a), b), c) in ub.counts.iter_mut().zip(rb.counts).zip(fb.counts).zip(sb.counts) { 81 | *z = a - b - c; 82 | } 83 | 84 | match rb.h_folded { 85 | Some(_h_folded) => { 86 | let h_f_iter = ub.h_folded.as_mut().unwrap().iter_mut(); 87 | for (((z, a), b), c) in h_f_iter 88 | .zip(rb.h_folded.unwrap()) 89 | .zip(fb.h_folded.unwrap()) 90 | .zip(sb.h_folded.unwrap()) 91 | { 92 | *z = a - b - c; 93 | } 94 | } 95 | None => { 96 | ub.h_folded = None; 97 | } 98 | }; 99 | } 100 | } 101 | 102 | pub fn sort_cat_bins_by_num(histogram: &mut [&UnsafeCell]) { 103 | unsafe { 104 | histogram.sort_unstable_by_key(|bin| bin.get().as_ref().unwrap().num); 105 | } 106 | } 107 | 108 | pub fn sort_cat_bins_by_stat(histogram: &mut [&UnsafeCell], is_const_hess: bool) { 109 | unsafe { 110 | if is_const_hess { 111 | histogram.sort_unstable_by(|bin1, bin2| { 112 | let b1 = bin1.get().as_ref().unwrap(); 113 | let b2 = bin2.get().as_ref().unwrap(); 114 | if b1.num == 0 { 115 | return Ordering::Less; 116 | } else if b2.num == 0 { 117 | return Ordering::Greater; 118 | } 119 | let div1: f32 = b1.g_folded.iter().sum::() / b1.counts.iter().sum::() as f32; 120 | let div2: f32 = b2.g_folded.iter().sum::() / b2.counts.iter().sum::() as f32; 121 | div2.partial_cmp(&div1).unwrap_or(Ordering::Less) 122 | }); 123 | } else { 124 | histogram.sort_unstable_by(|bin1, bin2| { 125 | let b1 = bin1.get().as_ref().unwrap(); 126 | let b2 = bin2.get().as_ref().unwrap(); 127 | if b1.num == 0 { 128 | return Ordering::Less; 129 | } else if b2.num == 0 { 130 | return Ordering::Greater; 131 | } 132 | let div1: f32 = 
b1.g_folded.iter().sum::() / b1.h_folded.unwrap().iter().sum::(); 133 | let div2: f32 = b2.g_folded.iter().sum::() / b2.h_folded.unwrap().iter().sum::(); 134 | div2.partial_cmp(&div1).unwrap_or(Ordering::Less) 135 | }); 136 | } 137 | } 138 | } 139 | 140 | #[cfg(test)] 141 | mod tests { 142 | use super::*; 143 | 144 | #[test] 145 | fn test_bin() { 146 | let mut root_bin = Bin::empty_const_hess(0, 0.0); 147 | root_bin.counts = [10, 10, 10, 10, 10]; 148 | let mut child_bin = Bin::empty_const_hess(1, 0.0); 149 | child_bin.counts = [9, 8, 7, 6, 5]; 150 | let mut update_bin = Bin::empty_const_hess(2, 0.0); 151 | Bin::from_parent_child( 152 | &mut root_bin as *mut Bin, 153 | &mut child_bin as *mut Bin, 154 | &mut update_bin as *mut Bin, 155 | ); 156 | assert!(update_bin.counts == [1, 2, 3, 4, 5]); 157 | } 158 | } 159 | -------------------------------------------------------------------------------- /src/binning.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashSet; 2 | 3 | use crate::data::{FloatData, JaggedMatrix, Matrix}; 4 | use crate::errors::PerpetualError; 5 | use crate::utils::{is_missing, map_bin, percentiles}; 6 | 7 | /// If there are fewer unique values than their are 8 | /// percentiles, just return the unique values of the 9 | /// vectors. 10 | /// 11 | /// * `v` - A numeric slice to calculate percentiles for. 12 | /// * `sample_weight` - Instance weights for each row in the data. 13 | fn percentiles_or_value(v: &[T], sample_weight: &[T], pcts: &[T]) -> Vec 14 | where 15 | T: FloatData, 16 | { 17 | let mut v_u = v.to_owned(); 18 | v_u.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap()); 19 | v_u.dedup(); 20 | if v_u.len() <= pcts.len() + 1 { 21 | v_u 22 | } else { 23 | percentiles(v, sample_weight, pcts) 24 | } 25 | } 26 | 27 | // We want to be able to bin our dataset into discrete buckets. 28 | // First we will calculate percentiles and the number of unique values 29 | // for each feature. 30 | // Then we will bucket them into bins from 0 to N + 1 where N is the number 31 | // of unique bin values created from the percentiles, and the very last 32 | // bin is missing values. 33 | // For now, we will just use usize, although, it would be good to see if 34 | // we can use something smaller, u8 for instance. 35 | // If we generated these cuts: 36 | // [0.0, 7.8958, 14.4542, 31.0, 512.3292, inf] 37 | // We would have a number with bins 0 (missing), 1 [MIN, 0.0), 2 (0.0, 7], 3 [], 4, 5 38 | // a split that is [feature < 5] would translate to [feature < 31.0 ] 39 | #[derive(Debug)] 40 | pub struct BinnedData { 41 | pub binned_data: Vec, 42 | pub cuts: JaggedMatrix, 43 | pub nunique: Vec, 44 | } 45 | 46 | /// Convert a matrix of data, into a binned matrix. 47 | /// 48 | /// * `data` - Numeric data to be binned. 49 | /// * `cuts` - A slice of Vectors, where the vectors are the corresponding 50 | /// cut values for each of the columns. 51 | fn bin_matrix_from_cuts>(data: &Matrix, cuts: &JaggedMatrix, missing: &T) -> Vec { 52 | // loop through the matrix, binning the data. 53 | // We will determine the column we are in, by 54 | // using the modulo operator, on the record value. 55 | data.data 56 | .iter() 57 | .enumerate() 58 | .map(|(i, v)| { 59 | let col = i / data.rows; 60 | // This will always be smaller than u16::MAX so we 61 | // are good to just unwrap here. 62 | map_bin(cuts.get_col(col), v, missing).unwrap() 63 | }) 64 | .collect() 65 | } 66 | 67 | /// Bin a numeric matrix. 
68 | /// 69 | /// * `data` - A numeric matrix, of data to be binned. 70 | /// * `sample_weight` - Instance weights for each row of the data. 71 | /// * `nbins` - The number of bins each column should be binned into. 72 | /// * `missing` - Float value to consider as missing. 73 | pub fn bin_matrix( 74 | data: &Matrix, 75 | sample_weight: Option<&[f64]>, 76 | nbins: u16, 77 | missing: f64, 78 | cat_index: Option<&HashSet>, 79 | ) -> Result, PerpetualError> { 80 | let mut pcts = Vec::new(); 81 | let nbins_ = f64::from_u16(nbins); 82 | for i in 0..nbins { 83 | let v = f64::from_u16(i) / nbins_; 84 | pcts.push(v); 85 | } 86 | 87 | let s_w = vec![1.0; data.rows]; 88 | let weight = match sample_weight { 89 | Some(sample_weight) => sample_weight, 90 | None => &s_w, 91 | }; 92 | 93 | let to_remove = match cat_index { 94 | Some(cat_index) => HashSet::from_iter(cat_index), 95 | None => HashSet::new(), 96 | }; 97 | let mut num_index: Vec = (0..data.cols).collect(); 98 | num_index.retain(|e| !to_remove.contains(&(*e))); 99 | let num_index_set: HashSet = HashSet::from_iter(num_index); 100 | 101 | // First we need to generate the bins for each of the columns. 102 | // We will loop through all of the columns, and generate the cuts. 103 | let mut cuts = JaggedMatrix::new(); 104 | let mut nunique = Vec::new(); 105 | for i in 0..data.cols { 106 | let (no_miss, w): (Vec, Vec) = data 107 | .get_col(i) 108 | .iter() 109 | .zip(weight.iter()) 110 | // It is unrecoverable if they have provided missing values in 111 | // the data other than the specificized missing. 112 | .filter(|(v, _)| !is_missing(v, &missing)) 113 | .unzip(); 114 | assert_eq!(no_miss.len(), w.len()); 115 | 116 | if num_index_set.contains(&i) { 117 | let mut col_cuts = percentiles_or_value(&no_miss, &w, &pcts); 118 | col_cuts.push(f64::MAX); 119 | col_cuts.dedup(); 120 | // if col_cuts.len() < 2 { 121 | // return Err(PerpetualError::NoVariance(i)); 122 | // } 123 | // There will be one less bins, then there are cuts. 124 | // The first value will be for missing. 125 | nunique.push(col_cuts.len()); 126 | let l = col_cuts.len(); 127 | cuts.data.extend(col_cuts); 128 | let e = match cuts.ends.last() { 129 | Some(v) => v + l, 130 | None => l, 131 | }; 132 | cuts.ends.push(e); 133 | } else { 134 | // There will be number of bins as many as number of categories. Number of bins for categorical features is not limited currently. 
135 | let col_categories: HashSet = HashSet::from_iter(no_miss.iter().map(|&e| e as u16)); 136 | let mut col_cuts: Vec = col_categories.iter().map(|&e| e as f64).collect(); 137 | col_cuts.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap()); 138 | col_cuts.push(f64::MAX); 139 | nunique.push(col_cuts.len()); 140 | let l = col_cuts.len(); 141 | cuts.data.extend(col_cuts); 142 | let e = match cuts.ends.last() { 143 | Some(v) => v + l, 144 | None => l, 145 | }; 146 | cuts.ends.push(e); 147 | } 148 | } 149 | 150 | cuts.cols = cuts.ends.len(); 151 | cuts.n_records = cuts.ends.iter().sum(); 152 | 153 | let binned_data = bin_matrix_from_cuts(data, &cuts, &missing); 154 | 155 | Ok(BinnedData { 156 | binned_data, 157 | cuts, 158 | nunique, 159 | }) 160 | } 161 | 162 | #[cfg(test)] 163 | mod tests { 164 | use super::*; 165 | use std::fs; 166 | #[test] 167 | fn test_bin_data() { 168 | let file = 169 | fs::read_to_string("resources/contiguous_no_missing.csv").expect("Something went wrong reading the file"); 170 | let data_vec: Vec = file.lines().map(|x| x.parse::().unwrap()).collect(); 171 | let data = Matrix::new(&data_vec, 891, 5); 172 | let b = bin_matrix(&data, None, 10, f64::NAN, None).unwrap(); 173 | let bdata = Matrix::new(&b.binned_data, data.rows, data.cols); 174 | for column in 0..data.cols { 175 | let mut b_compare = 1; 176 | for cuts in b.cuts.get_col(column).windows(2) { 177 | let c1 = cuts[0]; 178 | let c2 = cuts[1]; 179 | let mut n_v = 0; 180 | let mut n_b = 0; 181 | for (bin, value) in bdata.get_col(column).iter().zip(data.get_col(column)) { 182 | if *bin == b_compare { 183 | n_b += 1; 184 | } 185 | if (c1 <= *value) && (*value < c2) { 186 | n_v += 1; 187 | } 188 | } 189 | assert_eq!(n_v, n_b); 190 | b_compare += 1; 191 | } 192 | } 193 | println!("{:?}", b); 194 | } 195 | 196 | #[test] 197 | fn test_bin_data_categorical() { 198 | let file = 199 | fs::read_to_string("resources/titanic_train_flat.csv").expect("Something went wrong reading the file"); 200 | let n_rows = 712; 201 | let n_columns = 13; 202 | let n_lines = n_columns * n_rows; 203 | let data_vec: Vec = file 204 | .lines() 205 | .take(n_lines) 206 | .map(|x| x.trim().parse::().unwrap_or(f64::NAN)) 207 | .collect(); 208 | let data = Matrix::new(&data_vec, n_rows, n_columns); 209 | let cat_index = HashSet::from([0, 3, 4, 6, 7, 8, 10, 11]); 210 | 211 | let b = bin_matrix(&data, None, 256, f64::NAN, Some(&cat_index)).unwrap(); 212 | let bdata = Matrix::new(&b.binned_data, data.rows, data.cols); 213 | 214 | println!("{:?}", b.cuts); 215 | println!("{:?}", b.nunique); 216 | 217 | for column in 0..data.cols { 218 | let mut b_compare = 1; 219 | for cuts in b.cuts.get_col(column).windows(2) { 220 | let c1 = cuts[0]; 221 | let c2 = cuts[1]; 222 | let mut n_v = 0; 223 | let mut n_b = 0; 224 | for (bin, value) in bdata.get_col(column).iter().zip(data.get_col(column)) { 225 | if *bin == b_compare { 226 | n_b += 1; 227 | } 228 | if (c1 <= *value) && (*value < c2) { 229 | n_v += 1; 230 | } 231 | } 232 | assert_eq!(n_v, n_b); 233 | b_compare += 1; 234 | } 235 | } 236 | } 237 | } 238 | -------------------------------------------------------------------------------- /src/booster/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod booster; 2 | pub mod multi_output; 3 | pub mod predict; 4 | pub mod setters; 5 | -------------------------------------------------------------------------------- /src/booster/setters.rs: -------------------------------------------------------------------------------- 1 | 
use super::booster::MissingNodeTreatment; 2 | use crate::{constraints::ConstraintMap, objective_functions::Objective, PerpetualBooster}; 3 | use std::collections::HashSet; 4 | 5 | impl PerpetualBooster { 6 | // Set methods for paramters 7 | 8 | /// Set the objective on the booster. 9 | /// * `objective` - The objective type of the booster. 10 | pub fn set_objective(mut self, objective: Objective) -> Self { 11 | self.objective = objective; 12 | self 13 | } 14 | 15 | /// Set the budget on the booster. 16 | /// * `budget` - Budget to fit the booster. 17 | pub fn set_budget(mut self, budget: f32) -> Self { 18 | self.budget = budget; 19 | self 20 | } 21 | 22 | /// Set the base_score on the booster. 23 | /// * `base_score` - The base score of the booster. 24 | pub fn set_base_score(mut self, base_score: f64) -> Self { 25 | self.base_score = base_score; 26 | self 27 | } 28 | 29 | /// Set the number of bins on the booster. 30 | /// * `max_bin` - Number of bins to calculate to partition the data. Setting this to 31 | /// a smaller number, will result in faster training time, while potentially sacrificing 32 | /// accuracy. If there are more bins, than unique values in a column, all unique values 33 | /// will be used. 34 | pub fn set_max_bin(mut self, max_bin: u16) -> Self { 35 | self.max_bin = max_bin; 36 | self 37 | } 38 | 39 | /// Set the number of threads on the booster. 40 | /// * `num_threads` - Set the number of threads to be used during training. 41 | pub fn set_num_threads(mut self, num_threads: Option) -> Self { 42 | self.num_threads = num_threads; 43 | self 44 | } 45 | 46 | /// Set the monotone_constraints on the booster. 47 | /// * `monotone_constraints` - The monotone constraints of the booster. 48 | pub fn set_monotone_constraints(mut self, monotone_constraints: Option) -> Self { 49 | self.monotone_constraints = monotone_constraints; 50 | self 51 | } 52 | 53 | /// Set the force_children_to_bound_parent on the booster. 54 | /// * `force_children_to_bound_parent` - Set force children to bound parent. 55 | pub fn set_force_children_to_bound_parent(mut self, force_children_to_bound_parent: bool) -> Self { 56 | self.force_children_to_bound_parent = force_children_to_bound_parent; 57 | self 58 | } 59 | 60 | /// Set missing value of the booster 61 | /// * `missing` - Float value to consider as missing. 62 | pub fn set_missing(mut self, missing: f64) -> Self { 63 | self.missing = missing; 64 | self 65 | } 66 | 67 | /// Set the allow_missing_splits on the booster. 68 | /// * `allow_missing_splits` - Set if missing splits are allowed for the booster. 69 | pub fn set_allow_missing_splits(mut self, allow_missing_splits: bool) -> Self { 70 | self.allow_missing_splits = allow_missing_splits; 71 | self 72 | } 73 | 74 | /// Set create missing value of the booster 75 | /// * `create_missing_branch` - Bool specifying if missing should get it's own 76 | /// branch. 77 | pub fn set_create_missing_branch(mut self, create_missing_branch: bool) -> Self { 78 | self.create_missing_branch = create_missing_branch; 79 | self 80 | } 81 | 82 | /// Set the features where whose missing nodes should 83 | /// always be terminated. 84 | /// * `terminate_missing_features` - Hashset of the feature indices for the 85 | /// features that should always terminate the missing node, if create_missing_branch 86 | /// is true. 
87 | pub fn set_terminate_missing_features(mut self, terminate_missing_features: HashSet) -> Self { 88 | self.terminate_missing_features = terminate_missing_features; 89 | self 90 | } 91 | 92 | /// Set the missing_node_treatment on the booster. 93 | /// * `missing_node_treatment` - The missing node treatment of the booster. 94 | pub fn set_missing_node_treatment(mut self, missing_node_treatment: MissingNodeTreatment) -> Self { 95 | self.missing_node_treatment = missing_node_treatment; 96 | self 97 | } 98 | 99 | /// Set the log iterations on the booster. 100 | /// * `log_iterations` - The number of log iterations of the booster. 101 | pub fn set_log_iterations(mut self, log_iterations: usize) -> Self { 102 | self.log_iterations = log_iterations; 103 | self 104 | } 105 | 106 | /// Set the log iterations on the booster. 107 | /// * `log_iterations` - The number of log iterations of the booster. 108 | pub fn set_ref_log_iterations(mut self, log_iterations: usize) -> Self { 109 | self.log_iterations = log_iterations; 110 | self 111 | } 112 | 113 | /// Set the seed on the booster. 114 | /// * `seed` - Integer value used to see any randomness used in the algorithm. 115 | pub fn set_seed(mut self, seed: u64) -> Self { 116 | self.seed = seed; 117 | self 118 | } 119 | 120 | /// Set the quantile on the booster. 121 | /// * `quantile` - used only in quantile regression. 122 | pub fn set_quantile(mut self, quantile: Option) -> Self { 123 | self.quantile = quantile; 124 | self 125 | } 126 | 127 | /// Set the reset on the booster. 128 | /// * `reset` - Reset the model or continue training. 129 | pub fn set_reset(mut self, reset: Option) -> Self { 130 | self.reset = reset; 131 | self 132 | } 133 | 134 | /// Set the categorical features on the booster. 135 | /// * `categorical_features` - categorical features. 136 | pub fn set_categorical_features(mut self, categorical_features: Option>) -> Self { 137 | self.categorical_features = categorical_features; 138 | self 139 | } 140 | 141 | /// Set the timeout on the booster. 142 | /// * `timeout` - fit timeout limit in seconds. 143 | pub fn set_timeout(mut self, timeout: Option) -> Self { 144 | self.timeout = timeout; 145 | self 146 | } 147 | 148 | /// Set the iteration limit on the booster. 149 | /// * `iteration_limit` - optional limit for the number of boosting rounds. 150 | pub fn set_iteration_limit(mut self, iteration_limit: Option) -> Self { 151 | self.iteration_limit = iteration_limit; 152 | self 153 | } 154 | 155 | /// Set the memory limit on the booster. 156 | /// * `memory_limit` - optional limit for memory allocation. 157 | pub fn set_memory_limit(mut self, memory_limit: Option) -> Self { 158 | self.memory_limit = memory_limit; 159 | self 160 | } 161 | 162 | /// Set the stopping rounds on the booster. 163 | /// * `stopping_rounds` - optional limit for auto stopping rounds. 
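// Taken together, the setters in this file form a builder-style API: each one consumes
// `self` and returns it, so configuration can be chained before fitting. A short usage
// sketch with arbitrary, illustrative values (the conformal test further below uses the
// same pattern):

use perpetual::objective_functions::Objective;
use perpetual::PerpetualBooster;

fn main() {
    let booster = PerpetualBooster::default()
        .set_objective(Objective::SquaredLoss)
        .set_max_bin(128)
        .set_budget(0.5)
        .set_seed(42);
    // `booster` is now configured and ready for a call to fit(...).
    let _ = booster;
}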
164 | pub fn set_stopping_rounds(mut self, stopping_rounds: Option) -> Self { 165 | self.stopping_rounds = stopping_rounds; 166 | self 167 | } 168 | } 169 | -------------------------------------------------------------------------------- /src/conformal/cqr.rs: -------------------------------------------------------------------------------- 1 | use crate::{errors::PerpetualError, objective_functions::Objective, utils::percentiles, Matrix, PerpetualBooster}; 2 | use std::collections::HashMap; 3 | 4 | pub type CalData<'a> = (Matrix<'a, f64>, &'a [f64], &'a [f64]); // (x_flat_data, rows, cols), y, alpha 5 | 6 | impl PerpetualBooster { 7 | /// Calibrate models to get prediction intervals 8 | /// * `alpha` - Alpha list to train calibration models for 9 | pub fn calibrate( 10 | &mut self, 11 | data: &Matrix, 12 | y: &[f64], 13 | sample_weight: Option<&[f64]>, 14 | data_cal: CalData, 15 | ) -> Result<(), PerpetualError> { 16 | let (x_cal, y_cal, alpha) = data_cal; 17 | 18 | for alpha_ in alpha { 19 | let lower_quantile = Some(alpha_ / 2.0); 20 | let mut model_lower = PerpetualBooster::default() 21 | .set_objective(Objective::QuantileLoss) 22 | .set_quantile(lower_quantile); 23 | model_lower.fit(&data, &y, sample_weight)?; 24 | 25 | let upper_quantile = Some(1.0 - alpha_ / 2.0); 26 | let mut model_upper = PerpetualBooster::default() 27 | .set_objective(Objective::QuantileLoss) 28 | .set_quantile(upper_quantile); 29 | model_upper.fit(&data, &y, sample_weight)?; 30 | 31 | let y_cal_pred_lower = model_lower.predict(&x_cal, true); 32 | let y_cal_pred_upper = model_upper.predict(&x_cal, true); 33 | let mut scores: Vec = Vec::with_capacity(y_cal.len()); 34 | for i in 0..y_cal.len() { 35 | scores.push(f64::max(y_cal_pred_lower[i] - y_cal[i], y_cal[i] - y_cal_pred_upper[i])); 36 | } 37 | let perc = (1.0 - (*alpha_ as f64)) * (1.0 + 1.0 * (1.0 / (scores.len() as f64))); 38 | let score = percentiles(&scores, &vec![1.0; scores.len()], &vec![perc])[0]; 39 | self.cal_models 40 | .insert(alpha_.to_string(), [(model_lower, -score), (model_upper, score)]); 41 | } 42 | Ok(()) 43 | } 44 | 45 | pub fn predict_intervals(&self, data: &Matrix, parallel: bool) -> HashMap>> { 46 | let mut intervals = HashMap::new(); 47 | for (alpha, value) in &self.cal_models { 48 | let (model_lower, score_lower) = &value[0]; 49 | let (model_upper, score_upper) = &value[1]; 50 | let lower_preds = model_lower 51 | .predict(data, parallel) 52 | .iter() 53 | .map(|p| p + score_lower) 54 | .collect(); 55 | let upper_preds = model_upper 56 | .predict(data, parallel) 57 | .iter() 58 | .map(|p| p + score_upper) 59 | .collect(); 60 | intervals.insert(alpha.to_string(), vec![lower_preds, upper_preds]); 61 | } 62 | intervals 63 | } 64 | } 65 | 66 | #[cfg(test)] 67 | mod tests { 68 | use super::*; 69 | use crate::objective_functions::Objective; 70 | use polars::io::SerReader; 71 | use polars::prelude::{CsvReadOptions, DataType}; 72 | use std::error::Error; 73 | use std::sync::Arc; 74 | 75 | #[test] 76 | fn test_cqr() -> Result<(), Box> { 77 | let all_names = [ 78 | "MedInc".to_string(), 79 | "HouseAge".to_string(), 80 | "AveRooms".to_string(), 81 | "AveBedrms".to_string(), 82 | "Population".to_string(), 83 | "AveOccup".to_string(), 84 | "Latitude".to_string(), 85 | "Longitude".to_string(), 86 | "MedHouseVal".to_string(), 87 | ]; 88 | 89 | let feature_names = [ 90 | "MedInc".to_string(), 91 | "HouseAge".to_string(), 92 | "AveRooms".to_string(), 93 | "AveBedrms".to_string(), 94 | "Population".to_string(), 95 | "AveOccup".to_string(), 96 | 
"Latitude".to_string(), 97 | "Longitude".to_string(), 98 | ]; 99 | 100 | let column_names_train = Arc::new(all_names.clone()); 101 | let column_names_test = Arc::new(all_names.clone()); 102 | 103 | let df_train = CsvReadOptions::default() 104 | .with_has_header(true) 105 | .with_columns(Some(column_names_train)) 106 | .try_into_reader_with_file_path(Some("resources/cal_housing_train.csv".into()))? 107 | .finish() 108 | .unwrap(); 109 | 110 | let df_test = CsvReadOptions::default() 111 | .with_has_header(true) 112 | .with_columns(Some(column_names_test)) 113 | .try_into_reader_with_file_path(Some("resources/cal_housing_test.csv".into()))? 114 | .finish() 115 | .unwrap(); 116 | 117 | // Get data in column major format... 118 | let id_vars_train: Vec<&str> = Vec::new(); 119 | let mdf_train = df_train.unpivot(feature_names.clone(), &id_vars_train)?; 120 | let id_vars_test: Vec<&str> = Vec::new(); 121 | let mdf_test = df_test.unpivot(feature_names, &id_vars_test)?; 122 | 123 | let data_train = Vec::from_iter( 124 | mdf_train 125 | .select_at_idx(1) 126 | .expect("Invalid column") 127 | .f64()? 128 | .into_iter() 129 | .map(|v| v.unwrap_or(f64::NAN)), 130 | ); 131 | let data_test = Vec::from_iter( 132 | mdf_test 133 | .select_at_idx(1) 134 | .expect("Invalid column") 135 | .f64()? 136 | .into_iter() 137 | .map(|v| v.unwrap_or(f64::NAN)), 138 | ); 139 | 140 | let y_train = Vec::from_iter( 141 | df_train 142 | .column("MedHouseVal")? 143 | .cast(&DataType::Float64)? 144 | .f64()? 145 | .into_iter() 146 | .map(|v| v.unwrap_or(f64::NAN)), 147 | ); 148 | let y_test = Vec::from_iter( 149 | df_test 150 | .column("MedHouseVal")? 151 | .cast(&DataType::Float64)? 152 | .f64()? 153 | .into_iter() 154 | .map(|v| v.unwrap_or(f64::NAN)), 155 | ); 156 | 157 | // Create Matrix from ndarray. 
158 | let matrix_train = Matrix::new(&data_train, y_train.len(), 8); 159 | let matrix_test = Matrix::new(&data_test, y_test.len(), 8); 160 | 161 | let mut model = PerpetualBooster::default() 162 | .set_objective(Objective::SquaredLoss) 163 | .set_max_bin(10) 164 | .set_budget(0.1); 165 | 166 | model.fit(&matrix_train, &y_train, None)?; 167 | 168 | let alpha = vec![0.1]; 169 | let data_cal = (matrix_test, y_test.as_slice(), alpha.as_slice()); 170 | 171 | model.calibrate(&matrix_train, &y_train, None, data_cal)?; 172 | 173 | let matrix_test = Matrix::new(&data_test, y_test.len(), 8); 174 | let _intervals = model.predict_intervals(&matrix_test, true); 175 | 176 | Ok(()) 177 | } 178 | } 179 | -------------------------------------------------------------------------------- /src/conformal/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod cqr; 2 | -------------------------------------------------------------------------------- /src/constants.rs: -------------------------------------------------------------------------------- 1 | pub const STOPPING_ROUNDS: usize = 3; 2 | pub const FREE_MEM_ALLOC_FACTOR: f32 = 0.9; 3 | pub const N_NODES_ALLOC_MIN: usize = 100; 4 | pub const N_NODES_ALLOC_MAX: usize = 10000; 5 | pub const ITER_LIMIT: usize = 1000; 6 | pub const GENERALIZATION_THRESHOLD: f32 = 1.0; 7 | pub const GENERALIZATION_THRESHOLD_RELAXED: f32 = 0.99; 8 | pub const MIN_COL_AMOUNT: usize = 40; 9 | pub const HESSIAN_EPS: f32 = 1e-3; 10 | -------------------------------------------------------------------------------- /src/constraints.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | use std::collections::HashMap; 3 | 4 | #[derive(Debug, Deserialize, Serialize, Clone, Copy)] 5 | pub enum Constraint { 6 | Positive, 7 | Negative, 8 | Unconstrained, 9 | } 10 | 11 | pub type ConstraintMap = HashMap; 12 | -------------------------------------------------------------------------------- /src/errors.rs: -------------------------------------------------------------------------------- 1 | use thiserror::Error; 2 | 3 | #[derive(Debug, Error)] 4 | pub enum PerpetualError { 5 | #[error("Feature number {0} has no variance, when missing values are excluded.")] 6 | NoVariance(usize), 7 | #[error("Unable to write model to file: {0}")] 8 | UnableToWrite(String), 9 | #[error("Unable to read model from a file {0}")] 10 | UnableToRead(String), 11 | #[error("The value {0} is set to missing, but a NaN value was found in the data.")] 12 | NANVAlueFound(f64), 13 | #[error("Invalid value {0} passed for {1}, expected one of {2}.")] 14 | ParseString(String, String, String), 15 | /// First value is the name of the parameter, second is expected, third is what was passed. 
16 | #[error("Invalid parameter value passed for {0}, expected {1} but {2} provided.")] 17 | InvalidParameter(String, String, String), 18 | } 19 | -------------------------------------------------------------------------------- /src/grower.rs: -------------------------------------------------------------------------------- 1 | use serde::Deserialize; 2 | use serde::Serialize; 3 | 4 | use crate::node::SplittableNode; 5 | use std::collections::BinaryHeap; 6 | use std::collections::VecDeque; 7 | 8 | pub trait Grower { 9 | fn add_node(&mut self, node: SplittableNode); 10 | fn get_next_node(&mut self) -> SplittableNode; 11 | fn is_empty(&self) -> bool; 12 | } 13 | 14 | impl Grower for BinaryHeap { 15 | fn add_node(&mut self, node: SplittableNode) { 16 | self.push(node); 17 | } 18 | 19 | fn get_next_node(&mut self) -> SplittableNode { 20 | self.pop().expect("Grower should not be empty") 21 | } 22 | 23 | fn is_empty(&self) -> bool { 24 | self.is_empty() 25 | } 26 | } 27 | 28 | impl Grower for VecDeque { 29 | fn add_node(&mut self, node: SplittableNode) { 30 | self.push_front(node); 31 | } 32 | 33 | fn get_next_node(&mut self) -> SplittableNode { 34 | self.pop_back().expect("Grower should not be empty") 35 | } 36 | 37 | fn is_empty(&self) -> bool { 38 | self.is_empty() 39 | } 40 | } 41 | 42 | #[derive(Serialize, Deserialize)] 43 | pub enum GrowPolicy { 44 | DepthWise, 45 | LossGuide, 46 | } 47 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![feature(array_ptr_get)] 2 | 3 | mod node; 4 | mod partial_dependence; 5 | mod shapley; 6 | 7 | // Modules 8 | pub mod objective_functions; 9 | pub mod bin; 10 | pub mod binning; 11 | pub mod booster; 12 | pub mod conformal; 13 | pub mod constants; 14 | pub mod constraints; 15 | pub mod data; 16 | pub mod errors; 17 | pub mod grower; 18 | pub mod histogram; 19 | pub mod metrics; 20 | pub mod prune; 21 | pub mod sampler; 22 | pub mod splitter; 23 | pub mod tree; 24 | pub mod utils; 25 | 26 | // Individual classes, and functions 27 | pub use booster::booster::PerpetualBooster; 28 | pub use booster::multi_output::MultiOutputBooster; 29 | pub use data::Matrix; 30 | -------------------------------------------------------------------------------- /src/metrics/classification/metrics.rs: -------------------------------------------------------------------------------- 1 | use crate::metrics::*; 2 | 3 | pub struct LogLossMetric {} 4 | impl EvaluationMetric for LogLossMetric { 5 | fn calculate_metric(y: &[f64], yhat: &[f64], sample_weight: &[f64], _alpha: Option) -> f64 { 6 | log_loss(y, yhat, sample_weight) 7 | } 8 | fn maximize() -> bool { 9 | false 10 | } 11 | } 12 | 13 | pub struct AUCMetric {} 14 | impl EvaluationMetric for AUCMetric { 15 | fn calculate_metric(y: &[f64], yhat: &[f64], sample_weight: &[f64], _alpha: Option) -> f64 { 16 | roc_auc_score(y, yhat, sample_weight) 17 | } 18 | fn maximize() -> bool { 19 | true 20 | } 21 | } 22 | 23 | pub fn log_loss(y: &[f64], yhat: &[f64], sample_weight: &[f64]) -> f64 { 24 | let mut w_sum = 0.; 25 | let res = y 26 | .iter() 27 | .zip(yhat) 28 | .zip(sample_weight) 29 | .map(|((y_, yhat_), w_)| { 30 | w_sum += *w_; 31 | let yhat_ = f64::ONE / (f64::ONE + (-*yhat_).exp()); 32 | -(*y_ * yhat_.ln() + (f64::ONE - *y_) * ((f64::ONE - yhat_).ln())) * *w_ 33 | }) 34 | .sum::(); 35 | res / w_sum 36 | } 37 | 38 | fn trapezoid_area(x0: f64, x1: f64, y0: f64, y1: f64) -> f64 { 39 | (x0 - x1).abs() * 
(y0 + y1) * 0.5 40 | } 41 | 42 | pub fn roc_auc_score(y: &[f64], yhat: &[f64], sample_weight: &[f64]) -> f64 { 43 | let mut indices = (0..y.len()).collect::>(); 44 | indices.sort_unstable_by(|&a, &b| yhat[b].total_cmp(&yhat[a])); 45 | let mut auc: f64 = 0.0; 46 | 47 | let mut label = y[indices[0]]; 48 | let mut w = sample_weight[indices[0]]; 49 | let mut fp = (1.0 - label) * w; 50 | let mut tp: f64 = label * w; 51 | let mut tp_prev: f64 = 0.0; 52 | let mut fp_prev: f64 = 0.0; 53 | 54 | for i in 1..indices.len() { 55 | if yhat[indices[i]] != yhat[indices[i - 1]] { 56 | auc += trapezoid_area(fp_prev, fp, tp_prev, tp); 57 | tp_prev = tp; 58 | fp_prev = fp; 59 | } 60 | label = y[indices[i]]; 61 | w = sample_weight[indices[i]]; 62 | fp += (1.0 - label) * w; 63 | tp += label * w; 64 | } 65 | 66 | auc += trapezoid_area(fp_prev, fp, tp_prev, tp); 67 | if fp <= 0.0 || tp <= 0.0 { 68 | auc = 0.0; 69 | fp = 0.0; 70 | tp = 0.0; 71 | } 72 | 73 | auc / (tp * fp) 74 | } 75 | 76 | -------------------------------------------------------------------------------- /src/metrics/classification/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod metrics; 2 | pub use metrics::*; -------------------------------------------------------------------------------- /src/metrics/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod classification; 2 | pub mod regression; 3 | 4 | use crate::data::FloatData; 5 | use crate::errors::PerpetualError; 6 | use crate::utils::items_to_strings; 7 | use serde::{Deserialize, Serialize}; 8 | use std::str::FromStr; 9 | 10 | pub type MetricFn = fn(&[f64], &[f64], &[f64], Option) -> f64; 11 | 12 | /// Compare to metric values, determining if b is better. 13 | /// If one of them is NaN favor the non NaN value. 14 | /// If both are NaN, consider the first value to be better. 15 | pub fn is_comparison_better(value: f64, comparison: f64, maximize: bool) -> bool { 16 | match (value.is_nan(), comparison.is_nan()) { 17 | // Both nan, comparison is not better, 18 | // Or comparison is nan, also not better 19 | (true, true) | (false, true) => false, 20 | // comparison is not Nan, it's better 21 | (true, false) => true, 22 | // Perform numerical comparison. 23 | (false, false) => { 24 | // If we are maximizing is the comparison 25 | // greater, than the current value 26 | if maximize { 27 | value < comparison 28 | // If we are minimizing is the comparison 29 | // less than the current value. 
30 | } else { 31 | value > comparison 32 | } 33 | } 34 | } 35 | } 36 | 37 | #[derive(Debug, Deserialize, Serialize, Clone, Copy)] 38 | pub enum Metric { 39 | AUC, 40 | LogLoss, 41 | RootMeanSquaredLogError, 42 | RootMeanSquaredError, 43 | QuantileLoss, 44 | } 45 | 46 | impl FromStr for Metric { 47 | type Err = PerpetualError; 48 | 49 | fn from_str(s: &str) -> Result { 50 | match s { 51 | "AUC" => Ok(Metric::AUC), 52 | "LogLoss" => Ok(Metric::LogLoss), 53 | "RootMeanSquaredLogError" => Ok(Metric::RootMeanSquaredLogError), 54 | "RootMeanSquaredError" => Ok(Metric::RootMeanSquaredError), 55 | 56 | _ => Err(PerpetualError::ParseString( 57 | s.to_string(), 58 | "Metric".to_string(), 59 | items_to_strings(vec![ 60 | "AUC", 61 | "LogLoss", 62 | "RootMeanSquaredLogError", 63 | "RootMeanSquaredError", 64 | ]), 65 | )), 66 | } 67 | } 68 | } 69 | 70 | pub fn metric_callables(metric_type: &Metric) -> (MetricFn, bool) { 71 | match metric_type { 72 | Metric::AUC => ( 73 | classification::AUCMetric::calculate_metric, 74 | classification::AUCMetric::maximize(), 75 | ), 76 | Metric::LogLoss => ( 77 | classification::LogLossMetric::calculate_metric, 78 | classification::LogLossMetric::maximize(), 79 | ), 80 | Metric::RootMeanSquaredLogError => ( 81 | regression::RootMeanSquaredLogErrorMetric::calculate_metric, 82 | regression::RootMeanSquaredLogErrorMetric::maximize(), 83 | ), 84 | Metric::RootMeanSquaredError => ( 85 | regression::RootMeanSquaredErrorMetric::calculate_metric, 86 | regression::RootMeanSquaredErrorMetric::maximize(), 87 | ), 88 | Metric::QuantileLoss => ( 89 | regression::QuantileLossMetric::calculate_metric, 90 | regression::QuantileLossMetric::maximize(), 91 | ), 92 | } 93 | } 94 | 95 | pub trait EvaluationMetric { 96 | fn calculate_metric(y: &[f64], yhat: &[f64], sample_weight: &[f64], alpha: Option) -> f64; 97 | fn maximize() -> bool; 98 | } 99 | 100 | #[cfg(test)] 101 | mod tests { 102 | use crate::metrics::classification::*; 103 | use crate::metrics::regression::*; 104 | use crate::utils::precision_round; 105 | #[test] 106 | fn test_root_mean_squared_log_error() { 107 | let y = vec![1., 3., 4., 5., 2., 4., 6.]; 108 | let yhat = vec![3., 2., 3., 4., 4., 4., 4.]; 109 | let sample_weight = vec![1., 1., 1., 1., 1., 2., 2.]; 110 | let res = root_mean_squared_log_error(&y, &yhat, &sample_weight); 111 | assert_eq!(precision_round(res, 4), 0.3549); 112 | } 113 | #[test] 114 | fn test_root_mean_squared_error() { 115 | let y = vec![1., 3., 4., 5., 2., 4., 6.]; 116 | let yhat = vec![3., 2., 3., 4., 4., 4., 4.]; 117 | let sample_weight = vec![1., 1., 1., 1., 1., 2., 2.]; 118 | let res = root_mean_squared_error(&y, &yhat, &sample_weight); 119 | assert_eq!(precision_round(res, 6), 1.452966); 120 | } 121 | 122 | #[test] 123 | fn test_log_loss() { 124 | let y = vec![1., 0., 1., 0., 0., 0., 0.]; 125 | let yhat = vec![0.5, 0.01, -0., 1.05, 0., -4., 0.]; 126 | let sample_weight = vec![1., 1., 1., 1., 1., 2., 2.]; 127 | let res = log_loss(&y, &yhat, &sample_weight); 128 | assert_eq!(precision_round(res, 5), 0.59235); 129 | } 130 | 131 | #[test] 132 | fn test_auc_real_data() { 133 | let y = vec![1., 0., 1., 0., 0., 0., 0.]; 134 | let yhat = vec![0.5, 0.01, -0., 1.05, 0., -4., 0.]; 135 | let sample_weight = vec![1., 1., 1., 1., 1., 2., 2.]; 136 | let res = roc_auc_score(&y, &yhat, &sample_weight); 137 | assert_eq!(precision_round(res, 5), 0.67857); 138 | } 139 | 140 | #[test] 141 | fn test_auc_generc() { 142 | let sample_weight: Vec = vec![1.; 2]; 143 | 144 | let y: Vec = vec![0., 1.]; 145 | let yhat: Vec = 
vec![0., 1.]; 146 | let auc_score = roc_auc_score(&y, &yhat, &sample_weight); 147 | assert_eq!(auc_score, 1.); 148 | 149 | let y: Vec = vec![0., 1.]; 150 | let yhat: Vec = vec![1., 0.]; 151 | let auc_score = roc_auc_score(&y, &yhat, &sample_weight); 152 | assert_eq!(auc_score, 0.); 153 | 154 | let y: Vec = vec![1., 0.]; 155 | let yhat: Vec = vec![1., 1.]; 156 | let auc_score = roc_auc_score(&y, &yhat, &sample_weight); 157 | assert_eq!(auc_score, 0.5); 158 | 159 | let y: Vec = vec![1., 0.]; 160 | let yhat: Vec = vec![1., 0.]; 161 | let auc_score = roc_auc_score(&y, &yhat, &sample_weight); 162 | assert_eq!(auc_score, 1.0); 163 | 164 | let y: Vec = vec![1., 0.]; 165 | let yhat: Vec = vec![0.5, 0.5]; 166 | let auc_score = roc_auc_score(&y, &yhat, &sample_weight); 167 | assert_eq!(auc_score, 0.5); 168 | 169 | let y: Vec = vec![0., 0.]; 170 | let yhat: Vec = vec![0.25, 0.75]; 171 | let auc_score = roc_auc_score(&y, &yhat, &sample_weight); 172 | assert!(auc_score.is_nan()); 173 | 174 | let y: Vec = vec![1., 1.]; 175 | let yhat: Vec = vec![0.25, 0.75]; 176 | let auc_score = roc_auc_score(&y, &yhat, &sample_weight); 177 | assert!(auc_score.is_nan()); 178 | } 179 | } 180 | -------------------------------------------------------------------------------- /src/metrics/regression/metrics.rs: -------------------------------------------------------------------------------- 1 | use crate::metrics::*; 2 | 3 | pub struct QuantileLossMetric {} 4 | impl EvaluationMetric for QuantileLossMetric { 5 | fn calculate_metric(y: &[f64], yhat: &[f64], sample_weight: &[f64], alpha: Option) -> f64 { 6 | quantile_loss(y, yhat, sample_weight, alpha) 7 | } 8 | fn maximize() -> bool { 9 | false 10 | } 11 | } 12 | 13 | pub struct RootMeanSquaredLogErrorMetric {} 14 | impl EvaluationMetric for RootMeanSquaredLogErrorMetric { 15 | fn calculate_metric(y: &[f64], yhat: &[f64], sample_weight: &[f64], _alpha: Option) -> f64 { 16 | root_mean_squared_log_error(y, yhat, sample_weight) 17 | } 18 | fn maximize() -> bool { 19 | false 20 | } 21 | } 22 | 23 | pub struct RootMeanSquaredErrorMetric {} 24 | impl EvaluationMetric for RootMeanSquaredErrorMetric { 25 | fn calculate_metric(y: &[f64], yhat: &[f64], sample_weight: &[f64], _alpha: Option) -> f64 { 26 | root_mean_squared_error(y, yhat, sample_weight) 27 | } 28 | fn maximize() -> bool { 29 | false 30 | } 31 | } 32 | 33 | pub fn quantile_loss(y: &[f64], yhat: &[f64], sample_weight: &[f64], alpha: Option) -> f64 { 34 | let mut w_sum = 0.; 35 | let res = y 36 | .iter() 37 | .zip(yhat) 38 | .zip(sample_weight) 39 | .map(|((y_, yhat_), w_)| { 40 | w_sum += *w_; 41 | let _alpha = alpha.unwrap() as f64; 42 | let s = *y_ - *yhat_; 43 | let l = if s >= 0.0 { _alpha * s } else { (1.0 - _alpha) * s }; 44 | l * *w_ 45 | }) 46 | .sum::(); 47 | res / w_sum 48 | } 49 | 50 | pub fn root_mean_squared_log_error(y: &[f64], yhat: &[f64], sample_weight: &[f64]) -> f64 { 51 | let mut w_sum = 0.; 52 | let res = y 53 | .iter() 54 | .zip(yhat) 55 | .zip(sample_weight) 56 | .map(|((y_, yhat_), w_)| { 57 | w_sum += *w_; 58 | (y_.ln_1p() - yhat_.ln_1p()).powi(2) * *w_ 59 | }) 60 | .sum::(); 61 | (res / w_sum).sqrt() 62 | } 63 | 64 | pub fn root_mean_squared_error(y: &[f64], yhat: &[f64], sample_weight: &[f64]) -> f64 { 65 | let mut w_sum = 0.; 66 | let res = y 67 | .iter() 68 | .zip(yhat) 69 | .zip(sample_weight) 70 | .map(|((y_, yhat_), w_)| { 71 | w_sum += *w_; 72 | (y_ - yhat_).powi(2) * *w_ 73 | }) 74 | .sum::(); 75 | (res / w_sum).sqrt() 76 | } 
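To make the sample-weight handling in the regression metrics above concrete, here is a small standalone check; it is only a sketch that recomputes by hand the same toy case asserted in the metric tests earlier in src/metrics/mod.rs:

fn main() {
    let y = [1.0_f64, 3., 4., 5., 2., 4., 6.];
    let yhat = [3.0_f64, 2., 3., 4., 4., 4., 4.];
    let w = [1.0_f64, 1., 1., 1., 1., 2., 2.];

    // weighted mean of squared errors, then the square root
    let (mut num, mut den) = (0.0, 0.0);
    for i in 0..y.len() {
        num += (y[i] - yhat[i]).powi(2) * w[i];
        den += w[i];
    }
    let rmse = (num / den).sqrt();

    // agrees with the 1.452966 value asserted in test_root_mean_squared_error
    println!("weighted RMSE = {:.6}", rmse);
}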
-------------------------------------------------------------------------------- /src/metrics/regression/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod metrics; 2 | pub use metrics::*; -------------------------------------------------------------------------------- /src/objective_functions/adaptive_huber_loss.rs: -------------------------------------------------------------------------------- 1 | use super::ObjectiveFunction; 2 | use crate::metrics::Metric; 3 | 4 | /// Adaptive Huber Loss 5 | #[derive(Default)] 6 | pub struct AdaptiveHuberLoss {} 7 | impl ObjectiveFunction for AdaptiveHuberLoss { 8 | fn calc_loss(y: &[f64], yhat: &[f64], sample_weight: Option<&[f64]>, quantile: Option) -> Vec { 9 | // default alpha: 0.5 10 | // if not passed explicitly 11 | let alpha = quantile.unwrap_or(0.5); 12 | let n = y.len(); 13 | 14 | let mut abs_res = y 15 | .iter() 16 | .zip(yhat.iter()) 17 | .map(|(&yi, &yh)| (yi - yh).abs()) 18 | .collect::>(); 19 | abs_res.sort_by(|a, b| a.partial_cmp(b).unwrap()); 20 | 21 | let idx = ((n as f64) * alpha).floor() as usize; 22 | let delta = abs_res[idx.min(n - 1)]; 23 | 24 | match sample_weight { 25 | Some(weights) => y 26 | .iter() 27 | .zip(yhat.iter()) 28 | .enumerate() 29 | .map(|(i, (&yi, &yh))| { 30 | let r = yi - yh; 31 | let ar = r.abs(); 32 | let base = if ar <= delta { 33 | 0.5 * r * r 34 | } else { 35 | delta * (ar - 0.5 * delta) 36 | }; 37 | (base * weights[i]) as f32 38 | }) 39 | .collect(), 40 | None => y 41 | .iter() 42 | .zip(yhat.iter()) 43 | .map(|(&yi, &yh)| { 44 | let r = yi - yh; 45 | let ar = r.abs(); 46 | let loss = if ar <= delta { 47 | 0.5 * r * r 48 | } else { 49 | delta * (ar - 0.5 * delta) 50 | }; 51 | loss as f32 52 | }) 53 | .collect(), 54 | } 55 | } 56 | 57 | fn calc_grad_hess( 58 | y: &[f64], 59 | yhat: &[f64], 60 | sample_weight: Option<&[f64]>, 61 | quantile: Option, 62 | ) -> (Vec, Option>) { 63 | // default alpha: 0.5 64 | // if not passed explicitly 65 | let alpha = quantile.unwrap_or(0.5); 66 | let n = y.len(); 67 | 68 | let mut abs_res = y 69 | .iter() 70 | .zip(yhat.iter()) 71 | .map(|(&yi, &yh)| (yi - yh).abs()) 72 | .collect::>(); 73 | abs_res.sort_by(|a, b| a.partial_cmp(b).unwrap()); 74 | let idx = ((n as f64) * alpha).floor() as usize; 75 | let delta = abs_res[idx.min(n - 1)]; 76 | 77 | match sample_weight { 78 | Some(weights) => { 79 | let (grad, hess): (Vec, Vec) = y 80 | .iter() 81 | .zip(yhat.iter()) 82 | .enumerate() 83 | .map(|(i, (&yi, &yh))| { 84 | let r = yi - yh; 85 | let ar = r.abs(); 86 | let sign = (yh - yi).signum(); 87 | let g = if ar <= delta { 88 | (yh - yi) * weights[i] 89 | } else { 90 | delta * sign * weights[i] 91 | }; 92 | let h = if ar <= delta { weights[i] } else { 0.0 }; 93 | (g as f32, h as f32) 94 | }) 95 | .unzip(); 96 | (grad, Some(hess)) 97 | } 98 | None => { 99 | let (grad, hess): (Vec, Vec) = y 100 | .iter() 101 | .zip(yhat.iter()) 102 | .map(|(&yi, &yh)| { 103 | let r = yi - yh; 104 | let ar = r.abs(); 105 | let sign = (yh - yi).signum(); 106 | let g = if ar <= delta { yh - yi } else { delta * sign }; 107 | let h = if ar <= delta { 1.0 } else { 0.0 }; 108 | (g as f32, h as f32) 109 | }) 110 | .unzip(); 111 | (grad, Some(hess)) 112 | } 113 | } 114 | } 115 | 116 | fn calc_init(y: &[f64], sample_weight: Option<&[f64]>, _quantile: Option) -> f64 { 117 | let mut idxs = (0..y.len()).collect::>(); 118 | idxs.sort_by(|&i, &j| y[i].partial_cmp(&y[j]).unwrap()); 119 | 120 | let total_w = sample_weight.map(|w| 
w.iter().sum::()).unwrap_or(y.len() as f64); 121 | let target = total_w * 0.5; 122 | 123 | // find weighted median via scan() 124 | let median = idxs 125 | .iter() 126 | .scan(0.0, |cum, &i| { 127 | *cum += sample_weight.map_or(1.0, |w| w[i]); 128 | Some((i, *cum)) 129 | }) 130 | .find(|&(_i, cum)| cum >= target) 131 | .map(|(i, _)| y[i]) 132 | .unwrap_or(y[idxs[y.len() / 2]]); 133 | 134 | median 135 | } 136 | 137 | fn default_metric() -> Metric { 138 | Metric::RootMeanSquaredError 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /src/objective_functions/huber_loss.rs: -------------------------------------------------------------------------------- 1 | use super::ObjectiveFunction; 2 | use crate::metrics::Metric; 3 | 4 | /// Huber Loss 5 | #[derive(Default)] 6 | pub struct HuberLoss {} 7 | impl ObjectiveFunction for HuberLoss { 8 | fn calc_loss(y: &[f64], yhat: &[f64], sample_weight: Option<&[f64]>, delta: Option) -> Vec { 9 | // Default delta value 10 | let delta = delta.unwrap_or(1.0); 11 | match sample_weight { 12 | Some(weights) => y 13 | .iter() 14 | .zip(yhat.iter()) 15 | .enumerate() 16 | .map(|(i, (&yi, &yh))| { 17 | let r = yi - yh; 18 | let ar = r.abs(); 19 | let base = if ar <= delta { 20 | 0.5 * r * r 21 | } else { 22 | delta * (ar - 0.5 * delta) 23 | }; 24 | (base * weights[i]) as f32 25 | }) 26 | .collect(), 27 | None => y 28 | .iter() 29 | .zip(yhat.iter()) 30 | .map(|(&yi, &yh)| { 31 | let r = yi - yh; 32 | let ar = r.abs(); 33 | let loss = if ar <= delta { 34 | 0.5 * r * r 35 | } else { 36 | delta * (ar - 0.5 * delta) 37 | }; 38 | loss as f32 39 | }) 40 | .collect(), 41 | } 42 | } 43 | 44 | fn calc_grad_hess( 45 | y: &[f64], 46 | yhat: &[f64], 47 | sample_weight: Option<&[f64]>, 48 | delta: Option, 49 | ) -> (Vec, Option>) { 50 | // default delta value 51 | let delta = delta.unwrap_or(1.0); 52 | 53 | match sample_weight { 54 | Some(weights) => { 55 | let (grad, hess): (Vec, Vec) = y 56 | .iter() 57 | .zip(yhat.iter()) 58 | .enumerate() 59 | .map(|(i, (&yi, &yh))| { 60 | let r = yi - yh; 61 | let ar = r.abs(); 62 | let sign = (yh - yi).signum(); 63 | let g = if ar <= delta { 64 | (yh - yi) * weights[i] 65 | } else { 66 | delta * sign * weights[i] 67 | }; 68 | let h = if ar <= delta { weights[i] } else { 0.0 }; 69 | (g as f32, h as f32) 70 | }) 71 | .unzip(); 72 | (grad, Some(hess)) 73 | } 74 | None => { 75 | let (grad, hess): (Vec, Vec) = y 76 | .iter() 77 | .zip(yhat.iter()) 78 | .map(|(&yi, &yh)| { 79 | let r = yi - yh; 80 | let ar = r.abs(); 81 | let sign = (yh - yi).signum(); 82 | let g = if ar <= delta { yh - yi } else { delta * sign }; 83 | let h = if ar <= delta { 1.0 } else { 0.0 }; 84 | (g as f32, h as f32) 85 | }) 86 | .unzip(); 87 | (grad, Some(hess)) 88 | } 89 | } 90 | } 91 | 92 | fn calc_init(y: &[f64], sample_weight: Option<&[f64]>, _quantile: Option) -> f64 { 93 | let mut idxs = (0..y.len()).collect::>(); 94 | idxs.sort_by(|&i, &j| y[i].partial_cmp(&y[j]).unwrap()); 95 | 96 | let total_w = sample_weight.map(|w| w.iter().sum::()).unwrap_or(y.len() as f64); 97 | let target = total_w * 0.5; 98 | 99 | let median = idxs 100 | .iter() 101 | .scan(0.0, |cum, &i| { 102 | *cum += sample_weight.map_or(1.0, |w| w[i]); 103 | Some((i, *cum)) 104 | }) 105 | .find(|&(_i, cum)| cum >= target) 106 | .map(|(i, _)| y[i]) 107 | .unwrap_or(y[idxs[y.len() / 2]]); 108 | 109 | median 110 | } 111 | 112 | fn default_metric() -> Metric { 113 | Metric::RootMeanSquaredError 114 | } 115 | } 116 | 
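Both Huber variants above initialise the model in calc_init at the weighted median of the target: sort the targets, then scan cumulative sample weight until half of the total weight is reached. A standalone sketch of that calculation follows; the helper name is illustrative:

// Weighted median: the smallest y whose cumulative weight reaches half of the total weight.
fn weighted_median(y: &[f64], w: &[f64]) -> f64 {
    let mut idx: Vec<usize> = (0..y.len()).collect();
    idx.sort_by(|&i, &j| y[i].partial_cmp(&y[j]).unwrap());

    let half = w.iter().sum::<f64>() * 0.5;
    let mut cum = 0.0;
    for &i in &idx {
        cum += w[i];
        if cum >= half {
            return y[i];
        }
    }
    y[idx[y.len() / 2]] // fallback, mirroring the unwrap_or in calc_init
}

fn main() {
    let y = [1.0, 5.0, 2.0, 8.0, 3.0];
    let w = [1.0, 1.0, 4.0, 1.0, 1.0];
    // the large weight on 2.0 pulls the initial prediction towards it
    println!("weighted median = {}", weighted_median(&y, &w)); // prints 2
}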
-------------------------------------------------------------------------------- /src/objective_functions/log_loss.rs: -------------------------------------------------------------------------------- 1 | use super::ObjectiveFunction; 2 | use crate::{data::FloatData, metrics::Metric, utils::fast_sum}; 3 | 4 | #[derive(Default)] 5 | pub struct LogLoss {} 6 | 7 | impl ObjectiveFunction for LogLoss { 8 | #[inline] 9 | fn calc_loss(y: &[f64], yhat: &[f64], sample_weight: Option<&[f64]>, _quantile: Option<f64>) -> Vec<f32> { 10 | match sample_weight { 11 | Some(sample_weight) => y 12 | .iter() 13 | .zip(yhat) 14 | .zip(sample_weight) 15 | .map(|((y_, yhat_), w_)| { 16 | let yhat_ = f64::ONE / (f64::ONE + (-*yhat_).exp()); 17 | (-(*y_ * yhat_.ln() + (f64::ONE - *y_) * ((f64::ONE - yhat_).ln())) * *w_) as f32 18 | }) 19 | .collect(), 20 | None => y 21 | .iter() 22 | .zip(yhat) 23 | .map(|(y_, yhat_)| { 24 | let yhat_ = f64::ONE / (f64::ONE + (-*yhat_).exp()); 25 | (-(*y_ * yhat_.ln() + (f64::ONE - *y_) * ((f64::ONE - yhat_).ln()))) as f32 26 | }) 27 | .collect(), 28 | } 29 | } 30 | 31 | fn calc_init(y: &[f64], sample_weight: Option<&[f64]>, _quantile: Option<f64>) -> f64 { 32 | match sample_weight { 33 | Some(sample_weight) => { 34 | let mut ytot: f64 = 0.; 35 | let mut ntot: f64 = 0.; 36 | for i in 0..y.len() { 37 | ytot += sample_weight[i] * y[i]; 38 | ntot += sample_weight[i]; 39 | } 40 | f64::ln(ytot / (ntot - ytot)) 41 | } 42 | None => { 43 | let ytot = fast_sum(y); 44 | let ntot = y.len() as f64; 45 | f64::ln(ytot / (ntot - ytot)) 46 | } 47 | } 48 | } 49 | 50 | #[inline] 51 | fn calc_grad_hess( 52 | y: &[f64], 53 | yhat: &[f64], 54 | sample_weight: Option<&[f64]>, 55 | _quantile: Option<f64>, 56 | ) -> (Vec<f32>, Option<Vec<f32>>) { 57 | match sample_weight { 58 | Some(sample_weight) => { 59 | let (g, h) = y 60 | .iter() 61 | .zip(yhat) 62 | .zip(sample_weight) 63 | .map(|((y_, yhat_), w_)| { 64 | let yhat_ = f64::ONE / (f64::ONE + (-*yhat_).exp()); 65 | (((yhat_ - *y_) * *w_) as f32, (yhat_ * (f64::ONE - yhat_) * *w_) as f32) 66 | }) 67 | .unzip(); 68 | (g, Some(h)) 69 | } 70 | None => { 71 | let (g, h) = y 72 | .iter() 73 | .zip(yhat) 74 | .map(|(y_, yhat_)| { 75 | let yhat_ = f64::ONE / (f64::ONE + (-*yhat_).exp()); 76 | ((yhat_ - *y_) as f32, (yhat_ * (f64::ONE - yhat_)) as f32) 77 | }) 78 | .unzip(); 79 | (g, Some(h)) 80 | } 81 | } 82 | } 83 | 84 | fn default_metric() -> Metric { 85 | Metric::LogLoss 86 | } 87 | } 88 |
-------------------------------------------------------------------------------- /src/objective_functions/mod.rs: -------------------------------------------------------------------------------- 1 | mod adaptive_huber_loss; 2 | mod huber_loss; 3 | mod log_loss; 4 | mod quantile_loss; 5 | mod squared_loss; 6 | 7 | pub use adaptive_huber_loss::AdaptiveHuberLoss; 8 | pub use huber_loss::HuberLoss; 9 | pub use log_loss::LogLoss; 10 | pub use quantile_loss::QuantileLoss; 11 | pub use squared_loss::SquaredLoss; 12 | 13 | use crate::metrics::Metric; 14 | use serde::{Deserialize, Serialize}; 15 | 16 | type ObjFn = fn(&[f64], &[f64], Option<&[f64]>, Option<f64>) -> (Vec<f32>, Option<Vec<f32>>); 17 | type LossFn = fn(&[f64], &[f64], Option<&[f64]>, Option<f64>) -> Vec<f32>; 18 | 19 | #[derive(Debug, Deserialize, Serialize, Clone)] 20 | pub enum Objective { 21 | LogLoss, 22 | SquaredLoss, 23 | QuantileLoss, 24 | AdaptiveHuberLoss, 25 | HuberLoss, 26 | } 27 | 28 | pub fn loss_callables(objective: &Objective) -> LossFn { 29 | match objective { 30 | Objective::LogLoss => LogLoss::calc_loss, 31 | Objective::SquaredLoss => SquaredLoss::calc_loss, 32 | Objective::QuantileLoss => QuantileLoss::calc_loss, 33 | Objective::AdaptiveHuberLoss => AdaptiveHuberLoss::calc_loss, 34 | Objective::HuberLoss => HuberLoss::calc_loss, 35 | } 36 | } 37 | 38 | pub fn gradient_hessian_callables(objective: &Objective) -> ObjFn { 39 | match objective { 40 | Objective::LogLoss => LogLoss::calc_grad_hess, 41 | Objective::SquaredLoss => SquaredLoss::calc_grad_hess, 42 | Objective::QuantileLoss => QuantileLoss::calc_grad_hess, 43 | Objective::AdaptiveHuberLoss => AdaptiveHuberLoss::calc_grad_hess, 44 | Objective::HuberLoss => HuberLoss::calc_grad_hess, 45 | } 46 | } 47 | 48 | pub fn calc_init_callables(objective: &Objective) -> fn(&[f64], Option<&[f64]>, Option<f64>) -> f64 { 49 | match objective { 50 | Objective::LogLoss => LogLoss::calc_init, 51 | Objective::SquaredLoss => SquaredLoss::calc_init, 52 | Objective::QuantileLoss => QuantileLoss::calc_init, 53 | Objective::AdaptiveHuberLoss => AdaptiveHuberLoss::calc_init, 54 | Objective::HuberLoss => HuberLoss::calc_init, 55 | } 56 | } 57 | 58 | pub trait ObjectiveFunction { 59 | fn calc_loss(y: &[f64], yhat: &[f64], sample_weight: Option<&[f64]>, quantile: Option<f64>) -> Vec<f32>; 60 | fn calc_grad_hess( 61 | y: &[f64], 62 | yhat: &[f64], 63 | sample_weight: Option<&[f64]>, 64 | quantile: Option<f64>, 65 | ) -> (Vec<f32>, Option<Vec<f32>>); 66 | fn calc_init(y: &[f64], sample_weight: Option<&[f64]>, quantile: Option<f64>) -> f64; 67 | fn default_metric() -> Metric; 68 | } 69 | 70 | #[cfg(test)] 71 | mod tests { 72 | use super::*; 73 | #[test] 74 | fn test_logloss_loss() { 75 | let y = vec![0.0, 0.0, 0.0, 1.0, 1.0, 1.0]; 76 | let yhat1 = vec![-1.0, -1.0, -1.0, 1.0, 1.0, 1.0]; 77 | let l1 = LogLoss::calc_loss(&y, &yhat1, None, None); 78 | let yhat2 = vec![0.0, 0.0, -1.0, 1.0, 0.0, 1.0]; 79 | let l2 = LogLoss::calc_loss(&y, &yhat2, None, None); 80 | assert!(l1.iter().sum::<f32>() < l2.iter().sum::<f32>()); 81 | } 82 | 83 | #[test] 84 | fn test_logloss_grad() { 85 | let y = vec![0.0, 0.0, 0.0, 1.0, 1.0, 1.0]; 86 | let yhat1 = vec![-1.0, -1.0, -1.0, 1.0, 1.0, 1.0]; 87 | let (g1, _) = LogLoss::calc_grad_hess(&y, &yhat1, None, None); 88 | let yhat2 = vec![0.0, 0.0, -1.0, 1.0, 0.0, 1.0]; 89 | let (g2, _) = LogLoss::calc_grad_hess(&y, &yhat2, None, None); 90 | assert!(g1.iter().sum::<f32>() < g2.iter().sum::<f32>()); 91 | } 92 | 93 | #[test] 94 | fn test_logloss_init() { 95 | let y = vec![0.0, 0.0, 0.0, 1.0, 1.0, 1.0]; 96 | let l1 = LogLoss::calc_init(&y, None, None); 97 | assert!(l1 == 0.); 98 | 99 | let y = vec![1.0; 6]; 100 | let l2 = LogLoss::calc_init(&y, None, None); 101 | assert!(l2 == f64::INFINITY); 102 | 103 | let y = vec![0.0; 6]; 104 | let l3 = LogLoss::calc_init(&y, None, None); 105 | assert!(l3 == f64::NEG_INFINITY); 106 | 107 | let y = vec![0., 0., 0., 0., 1., 1.]; 108 | let l4 = LogLoss::calc_init(&y, None, None); 109 | assert!(l4 == f64::ln(2. / 4.));
110 | } 111 | 112 | #[test] 113 | fn test_mse_init() { 114 | let y = vec![0.0, 0.0, 0.0, 1.0, 1.0, 1.0]; 115 | let l1 = SquaredLoss::calc_init(&y, None, None); 116 | assert!(l1 == 0.5); 117 | 118 | let y = vec![1.0, 1.0, 1.0, 1.0, 1.0, 1.0]; 119 | let l2 = SquaredLoss::calc_init(&y, None, None); 120 | assert!(l2 == 1.); 121 | 122 | let y = vec![-1.0, -1.0, -1.0, -1.0, -1.0, -1.0]; 123 | let l3 = SquaredLoss::calc_init(&y, None, None); 124 | assert!(l3 == -1.); 125 | 126 | let y = vec![-1.0, -1.0, -1.0, 1., 1., 1.]; 127 | let l4 = SquaredLoss::calc_init(&y, None, None); 128 | assert!(l4 == 0.); 129 | } 130 | 131 | #[test] 132 | fn test_quantile_init() { 133 | let y = vec![1.0, 2.0, 9.0, 3.2, 4.0]; 134 | let w = vec![0.0, 0.5, 1.0, 0.3, 0.5]; 135 | let l1 = QuantileLoss::calc_init(&y, Some(&w), Some(0.1)); 136 | println!("{}", l1); 137 | assert!(l1 == 2.0); 138 | 139 | let y = vec![1.0, 2.0, 9.0, 3.2, 4.0]; 140 | let w = vec![0.0, 0.5, 1.0, 0.3, 0.5]; 141 | let l2 = QuantileLoss::calc_init(&y, Some(&w), Some(0.9)); 142 | println!("{}", l2); 143 | assert!(l2 == 9.0); 144 | } 145 | 146 | #[test] 147 | fn test_adaptive_huberloss_loss() { 148 | let y = vec![0.0, 0.0, 0.0, 1.0, 1.0, 1.0]; 149 | let yhat1 = vec![-1.0, -1.0, -1.0, 1.0, 1.0, 1.0]; 150 | let l1 = AdaptiveHuberLoss::calc_loss(&y, &yhat1, None, Some(0.5)); 151 | let yhat2 = vec![0.0, 0.0, -1.0, 1.0, 0.0, 1.0]; 152 | let l2 = AdaptiveHuberLoss::calc_loss(&y, &yhat2, None, Some(0.5)); 153 | assert!(l1.iter().sum::<f32>() > l2.iter().sum::<f32>()); 154 | } 155 | 156 | #[test] 157 | fn test_adaptive_huberloss_grad() { 158 | let y = vec![0.0, 0.0, 0.0, 1.0, 1.0, 1.0]; 159 | let yhat1 = vec![-1.0, -1.0, -1.0, 1.0, 1.0, 1.0]; 160 | let (g1, _) = AdaptiveHuberLoss::calc_grad_hess(&y, &yhat1, None, Some(0.5)); 161 | let yhat2 = vec![0.0, 0.0, -1.0, 1.0, 0.0, 1.0]; 162 | let (g2, _) = AdaptiveHuberLoss::calc_grad_hess(&y, &yhat2, None, Some(0.5)); 163 | assert!(g1.iter().sum::<f32>() < g2.iter().sum::<f32>()); 164 | } 165 | } 166 |
-------------------------------------------------------------------------------- /src/objective_functions/quantile_loss.rs: -------------------------------------------------------------------------------- 1 | use super::ObjectiveFunction; 2 | use crate::metrics::Metric; 3 | 4 | #[derive(Default)] 5 | pub struct QuantileLoss {} 6 | 7 | impl ObjectiveFunction for QuantileLoss { 8 | #[inline] 9 | fn calc_loss(y: &[f64], yhat: &[f64], sample_weight: Option<&[f64]>, quantile: Option<f64>) -> Vec<f32> { 10 | match sample_weight { 11 | Some(sample_weight) => y 12 | .iter() 13 | .zip(yhat) 14 | .zip(sample_weight) 15 | .map(|((y_, yhat_), w_)| { 16 | let _quantile = quantile.unwrap(); 17 | let s = *y_ - *yhat_; 18 | let l = if s >= 0.0 { _quantile * s } else { (_quantile - 1.0) * s }; 19 | (l * *w_) as f32 20 | }) 21 | .collect(), 22 | None => y 23 | .iter() 24 | .zip(yhat) 25 | .map(|(y_, yhat_)| { 26 | let _quantile = quantile.unwrap(); 27 | let s = *y_ - *yhat_; 28 | let l = if s >= 0.0 { _quantile * s } else { (_quantile - 1.0) * s }; 29 | l as f32 30 | }) 31 | .collect(), 32 | } 33 | } 34 | 35 | fn calc_init(y: &[f64], sample_weight: Option<&[f64]>, quantile: Option<f64>) -> f64 { 36 | match sample_weight { 37 | Some(sample_weight) => { 38 | let mut indices = (0..y.len()).collect::<Vec<usize>>(); 39 | indices.sort_by(|&a, &b| y[a].total_cmp(&y[b])); 40 | let w_tot: f64 = sample_weight.iter().sum(); 41 | let w_target = w_tot * quantile.unwrap() as f64; 42 | let mut w_cum = 0.0_f64; 43 | let mut init_value = f64::NAN; 44 | for i in indices { 45 | w_cum += sample_weight[i]; 46 | if w_cum >= w_target { 47 | init_value = y[i]; 48 | break; 49 | } 50 | } 51 | init_value 52 | } 53 | None => { 54 | let mut indices = (0..y.len()).collect::<Vec<usize>>(); 55 | indices.sort_by(|&a, &b| y[a].total_cmp(&y[b])); 56 | let w_tot: f64 = y.len() as f64; 57 | let w_target = w_tot * quantile.unwrap() as f64; 58 | let mut w_cum = 0.0_f64; 59 | let mut init_value = f64::NAN; 60 | for i in indices { 61 | w_cum += 1.0; 62 | if w_cum >= w_target { 63 | init_value = y[i]; 64 | break; 65 | } 66 | } 67 | init_value 68 | } 69 | } 70 | } 71 | 72 | #[inline] 73 | fn calc_grad_hess( 74 | y: &[f64], 75 | yhat: &[f64], 76 | sample_weight: Option<&[f64]>, 77 | quantile: Option<f64>, 78 | ) -> (Vec<f32>, Option<Vec<f32>>) { 79 | match sample_weight { 80 | Some(sample_weight) => { 81 | let (g, h) = y 82 | .iter() 83 | .zip(yhat) 84 | .zip(sample_weight) 85 | .map(|((y_, yhat_), w_)| { 86 | let _quantile = quantile.unwrap(); 87 | let delta = yhat_ - *y_; 88 | let g = if delta >= 0.0 { 89 | (1.0 - _quantile) * w_ 90 | } else { 91 | -1.0 * _quantile * w_ 92 | }; 93 | (g as f32, *w_ as f32) 94 | }) 95 | .unzip(); 96 | (g, Some(h)) 97 | } 98 | None => { 99 | let g = y 100 | .iter() 101 | .zip(yhat) 102 | .map(|(y_, yhat_)| { 103 | let _quantile = quantile.unwrap(); 104 | let delta = yhat_ - *y_; 105 | let g = if delta >= 0.0 { 106 | 1.0 - _quantile 107 | } else { 108 | -1.0 * _quantile 109 | }; 110 | g as f32 111 | }) 112 | .collect(); 113 | (g, None) 114 | } 115 | } 116 | } 117 | 118 | fn default_metric() -> Metric { 119 | Metric::QuantileLoss 120 | } 121 | } 122 |
-------------------------------------------------------------------------------- /src/objective_functions/squared_loss.rs: -------------------------------------------------------------------------------- 1 | use super::ObjectiveFunction; 2 | use crate::{metrics::Metric, utils::fast_sum}; 3 | 4 | #[derive(Default)] 5 | pub struct SquaredLoss {} 6 | 7 | impl ObjectiveFunction for SquaredLoss { 8 | #[inline] 9 | fn calc_loss(y: &[f64], yhat: &[f64], sample_weight: Option<&[f64]>, _quantile: Option<f64>) -> Vec<f32> { 10 | match sample_weight { 11 | Some(sample_weight) => y 12 | .iter() 13 | .zip(yhat) 14 | .zip(sample_weight) 15 | .map(|((y_, yhat_), w_)| { 16 | let s = *y_ - *yhat_; 17 | (s * s * *w_) as f32 18 | }) 19 | .collect(), 20 | None => y 21 | .iter() 22 | .zip(yhat) 23 | .map(|(y_, yhat_)| { 24 | let s = *y_ - *yhat_; 25 | (s * s) as f32 26 | }) 27 | .collect(), 28 | } 29 | } 30 | 31 | fn calc_init(y: &[f64], sample_weight: Option<&[f64]>, _quantile: Option<f64>) -> f64 { 32 | match sample_weight { 33 | Some(sample_weight) => { 34 | let mut ytot: f64 = 0.; 35 | let mut ntot: f64 = 0.; 36 | for i in 0..y.len() { 37 | ytot += sample_weight[i] * y[i]; 38 | ntot += sample_weight[i]; 39 | } 40 | ytot / ntot 41 | } 42 | None => fast_sum(y) / y.len() as f64, 43 | } 44 | } 45 | 46 | #[inline] 47 | fn calc_grad_hess( 48 | y: &[f64], 49 | yhat: &[f64], 50 | sample_weight: Option<&[f64]>, 51 | _quantile: Option<f64>, 52 | ) -> (Vec<f32>, Option<Vec<f32>>) { 53 | match sample_weight { 54 | Some(sample_weight) => { 55 | let (g, h) = y 56 | .iter() 57 | .zip(yhat) 58 | .zip(sample_weight) 59 | .map(|((y_, yhat_), w_)| (((yhat_ - *y_) * *w_) as f32, *w_ as f32)) 60 | .unzip(); 61 | (g, Some(h)) 62 | } 63 | None => ( 64 | y.iter().zip(yhat).map(|(y_, yhat_)| (yhat_ - *y_) as f32).collect(), 65 | None, 66 | ), 67 | } 68 | } 69 | 70 | fn default_metric() -> Metric { 71 | Metric::RootMeanSquaredLogError 72 | } 73 | } 74 |
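The dispatch helpers in src/objective_functions/mod.rs above return plain function pointers, so an objective can be selected at runtime without trait objects. Below is a short sketch in the style of the crate's existing unit tests; it could sit in the tests module of mod.rs, uses only the items defined there, and the expected values follow directly from the implementations shown above.

#[test]
fn test_objective_dispatch() {
    let y = vec![1.0, 2.0, 3.0, 4.0, 5.0];
    // SquaredLoss initializes with the mean of the target.
    let init_mean = calc_init_callables(&Objective::SquaredLoss)(&y, None, None);
    assert!((init_mean - 3.0).abs() < 1e-12);
    // QuantileLoss initializes with the requested quantile (here the median).
    let init_median = calc_init_callables(&Objective::QuantileLoss)(&y, None, Some(0.5));
    assert!(init_median == 3.0);
    // At yhat == y the squared loss gradient is zero; its hessian is constant and returned as None.
    let (g, h) = gradient_hessian_callables(&Objective::SquaredLoss)(&y, &y, None, None);
    assert!(g.iter().all(|&gi| gi == 0.0));
    assert!(h.is_none());
}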
-------------------------------------------------------------------------------- /src/partial_dependence.rs: -------------------------------------------------------------------------------- 1 | use crate::{tree::tree::Tree, utils::is_missing}; 2 | 3 | /// Partial Dependence Calculator 4 | // struct PDCalculator { 5 | // partial_dependence: f32, 6 | // base_score: f64, 7 | // tree_prediction: f64, 8 | 9 | // } 10 | 11 | fn get_node_cover(tree: &Tree, node_idx: usize) -> f32 { 12 | tree.nodes[&node_idx].hessian_sum 13 | } 14 | 15 | pub fn tree_partial_dependence( 16 | tree: &Tree, 17 | node_idx: usize, 18 | feature: usize, 19 | value: f64, 20 | proportion: f32, 21 | missing: &f64, 22 | ) -> f64 { 23 | let n = &tree.nodes[&node_idx]; 24 | if n.is_leaf { 25 | f64::from(proportion * n.weight_value) 26 | } else if n.split_feature == feature { 27 | let child = if is_missing(&value, missing) { 28 | n.missing_node 29 | } else if value < n.split_value { 30 | n.left_child 31 | } else { 32 | n.right_child 33 | }; 34 | tree_partial_dependence(tree, child, feature, value, proportion, missing) 35 | } else { 36 | let left_cover = get_node_cover(tree, n.left_child); 37 | let right_cover = get_node_cover(tree, n.right_child); 38 | let missing_cover = if n.has_missing_branch() { 39 | get_node_cover(tree, n.missing_node) 40 | } else { 41 | 0.0 42 | }; 43 | let total_cover = left_cover + right_cover + missing_cover; 44 | let missing_pd = if n.has_missing_branch() { 45 | tree_partial_dependence( 46 | tree, 47 | n.missing_node, 48 | feature, 49 | value, 50 | proportion * (missing_cover / total_cover), 51 | missing, 52 | ) 53 | } else { 54 | 0. 55 | }; 56 | tree_partial_dependence( 57 | tree, 58 | n.left_child, 59 | feature, 60 | value, 61 | proportion * (left_cover / total_cover), 62 | missing, 63 | ) + tree_partial_dependence( 64 | tree, 65 | n.right_child, 66 | feature, 67 | value, 68 | proportion * (right_cover / total_cover), 69 | missing, 70 | ) + missing_pd 71 | } 72 | } 73 | 74 | #[cfg(test)] 75 | mod tests { 76 | 77 | use super::*; 78 | use crate::binning::bin_matrix; 79 | use crate::constraints::ConstraintMap; 80 | use crate::data::Matrix; 81 | use crate::histogram::{NodeHistogram, NodeHistogramOwned}; 82 | use crate::objective_functions::{LogLoss, ObjectiveFunction}; 83 | use crate::splitter::{MissingImputerSplitter, SplitInfo, SplitInfoSlice}; 84 | use crate::tree::tree::Tree; 85 | use std::fs; 86 | 87 | #[test] 88 | fn test_partial_dependence() { 89 | let is_const_hess = false; 90 | 91 | let file = 92 | fs::read_to_string("resources/contiguous_no_missing.csv").expect("Something went wrong reading the file"); 93 | let data_vec: Vec<f64> = file.lines().map(|x| x.parse::<f64>().unwrap()).collect(); 94 | let file = fs::read_to_string("resources/performance.csv").expect("Something went wrong reading the file"); 95 | let y: Vec<f64> = file.lines().map(|x| x.parse::<f64>().unwrap()).collect(); 96 | let yhat = vec![0.5; y.len()]; 97 | let (mut g, mut h) = LogLoss::calc_grad_hess(&y, &yhat, None, None); 98 | let loss = LogLoss::calc_loss(&y, &yhat, None, None); 99 | 100 | let data = Matrix::new(&data_vec, 891, 5); 101 | let splitter = MissingImputerSplitter::new(0.3, true, ConstraintMap::new()); 102 | let mut tree = Tree::new(); 103 | 104 | let b = bin_matrix(&data, None, 300, f64::NAN, None).unwrap(); 105 | let bdata = Matrix::new(&b.binned_data, data.rows, data.cols); 106 | let col_index: Vec<usize> = (0..data.cols).collect(); 107 | 108 | let n_nodes_alloc = 100; 109 | 110 | let mut hist_tree_owned: Vec<NodeHistogramOwned> = (0..n_nodes_alloc) 111 | .map(|_| NodeHistogramOwned::empty_from_cuts(&b.cuts, &col_index, is_const_hess, true)) 112 | .collect(); 113 | 114 | let mut hist_tree: Vec<NodeHistogram> = hist_tree_owned 115 | .iter_mut() 116 | .map(|node_hist| NodeHistogram::from_owned(node_hist)) 117 | .collect(); 118 | 119 | let pool = rayon::ThreadPoolBuilder::new().num_threads(2).build().unwrap(); 120 | 121 | let mut split_info_vec: Vec<SplitInfo> = (0..col_index.len()).map(|_| SplitInfo::default()).collect(); 122 | let split_info_slice = SplitInfoSlice::new(&mut split_info_vec); 123 | 124 | tree.fit( 125 | &bdata, 126 | data.index.to_owned(), 127 | &col_index, 128 | &mut g, 129 | h.as_deref_mut(), 130 | &splitter, 131 | &pool, 132 | Some(f32::MAX), 133 | &loss, 134 | &y, 135 | LogLoss::calc_loss, 136 | &yhat, 137 | None, 138 | None, 139 | false, 140 | &mut hist_tree, 141 | None, 142 | &split_info_slice, 143 | n_nodes_alloc, 144 | ); 145 | 146 | let pdp1 = tree_partial_dependence(&tree, 0, 0, 1.0, 1.0, &f64::NAN); 147 | let pdp2 = tree_partial_dependence(&tree, 0, 0, 2.0, 1.0, &f64::NAN); 148 | let pdp3 = tree_partial_dependence(&tree, 0, 0, 3.0, 1.0, &f64::NAN); 149 | println!("{}, {}, {}", pdp1, pdp2, pdp3); 150 | } 151 | } 152 |
-------------------------------------------------------------------------------- /src/sampler.rs: -------------------------------------------------------------------------------- 1 | use rand::rngs::StdRng; 2 | use rand::Rng; 3 | use serde::{Deserialize, Serialize}; 4 | 5 | #[derive(Serialize, Deserialize)] 6 | pub enum SampleMethod { 7 | None, 8 | Random, 9 | } 10 | 11 | // A sampler can be used to subset the data prior to fitting a new tree. 12 | pub trait Sampler { 13 | /// Sample the data, returning a tuple, where the first item is the samples 14 | /// chosen for training, and the second are the samples excluded. 15 | fn sample(&mut self, rng: &mut StdRng, index: &[usize]) -> (Vec<usize>, Vec<usize>); 16 | } 17 | 18 | pub struct RandomSampler { 19 | subsample: f32, 20 | } 21 | 22 | impl RandomSampler { 23 | #[allow(dead_code)] 24 | pub fn new(subsample: f32) -> Self { 25 | RandomSampler { subsample } 26 | } 27 | } 28 | 29 | impl Sampler for RandomSampler { 30 | fn sample(&mut self, rng: &mut StdRng, index: &[usize]) -> (Vec<usize>, Vec<usize>) { 31 | let subsample = self.subsample; 32 | let mut chosen = Vec::new(); 33 | let mut excluded = Vec::new(); 34 | for i in index { 35 | if rng.random_range(0.0..1.0) < subsample { 36 | chosen.push(*i); 37 | } else { 38 | excluded.push(*i) 39 | } 40 | } 41 | (chosen, excluded) 42 | } 43 | } 44 |
-------------------------------------------------------------------------------- /src/shapley.rs: -------------------------------------------------------------------------------- 1 | use crate::{node::Node, tree::tree::Tree}; 2 | 3 | #[derive(Debug, Clone, Copy)] 4 | struct PathElement { 5 | feature_index: usize, 6 | zero_fraction: f32, 7 | one_fraction: f32, 8 | pweight: f32, 9 | } 10 | 11 | impl Default for PathElement { 12 | fn default() -> Self { 13 | Self { 14 | feature_index: 0, 15 | zero_fraction: 0., 16 | one_fraction: 0., 17 | pweight: 0., 18 | } 19 | } 20 | } 21 | 22 | #[derive(Debug, Clone, Default)] 23 | struct PathList { 24 | paths: Vec<PathElement>, 25 | } 26 | 27 | impl PathList { 28 | fn get_element(&mut self, i: usize) -> &PathElement { 29 | if i == self.paths.len() { 30 | self.paths.push(PathElement::default()); 31 | &self.paths[i] 32 | } else { 33 | // This will panic for us, if we are out of bounds.
34 | &self.paths[i] 35 | } 36 | } 37 | fn get_element_mut(&mut self, i: usize) -> &mut PathElement { 38 | if i == self.paths.len() { 39 | self.paths.push(PathElement::default()); 40 | &mut self.paths[i] 41 | } else { 42 | // This will panic for us, if we are out of bounds. 43 | &mut self.paths[i] 44 | } 45 | } 46 | // fn with_capacity(capacity: usize) -> PathList { 47 | // PathList { 48 | // paths: Vec::with_capacity(capacity), 49 | // } 50 | // } 51 | // fn with_empty(l: usize) -> PathList { 52 | // PathList { 53 | // paths: vec![PathElement::default(); l], 54 | // } 55 | // } 56 | } 57 | 58 | fn extend_path( 59 | unique_path: &mut PathList, 60 | unique_depth: usize, 61 | zero_fraction: f32, 62 | one_fraction: f32, 63 | feature_index: usize, 64 | ) { 65 | unique_path.get_element_mut(unique_depth).feature_index = feature_index; 66 | unique_path.get_element_mut(unique_depth).zero_fraction = zero_fraction; 67 | unique_path.get_element_mut(unique_depth).one_fraction = one_fraction; 68 | unique_path.get_element_mut(unique_depth).pweight = if unique_depth == 0 { 1.0 } else { 0.0 }; 69 | for i in (0..unique_depth).rev() { 70 | unique_path.get_element_mut(i + 1).pweight += 71 | (one_fraction * unique_path.get_element(i).pweight * (i + 1) as f32) / (unique_depth + 1) as f32; 72 | unique_path.get_element_mut(i).pweight = 73 | (zero_fraction * unique_path.get_element(i).pweight * (unique_depth - i) as f32) 74 | / (unique_depth + 1) as f32; 75 | } 76 | } 77 | 78 | fn unwind_path(unique_path: &mut PathList, unique_depth: usize, path_index: usize) { 79 | let one_fraction = unique_path.get_element(path_index).one_fraction; 80 | let zero_fraction = unique_path.get_element(path_index).zero_fraction; 81 | let mut next_one_portion = unique_path.get_element(unique_depth).pweight; 82 | for i in (0..unique_depth).rev() { 83 | if one_fraction != 0. 
{ 84 | let tmp = unique_path.get_element(i).pweight; 85 | unique_path.get_element_mut(i).pweight = 86 | (next_one_portion * (unique_depth + 1) as f32) / ((i + 1) as f32 * one_fraction); 87 | next_one_portion = tmp 88 | - (unique_path.get_element(i).pweight * zero_fraction * (unique_depth - i) as f32) 89 | / (unique_depth + 1) as f32; 90 | } else { 91 | unique_path.get_element_mut(i).pweight = (unique_path.get_element(i).pweight * (unique_depth + 1) as f32) 92 | / (zero_fraction * (unique_depth - i) as f32); 93 | } 94 | } 95 | for i in path_index..unique_depth { 96 | unique_path.get_element_mut(i).feature_index = unique_path.get_element(i + 1).feature_index; 97 | unique_path.get_element_mut(i).zero_fraction = unique_path.get_element(i + 1).zero_fraction; 98 | unique_path.get_element_mut(i).one_fraction = unique_path.get_element(i + 1).one_fraction; 99 | } 100 | } 101 | 102 | fn unwound_path_sum(unique_path: &mut PathList, unique_depth: usize, path_index: usize) -> f32 { 103 | let one_fraction = unique_path.get_element(path_index).one_fraction; 104 | let zero_fraction = unique_path.get_element(path_index).zero_fraction; 105 | let mut next_one_portion = unique_path.get_element(unique_depth).pweight; 106 | let mut total = 0.0; 107 | for i in (0..unique_depth).rev() { 108 | if one_fraction != 0.0 { 109 | let tmp = (next_one_portion * (unique_depth + 1) as f32) / ((i + 1) as f32 * one_fraction); 110 | total += tmp; 111 | next_one_portion = unique_path.get_element(i).pweight 112 | - tmp * zero_fraction * ((unique_depth - i) as f32 / (unique_depth + 1) as f32); 113 | } else if zero_fraction != 0.0 { 114 | total += (unique_path.get_element(i).pweight / zero_fraction) 115 | / ((unique_depth - i) as f32 / (unique_depth + 1) as f32); 116 | } else if unique_path.get_element(i).pweight != 0.0 { 117 | panic!("Unique path {} must have zero weight", i); 118 | } 119 | } 120 | total 121 | } 122 | 123 | fn get_hot_cold_children(next_node_idx: usize, node: &Node) -> Vec { 124 | if node.has_missing_branch() { 125 | // we know there will be 3 children if there is a missing branch. 
126 | if next_node_idx == node.right_child { 127 | vec![node.right_child, node.left_child, node.missing_node] 128 | } else if next_node_idx == node.left_child { 129 | vec![node.left_child, node.right_child, node.missing_node] 130 | } else { 131 | vec![node.missing_node, node.left_child, node.right_child] 132 | } 133 | } else if next_node_idx == node.right_child { 134 | vec![node.right_child, node.left_child] 135 | } else { 136 | vec![node.left_child, node.right_child] 137 | } 138 | } 139 | 140 | #[allow(clippy::too_many_arguments)] 141 | fn tree_shap( 142 | tree: &Tree, 143 | row: &[f64], 144 | contribs: &mut [f64], 145 | node_index: usize, 146 | mut unique_depth: usize, 147 | mut unique_path: PathList, 148 | parent_zero_fraction: f32, 149 | parent_one_fraction: f32, 150 | parent_feature_index: usize, 151 | missing: &f64, 152 | ) { 153 | let node = &tree.nodes[&node_index]; 154 | extend_path( 155 | &mut unique_path, 156 | unique_depth, 157 | parent_zero_fraction, 158 | parent_one_fraction, 159 | parent_feature_index, 160 | ); 161 | if node.is_leaf { 162 | for i in 1..(unique_depth + 1) { 163 | let w = unwound_path_sum(&mut unique_path, unique_depth, i); 164 | let el = unique_path.get_element(i); 165 | contribs[el.feature_index] += f64::from(w * (el.one_fraction - el.zero_fraction) * node.weight_value); 166 | } 167 | } else { 168 | let next_node_idx = node.get_child_idx(&row[node.split_feature], missing); 169 | let hot_cold_children = get_hot_cold_children(next_node_idx, node); 170 | let mut incoming_zero_fraction = 1.0; 171 | let mut incoming_one_fraction = 1.0; 172 | 173 | let mut path_index = 0; 174 | while path_index <= unique_depth { 175 | if unique_path.get_element(path_index).feature_index == node.split_feature { 176 | break; 177 | } 178 | path_index += 1; 179 | } 180 | 181 | if path_index != (unique_depth + 1) { 182 | incoming_zero_fraction = unique_path.get_element(path_index).zero_fraction; 183 | incoming_one_fraction = unique_path.get_element(path_index).one_fraction; 184 | unwind_path(&mut unique_path, unique_depth, path_index); 185 | unique_depth -= 1; 186 | } 187 | 188 | for (i, n_idx) in hot_cold_children.into_iter().enumerate() { 189 | let zero_fraction = (tree.nodes[&n_idx].hessian_sum / node.hessian_sum) * incoming_zero_fraction; 190 | let onf = if i == 0 { incoming_one_fraction } else { 0. }; 191 | tree_shap( 192 | tree, 193 | row, 194 | contribs, 195 | n_idx, 196 | unique_depth + 1, 197 | unique_path.clone(), 198 | zero_fraction, 199 | onf, 200 | node.split_feature, 201 | missing, 202 | ) 203 | } 204 | } 205 | } 206 | 207 | pub fn predict_contributions_row_shapley(tree: &Tree, row: &[f64], contribs: &mut [f64], missing: &f64) { 208 | contribs[contribs.len() - 1] += tree.get_average_leaf_weights(0); 209 | tree_shap( 210 | tree, 211 | row, 212 | contribs, 213 | 0, 214 | 0, 215 | PathList::default(), 216 | 1., 217 | 1., 218 | row.len() + 100, 219 | missing, 220 | ) 221 | } 222 | -------------------------------------------------------------------------------- /src/tree/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod predict; 2 | pub mod tree; 3 | --------------------------------------------------------------------------------
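A note on how src/partial_dependence.rs above combines branches it cannot resolve: when a node splits on a feature other than the one being varied, the recursion descends into every child and weights each child's result by its share of the parent's hessian cover, so the partial dependence is a cover-weighted average of the reachable leaves. The same hessian_sum covers drive the zero_fraction terms in src/shapley.rs. Below is a self-contained sketch of that recursion on a hard-coded two-leaf tree; the MiniNode type and pd helper are illustrative stand-ins, not the crate's Node or tree_partial_dependence, and the missing branch is omitted for brevity.

// Hypothetical stand-ins for Tree/Node, just to illustrate the cover-weighted recursion.
struct MiniNode {
    is_leaf: bool,
    weight: f64,          // leaf value (weight_value)
    cover: f64,           // hessian_sum
    split_feature: usize,
    split_value: f64,
    left: usize,
    right: usize,
}

fn pd(nodes: &[MiniNode], idx: usize, feature: usize, value: f64, prop: f64) -> f64 {
    let n = &nodes[idx];
    if n.is_leaf {
        prop * n.weight
    } else if n.split_feature == feature {
        // The queried feature decides the path deterministically.
        let child = if value < n.split_value { n.left } else { n.right };
        pd(nodes, child, feature, value, prop)
    } else {
        // Unrelated split: average the children, weighted by their share of the cover.
        let (lc, rc) = (nodes[n.left].cover, nodes[n.right].cover);
        let total = lc + rc;
        pd(nodes, n.left, feature, value, prop * lc / total)
            + pd(nodes, n.right, feature, value, prop * rc / total)
    }
}

fn main() {
    // Depth-1 tree splitting on feature 0 at 0.5; leaf values -1.0 and 2.0 with covers 4 and 6.
    let nodes = vec![
        MiniNode { is_leaf: false, weight: 0.0, cover: 10.0, split_feature: 0, split_value: 0.5, left: 1, right: 2 },
        MiniNode { is_leaf: true, weight: -1.0, cover: 4.0, split_feature: 0, split_value: 0.0, left: 0, right: 0 },
        MiniNode { is_leaf: true, weight: 2.0, cover: 6.0, split_feature: 0, split_value: 0.0, left: 0, right: 0 },
    ];
    // Varying the split feature follows the split: value 0.0 goes left.
    assert_eq!(pd(&nodes, 0, 0, 0.0, 1.0), -1.0);
    // Varying any other feature gives the cover-weighted leaf average: 0.4 * -1.0 + 0.6 * 2.0 = 0.8.
    assert!((pd(&nodes, 0, 1, 0.0, 1.0) - 0.8).abs() < 1e-12);
}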