├── .github └── workflows │ ├── CI.yml │ ├── cargo-build-publish.yml │ ├── docs.yml │ └── release.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── Cargo.toml ├── LICENSE ├── README.md ├── benches └── perpetual_benchmarks.rs ├── examples ├── cal_housing.rs ├── cover_types.rs └── titanic.rs ├── python-package ├── .gitignore ├── Cargo.toml ├── docs │ └── index.md ├── examples │ ├── benchmark_lgbm.py │ ├── benchmark_perpetual.py │ ├── categorical_data.ipynb │ ├── categorical_data_diamonds.ipynb │ ├── categorical_data_titanic.ipynb │ ├── fetch_openml.ipynb │ ├── lgbm_openml_sensory.ipynb │ ├── openml.ipynb │ ├── openml_mnist.ipynb │ ├── performance_benchmark.ipynb │ ├── santander.ipynb │ └── toy_datasets.ipynb ├── mkdocs.yml ├── pyproject.toml ├── python │ └── perpetual │ │ ├── __init__.py │ │ ├── booster.py │ │ ├── data.py │ │ ├── serialize.py │ │ ├── types.py │ │ └── utils.py ├── src │ ├── booster.rs │ ├── lib.rs │ ├── multi_output.rs │ └── utils.rs ├── tests │ ├── test_booster.py │ ├── test_multi_output.py │ ├── test_save_load.py │ └── test_serialize.py └── uv.lock ├── resources └── perp_logo.png ├── rust-toolchain ├── rustfmt.toml ├── scripts ├── make_resources.py ├── remove-optional-deps.py ├── run-python-tests.ps1 ├── run-python-tests.sh ├── run-single-python-test.ps1 ├── uv_script.ps1 └── uv_script.sh └── src ├── bin.rs ├── binning.rs ├── booster ├── booster.rs ├── mod.rs ├── multi_output.rs ├── predict.rs └── setters.rs ├── conformal ├── cqr.rs └── mod.rs ├── constants.rs ├── constraints.rs ├── data.rs ├── errors.rs ├── grower.rs ├── histogram.rs ├── lib.rs ├── metrics ├── classification │ ├── metrics.rs │ └── mod.rs ├── mod.rs └── regression │ ├── metrics.rs │ └── mod.rs ├── node.rs ├── objective_functions ├── adaptive_huber_loss.rs ├── huber_loss.rs ├── log_loss.rs ├── mod.rs ├── quantile_loss.rs └── squared_loss.rs ├── partial_dependence.rs ├── prune.rs ├── sampler.rs ├── shapley.rs ├── splitter.rs ├── tree ├── mod.rs ├── predict.rs └── tree.rs └── utils.rs /.github/workflows/CI.yml: -------------------------------------------------------------------------------- 1 | name: Test and Deploy 2 | on: [pull_request] 3 | 4 | jobs: 5 | windows-build-test: 6 | strategy: 7 | matrix: 8 | pyversion: ["3.9", "3.10", "3.11", "3.12", "3.13"] 9 | runs-on: "windows-latest" 10 | steps: 11 | - uses: actions/checkout@v4 12 | - name: Install latests stable Rust 13 | uses: dtolnay/rust-toolchain@stable 14 | with: 15 | toolchain: stable 16 | - uses: actions/setup-python@v5 17 | with: 18 | python-version: ${{ matrix.pyversion }} 19 | architecture: x64 20 | - name: Install deps 21 | run: pip install numpy pandas seaborn scikit-learn toml 22 | - run: | 23 | cp README.md python-package/README.md 24 | cp LICENSE python-package/LICENSE 25 | - name: Build test data 26 | run: | 27 | cd python-package 28 | python -m pip install -e .[dev] 29 | cd .. 30 | python scripts/make_resources.py 31 | - name: Build wheels with maturin 32 | uses: PyO3/maturin-action@v1 33 | with: 34 | target: x86_64 35 | command: build 36 | args: --release --strip --interpreter python --manifest-path python-package/Cargo.toml --out dist --sdist 37 | - name: Install wheel 38 | run: pip install perpetual --no-index --find-links dist --no-deps --force-reinstall 39 | - name: Run Package Tests 40 | run: | 41 | pip install pytest pytest-cov black ruff setuptools --upgrade 42 | cd python-package 43 | ruff check . 44 | black --check . 45 | pytest --cov-fail-under=90 tests 46 | cd .. 
47 | - name: Save Artifacts 48 | uses: actions/upload-artifact@v4 49 | with: 50 | name: dist-windows-${{ matrix.pyversion }} 51 | path: dist 52 | 53 | macos-build-test: 54 | strategy: 55 | matrix: 56 | pyversion: ["3.11", "3.12", "3.13"] 57 | os: [macos-latest, macos-latest-large] 58 | runs-on: ${{ matrix.os }} 59 | steps: 60 | - uses: actions/checkout@v4 61 | - name: Install latest stable Rust 62 | uses: dtolnay/rust-toolchain@stable 63 | with: 64 | toolchain: stable 65 | - uses: actions/setup-python@v5 66 | with: 67 | python-version: ${{ matrix.pyversion }} 68 | - name: Install deps 69 | run: pip install numpy pandas seaborn scikit-learn toml 70 | - run: | 71 | cp README.md python-package/README.md 72 | cp LICENSE python-package/LICENSE 73 | - name: Build test data 74 | run: | 75 | cd python-package 76 | python -m pip install -e .[dev] 77 | cd .. 78 | python scripts/make_resources.py 79 | - name: Build wheels with maturin 80 | uses: PyO3/maturin-action@v1 81 | with: 82 | command: build 83 | args: --release --strip --interpreter python --manifest-path python-package/Cargo.toml --out dist --sdist 84 | - name: Install wheel 85 | run: pip install perpetual --no-index --find-links dist --no-deps --force-reinstall 86 | - name: Run Package Tests 87 | run: | 88 | pip install pytest pytest-cov black ruff setuptools --upgrade 89 | cd python-package 90 | ruff check . 91 | black --check . 92 | pytest --cov-fail-under=90 tests 93 | cd .. 94 | - name: Save Artifacts 95 | uses: actions/upload-artifact@v4 96 | with: 97 | name: dist-${{ matrix.os }}-${{ matrix.pyversion }} 98 | path: dist 99 | 100 | linux-build-test: 101 | runs-on: ubuntu-latest 102 | strategy: 103 | matrix: 104 | pyversion: ["3.9", "3.10", "3.11", "3.12", "3.13"] 105 | steps: 106 | - uses: actions/checkout@v4 107 | - name: Install latests stable Rust 108 | uses: dtolnay/rust-toolchain@stable 109 | with: 110 | toolchain: stable 111 | - uses: actions/setup-python@v5 112 | with: 113 | python-version: ${{ matrix.pyversion }} 114 | architecture: x64 115 | - name: Install deps 116 | run: pip install numpy pandas seaborn scikit-learn toml 117 | - run: | 118 | cp README.md python-package/README.md 119 | cp LICENSE python-package/LICENSE 120 | - name: Build test data 121 | run: | 122 | cd python-package 123 | python -m pip install -e .[dev] 124 | cd .. 125 | python scripts/make_resources.py 126 | - name: Build wheels with maturin 127 | uses: PyO3/maturin-action@v1 128 | with: 129 | target: x86_64 130 | manylinux: auto 131 | command: build 132 | args: --release --strip --interpreter python${{ matrix.pyversion }} --manifest-path python-package/Cargo.toml --out dist --sdist 133 | - name: Install wheel 134 | run: pip install perpetual --no-index --find-links dist --no-deps --force-reinstall 135 | - name: Run Package Tests 136 | run: | 137 | pip install pytest pytest-cov black ruff setuptools --upgrade 138 | cd python-package 139 | ruff check . 140 | black --check . 141 | pytest --cov-fail-under=90 tests 142 | cd .. 
143 | - name: Save Artifacts 144 | uses: actions/upload-artifact@v4 145 | with: 146 | name: dist-linux-${{ matrix.pyversion }} 147 | path: dist 148 | 149 | linux-arm-build-test: 150 | runs-on: ubuntu-24.04-arm 151 | strategy: 152 | matrix: 153 | pyversion: ["3.9", "3.10", "3.11", "3.12", "3.13"] 154 | steps: 155 | - uses: actions/checkout@v4 156 | - name: Install latests stable Rust 157 | uses: dtolnay/rust-toolchain@stable 158 | with: 159 | toolchain: stable 160 | - uses: actions/setup-python@v5 161 | with: 162 | python-version: ${{ matrix.pyversion }} 163 | architecture: arm64 164 | - name: Install deps 165 | run: pip install numpy pandas seaborn scikit-learn toml 166 | - run: | 167 | cp README.md python-package/README.md 168 | cp LICENSE python-package/LICENSE 169 | - name: Build test data 170 | run: | 171 | cd python-package 172 | python -m pip install -e .[dev] 173 | cd .. 174 | python scripts/make_resources.py 175 | - name: Build wheels with maturin 176 | uses: PyO3/maturin-action@v1 177 | with: 178 | manylinux: auto 179 | command: build 180 | args: --release --strip --interpreter python${{ matrix.pyversion }} --manifest-path python-package/Cargo.toml --out dist --sdist 181 | - name: Install wheel 182 | run: pip install perpetual --no-index --find-links dist --no-deps --force-reinstall 183 | - name: Run Package Tests 184 | run: | 185 | pip install pytest pytest-cov black ruff setuptools --upgrade 186 | cd python-package 187 | ruff check . 188 | black --check . 189 | pytest --cov-fail-under=90 tests 190 | cd .. 191 | - name: Save Artifacts 192 | uses: actions/upload-artifact@v4 193 | with: 194 | name: dist-linux-arm-${{ matrix.pyversion }} 195 | path: dist 196 | 197 | cargo-build-test: 198 | runs-on: ubuntu-latest 199 | steps: 200 | - uses: actions/checkout@v4 201 | - name: Install latest stable Rust 202 | uses: dtolnay/rust-toolchain@stable 203 | with: 204 | toolchain: stable 205 | - uses: actions/setup-python@v5 206 | with: 207 | python-version: "3.11" 208 | architecture: x64 209 | - name: Install deps 210 | run: pip install numpy pandas seaborn scikit-learn toml 211 | - run: | 212 | cp README.md python-package/README.md 213 | cp LICENSE python-package/LICENSE 214 | - name: Build test data 215 | run: | 216 | cd python-package 217 | python -m pip install -e .[dev] 218 | cd .. 
219 | python scripts/make_resources.py 220 | - name: Run tests 221 | run: cargo test --verbose 222 | -------------------------------------------------------------------------------- /.github/workflows/cargo-build-publish.yml: -------------------------------------------------------------------------------- 1 | name: Cargo Build Publish 2 | on: [workflow_dispatch] 3 | 4 | jobs: 5 | cargo-build-test: 6 | runs-on: ubuntu-latest 7 | steps: 8 | - uses: actions/checkout@v3 9 | - name: Install latests stable Rust 10 | uses: dtolnay/rust-toolchain@stable 11 | with: 12 | toolchain: stable 13 | - uses: actions/setup-python@v4 14 | with: 15 | python-version: "3.10" 16 | architecture: x64 17 | - name: Install deps 18 | run: pip install numpy pandas seaborn scikit-learn toml 19 | - run: | 20 | cp README.md python-package/README.md 21 | cp LICENSE python-package/LICENSE 22 | - name: Update TOML 23 | run: python scripts/remove-optional-deps.py 24 | - name: Build test data 25 | run: python scripts/make_resources.py 26 | - name: Run tests 27 | run: cargo test --verbose 28 | - name: Publish Crate 29 | run: cargo publish --token ${CRATES_TOKEN} --allow-dirty 30 | env: 31 | CRATES_TOKEN: ${{ secrets.CRATES_TOKEN }} 32 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: docs 2 | on: 3 | release: 4 | types: [published] 5 | 6 | permissions: 7 | contents: write 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v3 13 | - uses: actions/setup-python@v4 14 | with: 15 | python-version: 3.x 16 | - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV 17 | - uses: actions/cache@v3 18 | with: 19 | key: mkdocs-material-${{ env.cache_id }} 20 | path: .cache 21 | restore-keys: | 22 | mkdocs-material- 23 | - run: | 24 | cp README.md python-package/README.md 25 | cp LICENSE python-package/LICENSE 26 | - run: pip install mkdocs-material 27 | - run: | 28 | cd python-package 29 | pip install .[dev] 30 | mkdocs gh-deploy --force 31 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | on: 3 | release: 4 | types: [published] 5 | 6 | jobs: 7 | windows-build: 8 | strategy: 9 | matrix: 10 | pyversion: ["3.9", "3.10", "3.11", "3.12", "3.13"] 11 | runs-on: "windows-latest" 12 | steps: 13 | - uses: actions/checkout@v4 14 | - name: Install latests stable Rust 15 | uses: dtolnay/rust-toolchain@stable 16 | with: 17 | toolchain: stable 18 | - uses: actions/setup-python@v5 19 | with: 20 | python-version: ${{ matrix.pyversion }} 21 | architecture: x64 22 | - name: Install deps 23 | run: pip install numpy pandas seaborn scikit-learn toml 24 | - run: | 25 | cp README.md python-package/README.md 26 | cp LICENSE python-package/LICENSE 27 | - name: Build wheels with maturin 28 | uses: PyO3/maturin-action@v1 29 | with: 30 | target: x86_64 31 | command: build 32 | args: --release --strip --interpreter python --manifest-path python-package/Cargo.toml --out dist --sdist 33 | - name: Save Artifacts 34 | uses: actions/upload-artifact@v4 35 | with: 36 | name: dist-windows-${{ matrix.pyversion }} 37 | path: dist 38 | 39 | macos-build: 40 | strategy: 41 | matrix: 42 | pyversion: ["3.11", "3.12", "3.13"] 43 | os: [macos-latest, macos-latest-large] 44 | runs-on: ${{ matrix.os }} 45 | steps: 46 | - uses: 
actions/checkout@v4 47 | - name: Install latest stable Rust 48 | uses: dtolnay/rust-toolchain@stable 49 | with: 50 | toolchain: stable 51 | - uses: actions/setup-python@v5 52 | with: 53 | python-version: ${{ matrix.pyversion }} 54 | - name: Install deps 55 | run: pip install numpy pandas seaborn scikit-learn toml 56 | - run: | 57 | cp README.md python-package/README.md 58 | cp LICENSE python-package/LICENSE 59 | - name: Build wheels with maturin 60 | uses: PyO3/maturin-action@v1 61 | with: 62 | command: build 63 | args: --release --strip --interpreter python --manifest-path python-package/Cargo.toml --out dist --sdist 64 | - name: Save Artifacts 65 | uses: actions/upload-artifact@v4 66 | with: 67 | name: dist-${{ matrix.os }}-${{ matrix.pyversion }} 68 | path: dist 69 | 70 | linux-build: 71 | runs-on: ubuntu-latest 72 | strategy: 73 | matrix: 74 | pyversion: ["3.9", "3.10", "3.11", "3.12", "3.13"] 75 | steps: 76 | - uses: actions/checkout@v4 77 | - name: Install latests stable Rust 78 | uses: dtolnay/rust-toolchain@stable 79 | with: 80 | toolchain: stable 81 | - uses: actions/setup-python@v5 82 | with: 83 | python-version: ${{ matrix.pyversion }} 84 | architecture: x64 85 | - name: Install deps 86 | run: pip install numpy pandas seaborn scikit-learn toml 87 | - run: | 88 | cp README.md python-package/README.md 89 | cp LICENSE python-package/LICENSE 90 | - name: Build wheels with maturin 91 | uses: PyO3/maturin-action@v1 92 | with: 93 | target: x86_64 94 | manylinux: auto 95 | command: build 96 | args: --release --strip --interpreter python${{ matrix.pyversion }} --manifest-path python-package/Cargo.toml --out dist --sdist 97 | - name: Save Artifacts 98 | uses: actions/upload-artifact@v4 99 | with: 100 | name: dist-linux-${{ matrix.pyversion }} 101 | path: dist 102 | 103 | linux-arm-build: 104 | runs-on: ubuntu-24.04-arm 105 | strategy: 106 | matrix: 107 | pyversion: ["3.9", "3.10", "3.11", "3.12", "3.13"] 108 | steps: 109 | - uses: actions/checkout@v4 110 | - name: Install latests stable Rust 111 | uses: dtolnay/rust-toolchain@stable 112 | with: 113 | toolchain: stable 114 | - uses: actions/setup-python@v5 115 | with: 116 | python-version: ${{ matrix.pyversion }} 117 | architecture: arm64 118 | - name: Install deps 119 | run: pip install numpy pandas seaborn scikit-learn toml 120 | - run: | 121 | cp README.md python-package/README.md 122 | cp LICENSE python-package/LICENSE 123 | - name: Build wheels with maturin 124 | uses: PyO3/maturin-action@v1 125 | with: 126 | manylinux: auto 127 | command: build 128 | args: --release --strip --interpreter python${{ matrix.pyversion }} --manifest-path python-package/Cargo.toml --out dist --sdist 129 | - name: Save Artifacts 130 | uses: actions/upload-artifact@v4 131 | with: 132 | name: dist-linux-arm-${{ matrix.pyversion }} 133 | path: dist 134 | 135 | cargo-publish: 136 | runs-on: ubuntu-latest 137 | steps: 138 | - uses: actions/checkout@v4 139 | - name: Install latest stable Rust 140 | uses: dtolnay/rust-toolchain@stable 141 | with: 142 | toolchain: stable 143 | - uses: actions/setup-python@v5 144 | with: 145 | python-version: "3.11" 146 | architecture: x64 147 | - name: Install deps 148 | run: pip install numpy pandas seaborn scikit-learn toml 149 | - run: | 150 | cp README.md python-package/README.md 151 | cp LICENSE python-package/LICENSE 152 | - name: Publish Crate 153 | run: cargo publish --token ${CRATES_TOKEN} --allow-dirty 154 | env: 155 | CRATES_TOKEN: ${{ secrets.CRATES_TOKEN }} 156 | 157 | pypi-publish: 158 | runs-on: ubuntu-latest 159 | 
needs: ["windows-build", "macos-build", "linux-build", "linux-arm-build"] 160 | environment: 161 | name: Test and Deploy 162 | url: https://pypi.org/p/perpetual 163 | permissions: 164 | id-token: write 165 | steps: 166 | - name: Retrieve release distributions 167 | uses: actions/download-artifact@v4 168 | with: 169 | pattern: dist-* 170 | merge-multiple: true 171 | path: dist 172 | - name: Publish release distributions to PyPI 173 | uses: pypa/gh-action-pypi-publish@release/v1 174 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | Cargo.lock 3 | perpetual/__pycache__ 4 | .vscode/ 5 | .venv 6 | resources/* 7 | !resources/perp_logo.png 8 | python-package/Cargo.lock 9 | python-package/LICENSE 10 | python-package/README.md 11 | python-package/target 12 | python-package/python/perpetual/__pycache__ -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/doublify/pre-commit-rust 3 | rev: "v1.0" 4 | hooks: 5 | - id: fmt 6 | - id: cargo-check 7 | - id: clippy 8 | - repo: https://github.com/pycqa/isort 9 | rev: "5.12.0" 10 | hooks: 11 | - id: isort 12 | - repo: https://github.com/psf/black 13 | rev: "22.6.0" 14 | hooks: 15 | - id: black 16 | - repo: https://github.com/astral-sh/ruff-pre-commit 17 | # Ruff version. 18 | rev: v0.0.277 19 | hooks: 20 | - id: ruff -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to `perpetual` 2 | 3 | ## Development Setup 4 | 5 | For development, it is assumed you have stable rust installed, and at least python 3.9. Additionally, in your python environment, you will need to install [`maturin`](https://github.com/PyO3/maturin). 6 | 7 | ### To run Rust tests 8 | 9 | The rust tests depend on some artifacts that are generated from a python script. You can either follow the steps in the python tests section, or run the following in an environment that is running python. 10 | 11 | ```sh 12 | cd python-package 13 | # Install the project in editable mode and all development dependencies 14 | python -m pip install -e .[dev] 15 | # You can now return to the root directory and run the tests... 16 | cd .. 17 | python -m pip install pandas seaborn 18 | python scripts/make_resources.py 19 | ``` 20 | 21 | If you have rust and the cargo package manager installed, all you need to do to run the rust tests is run the following command in the root of the repository. 22 | 23 | ```sh 24 | cargo test 25 | ``` 26 | 27 | ### To run the python tests 28 | 29 | Prior to running the tests, you should install `python-package` in editable mode. To do this, from the project root directory you can run the following. 30 | 31 | ```sh 32 | cd python-package 33 | # Install the project in editable mode and all development dependencies 34 | python -m pip install -e .[dev] 35 | # You can now return to the root directory and run the tests... 36 | cd .. 37 | 38 | # Prior to running the tests, build all required test artifacts 39 | python scripts/make_resources.py 40 | 41 | # Now you can run the tests. 42 | # on Linux... 43 | source scripts/run-python-tests.sh 44 | ``` 45 | 46 | The test script can also be run from powershell.
47 | 48 | ```powershell 49 | # on Windows (powershell) 50 | .\scripts\run-python-tests.ps1 51 | ``` 52 | 53 | This script builds the package in release mode, installs it, and then runs the tests. Because of this, it is useful to run this whenever you want to test out a change in the python package. 54 | 55 | ## Benchmarking 56 | 57 | Benchmarking is run using the [`criterion`](https://github.com/bheisler/criterion.rs) Rust crate. 58 | To run the benchmarks, you can run the following command from your terminal. 59 | 60 | ```sh 61 | cargo bench 62 | ``` 63 | 64 | Specific benchmarks can be targeted by referring to them by name. 65 | 66 | ```sh 67 | cargo bench "fast sum" 68 | ``` 69 | 70 | ## Pre-commit 71 | 72 | The [`pre-commit`](https://pre-commit.com/) framework should be installed and used to ensure all commits meet the required formatting and linting checks prior to a commit being made to the repository. 73 | 74 | ```sh 75 | # Install pre-commit, either right in your default python install 76 | # or using a tool such as pipx (https://pypa.github.io/pipx/) 77 | python -m pip install pre-commit 78 | 79 | # In the root of the repository 80 | pre-commit install 81 | ``` 82 | 83 | ## Serialization 84 | 85 | The saving and loading of the model is all handled by the [`serde`](https://docs.rs/serde/1.0.163/serde/) and [`serde_json`](https://docs.rs/serde_json/latest/serde_json/) crates. 86 | 87 | Because of this, you will see the following attribute sprinkled throughout the package. 88 | 89 | ```rust 90 | #[derive(Deserialize, Serialize)] 91 | ``` 92 | 93 | Additionally, in order to not break backwards compatibility with models saved in previous versions, any new fields added to the `Tree` or `PerpetualBooster` structs should have a default value defined. This way models can be loaded, even if they were saved before the new field was added. 94 | A default value can be added for a field using the `#[serde(default = "default_sample_method")]` attribute, where the string that `default` refers to must be the name of a valid function. The following is a complete example of this. 95 | 96 | ```rust 97 | use crate::sampler::{SampleMethod, Sampler}; 98 | 99 | #[derive(Deserialize, Serialize)] 100 | pub struct PerpetualBooster { 101 | // ... 102 | #[serde(default = "default_sample_method")] 103 | pub sample_method: SampleMethod, 104 | // ...
105 | } 106 | 107 | fn default_sample_method() -> SampleMethod { 108 | SampleMethod::None 109 | } 110 | ``` 111 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "perpetual" 3 | version = "0.9.3" 4 | edition = "2021" 5 | authors = ["Mutlu Simsek "] 6 | homepage = "https://perpetual-ml.com" 7 | description = "A self-generalizing gradient boosting machine that doesn't need hyperparameter optimization" 8 | license-file = "LICENSE" 9 | readme = "README.md" 10 | repository = "https://github.com/perpetual-ml/perpetual" 11 | 12 | keywords = ["machine-learning", "perpetual", "ai", "ml"] 13 | categories = ["algorithms", "mathematics", "science"] 14 | 15 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 16 | [profile.release] 17 | lto = 'fat' 18 | codegen-units = 1 19 | #debug = true # due to flamegraph 20 | #strip = false # due to flamegraph 21 | 22 | [dependencies] 23 | rayon = "1.10.0" 24 | thiserror = "2.0.12" 25 | serde_json = { version = "1.0.140", features = ["float_roundtrip"] } 26 | serde = { version = "1.0.219", features = ["derive"] } 27 | approx = "0.5.1" 28 | log = "0.4.27" 29 | rand = "0.9.0" 30 | sysinfo = "0.33.1" 31 | 32 | [dev-dependencies] 33 | criterion = "0.5.1" 34 | polars = "0.41" 35 | reqwest = { version = "0.12.14", features = ["blocking"] } 36 | csv = "1.3.1" 37 | chrono = "0.4.40" 38 | 39 | [[bench]] 40 | name = "perpetual_benchmarks" 41 | harness = false 42 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |

4 | 5 |
6 | 7 | [![Python Versions](https://img.shields.io/pypi/pyversions/perpetual.svg?logo=python&logoColor=white)](https://pypi.org/project/perpetual) 8 | [![PyPI Version](https://img.shields.io/pypi/v/perpetual.svg?logo=pypi&logoColor=white)](https://pypi.org/project/perpetual) 9 | [![Crates.io Version](https://img.shields.io/crates/v/perpetual?logo=rust&logoColor=white)](https://crates.io/crates/perpetual) 10 | [![Static Badge](https://img.shields.io/badge/join-discord-blue?logo=discord)](https://discord.gg/AyUK7rr6wy) 11 | ![PyPI - Downloads](https://img.shields.io/pypi/dm/perpetual) 12 | 13 | 14 |
15 | 16 | # Perpetual 17 | 18 | PerpetualBooster is a gradient boosting machine (GBM) algorithm that doesn't need hyperparameter optimization unlike other GBM algorithms. Similar to AutoML libraries, it has a `budget` parameter. Increasing the `budget` parameter increases the predictive power of the algorithm and gives better results on unseen data. Start with a small budget (e.g. 0.5) and increase it (e.g. 1.0) once you are confident with your features. If you don't see any improvement with further increasing the `budget`, it means that you are already extracting the most predictive power out of your data. 19 | 20 | ## Usage 21 | 22 | You can use the algorithm like in the example below. Check examples folders for both Rust and Python. 23 | 24 | ```python 25 | from perpetual import PerpetualBooster 26 | 27 | model = PerpetualBooster(objective="SquaredLoss", budget=0.5) 28 | model.fit(X, y) 29 | ``` 30 | 31 | ## Documentation 32 | 33 | Documentation for the Python API can be found [here](https://perpetual-ml.github.io/perpetual) and for the Rust API [here](https://docs.rs/perpetual/latest/perpetual/). 34 | 35 | 36 | ## Benchmark 37 | 38 | ### PerpetualBooster vs. Optuna + LightGBM 39 | 40 | Hyperparameter optimization usually takes 100 iterations with plain GBM algorithms. PerpetualBooster achieves the same accuracy in a single run. Thus, it achieves up to 100x speed-up at the same accuracy with different `budget` levels and with different datasets. 41 | 42 | The following table summarizes the results for the [California Housing](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html) dataset (regression): 43 | 44 | | Perpetual budget | LightGBM n_estimators | Perpetual mse | LightGBM mse | Speed-up wall time | Speed-up cpu time | 45 | | ---------------- | --------------------- | ------------- | ------------ | ------------------ | ----------------- | 46 | | 1.0 | 100 | 0.192 | 0.192 | 54x | 56x | 47 | | 1.5 | 300 | 0.188 | 0.188 | 59x | 58x | 48 | | 2.1 | 1000 | 0.185 | 0.186 | 42x | 41x | 49 | 50 | The following table summarizes the results for the [Cover Types](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_covtype.html) dataset (classification): 51 | 52 | | Perpetual budget | LightGBM n_estimators | Perpetual log loss | LightGBM log loss | Speed-up wall time | Speed-up cpu time | 53 | | ---------------- | --------------------- | ------------------ | ----------------- | ------------------ | ----------------- | 54 | | 0.9 | 100 | 0.091 | 0.084 | 72x | 78x | 55 | 56 | The results can be reproduced using the scripts in the [examples](./python-package/examples) folder. 57 | 58 | ### PerpetualBooster vs. AutoGluon 59 | 60 | PerpetualBooster is a GBM but behaves like AutoML so it is benchmarked also against AutoGluon (v1.2, best quality preset), the current leader in [AutoML benchmark](https://automlbenchmark.streamlit.app/cd_diagram). Top 10 datasets with the most number of rows are selected from [OpenML datasets](https://www.openml.org/) for both regression and classification tasks. 
61 | 62 | The results are summarized in the following table for regression tasks: 63 | 64 | | OpenML Task | Perpetual Training Duration | Perpetual Inference Duration | Perpetual RMSE | AutoGluon Training Duration | AutoGluon Inference Duration | AutoGluon RMSE | 65 | | -------------------------------------------------------- | ----- | ----- | ------------------- | -------- | ------ | ------------------ | 66 | | [Airlines_DepDelay_10M](https://www.openml.org/t/359929) | 518 | 11.3 | 29.0 | 520 | 30.9 | 28.8 | 67 | | [bates_regr_100](https://www.openml.org/t/361940) | 3421 | 15.1 | 1.084 | OOM | OOM | OOM | 68 | | [BNG(libras_move)](https://www.openml.org/t/7327) | 1956 | 4.2 | 2.51 | 1922 | 97.6 | 2.53 | 69 | | [BNG(satellite_image)](https://www.openml.org/t/7326) | 334 | 1.6 | 0.731 | 337 | 10.0 | 0.721 | 70 | | [COMET_MC](https://www.openml.org/t/14949) | 44 | 1.0 | 0.0615 | 47 | 5.0 | 0.0662 | 71 | | [friedman1](https://www.openml.org/t/361939) | 275 | 4.2 | 1.047 | 278 | 5.1 | 1.487 | 72 | | [poker](https://www.openml.org/t/10102) | 38 | 0.6 | 0.256 | 41 | 1.2 | 0.722 | 73 | | [subset_higgs](https://www.openml.org/t/361955) | 868 | 10.6 | 0.420 | 870 | 24.5 | 0.421 | 74 | | [BNG(autoHorse)](https://www.openml.org/t/7319) | 107 | 1.1 | 19.0 | 107 | 3.2 | 20.5 | 75 | | [BNG(pbc)](https://www.openml.org/t/7318) | 48 | 0.6 | 836.5 | 51 | 0.2 | 957.1 | 76 | | average | 465 | 3.9 | - | 464 | 19.7 | - | 77 | 78 | PerpetualBooster outperformed AutoGluon on 8 out of 10 regression tasks, training equally fast and inferring 5.1x faster. 79 | 80 | The results are summarized in the following table for classification tasks: 81 | 82 | | OpenML Task | Perpetual Training Duration | Perpetual Inference Duration | Perpetual AUC | AutoGluon Training Duration | AutoGluon Inference Duration | AutoGluon AUC | 83 | | -------------------------------------------------------- | ------- | ------ | ------------------- | -------- | ------ | ------------------ | 84 | | [BNG(spambase)](https://www.openml.org/t/146163) | 70.1 | 2.1 | 0.671 | 73.1 | 3.7 | 0.669 | 85 | | [BNG(trains)](https://www.openml.org/t/208) | 89.5 | 1.7 | 0.996 | 106.4 | 2.4 | 0.994 | 86 | | [breast](https://www.openml.org/t/361942) | 13699.3 | 97.7 | 0.991 | 13330.7 | 79.7 | 0.949 | 87 | | [Click_prediction_small](https://www.openml.org/t/7291) | 89.1 | 1.0 | 0.749 | 101.0 | 2.8 | 0.703 | 88 | | [colon](https://www.openml.org/t/361938) | 12435.2 | 126.7 | 0.997 | 12356.2 | 152.3 | 0.997 | 89 | | [Higgs](https://www.openml.org/t/362113) | 3485.3 | 40.9 | 0.843 | 3501.4 | 67.9 | 0.816 | 90 | | [SEA(50000)](https://www.openml.org/t/230) | 21.9 | 0.2 | 0.936 | 25.6 | 0.5 | 0.935 | 91 | | [sf-police-incidents](https://www.openml.org/t/359994) | 85.8 | 1.5 | 0.687 | 99.4 | 2.8 | 0.659 | 92 | | [bates_classif_100](https://www.openml.org/t/361941) | 11152.8 | 50.0 | 0.864 | OOM | OOM | OOM | 93 | | [prostate](https://www.openml.org/t/361945) | 13699.9 | 79.8 | 0.987 | OOM | OOM | OOM | 94 | | average | 3747.0 | 34.0 | - | 3699.2 | 39.0 | - | 95 | 96 | PerpetualBooster outperformed AutoGluon on 10 out of 10 classification tasks, training equally fast and inferring 1.1x faster. 97 | 98 | PerpetualBooster demonstrates greater robustness compared to AutoGluon, successfully training on all 20 tasks, whereas AutoGluon encountered out-of-memory errors on 3 of those tasks. 99 | 100 | The results can be reproduced using the automlbenchmark fork [here](https://github.com/deadsoul44/automlbenchmark). 
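For the Rust API, the snippet below is a minimal sketch that mirrors the Python example in the Usage section above, adapted from `examples/titanic.rs`. The inline data and the `budget` value are illustrative placeholders only, used to keep the sketch self-contained; they are not a recommended configuration.

```rust
use perpetual::{objective_functions::Objective, Matrix, PerpetualBooster};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Tiny stand-in data: two feature columns stored column-major, four rows.
    let data: Vec<f64> = vec![1.0, 2.0, 3.0, 4.0, 10.0, 20.0, 30.0, 40.0];
    let y: Vec<f64> = vec![0.0, 1.0, 0.0, 1.0];

    // Matrix::new takes the flat column-major values, the row count, and the column count.
    let matrix = Matrix::new(&data, y.len(), 2);

    // Configure the booster through the `set_` methods and fit with the chosen budget.
    let mut model = PerpetualBooster::default()
        .set_objective(Objective::LogLoss)
        .set_budget(0.5);
    model.fit(&matrix, &y, None)?;

    println!("{:?}", model.predict(&matrix, true));
    Ok(())
}
```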
101 | 102 | 103 | 104 | ## Installation 105 | 106 | The package can be installed directly from [pypi](https://pypi.org/project/perpetual): 107 | 108 | ```shell 109 | pip install perpetual 110 | ``` 111 | 112 | Using [conda-forge](https://anaconda.org/conda-forge/perpetual): 113 | 114 | ```shell 115 | conda install conda-forge::perpetual 116 | ``` 117 | 118 | To use in a Rust project and to get the package from [crates.io](https://crates.io/crates/perpetual): 119 | 120 | ```shell 121 | cargo add perpetual 122 | ``` 123 | 124 | ## Contribution 125 | 126 | Contributions are welcome. Check CONTRIBUTING.md for the guideline. 127 | 128 | ## Paper 129 | 130 | PerpetualBooster prevents overfitting with a generalization algorithm. The paper is work-in-progress to explain how the algorithm works. Check our [blog post](https://perpetual-ml.com/blog/how-perpetual-works) for a high level introduction to the algorithm. 131 | -------------------------------------------------------------------------------- /benches/perpetual_benchmarks.rs: -------------------------------------------------------------------------------- 1 | use criterion::{black_box, criterion_group, criterion_main, Criterion}; 2 | use perpetual::binning::bin_matrix; 3 | use perpetual::constraints::ConstraintMap; 4 | use perpetual::data::Matrix; 5 | use perpetual::histogram::{NodeHistogram, NodeHistogramOwned}; 6 | use perpetual::objective_functions::{LogLoss, ObjectiveFunction}; 7 | use perpetual::splitter::{MissingImputerSplitter, SplitInfo, SplitInfoSlice}; 8 | use perpetual::tree::tree::Tree; 9 | use perpetual::utils::{fast_f64_sum, fast_sum, naive_sum}; 10 | use perpetual::PerpetualBooster; 11 | use std::fs; 12 | use std::time::Duration; 13 | 14 | pub fn tree_benchmarks(c: &mut Criterion) { 15 | let file = fs::read_to_string("resources/contiguous_no_missing_100k_samp_seed0.csv") 16 | .expect("Something went wrong reading the file"); 17 | let data_vec: Vec = file.lines().map(|x| x.parse::().unwrap()).collect(); 18 | let file = 19 | fs::read_to_string("resources/performance_100k_samp_seed0.csv").expect("Something went wrong reading the file"); 20 | let y: Vec = file.lines().map(|x| x.parse::().unwrap()).collect(); 21 | let yhat = vec![0.5; y.len()]; 22 | let (mut g, mut h) = LogLoss::calc_grad_hess(&y, &yhat, None, None); 23 | let loss = LogLoss::calc_loss(&y, &yhat, None, None); 24 | 25 | let v: Vec = vec![10.; 300000]; 26 | c.bench_function("Niave Sum", |b| b.iter(|| naive_sum(black_box(&v)))); 27 | c.bench_function("fast sum", |b| b.iter(|| fast_sum(black_box(&v)))); 28 | c.bench_function("fast f64 sum", |b| b.iter(|| fast_f64_sum(black_box(&v)))); 29 | 30 | c.bench_function("calc_grad_hess", |b| { 31 | b.iter(|| LogLoss::calc_grad_hess(black_box(&y), black_box(&yhat), black_box(None), black_box(None))) 32 | }); 33 | 34 | let data = Matrix::new(&data_vec, y.len(), 5); 35 | let splitter = MissingImputerSplitter::new(0.3, true, ConstraintMap::new()); 36 | let mut tree = Tree::new(); 37 | 38 | let bindata = bin_matrix(&data, None, 300, f64::NAN, None).unwrap(); 39 | let bdata = Matrix::new(&bindata.binned_data, data.rows, data.cols); 40 | let col_index: Vec = (0..data.cols).collect(); 41 | 42 | let n_nodes_alloc = 100; 43 | 44 | let mut hist_tree_owned: Vec = (0..n_nodes_alloc) 45 | .map(|_| NodeHistogramOwned::empty_from_cuts(&bindata.cuts, &col_index, false, true)) 46 | .collect(); 47 | 48 | let mut hist_tree: Vec = hist_tree_owned 49 | .iter_mut() 50 | .map(|node_hist| NodeHistogram::from_owned(node_hist)) 51 | .collect(); 52 | 53 | 
let pool = rayon::ThreadPoolBuilder::new().num_threads(2).build().unwrap(); 54 | 55 | let mut split_info_vec: Vec = (0..col_index.len()).map(|_| SplitInfo::default()).collect(); 56 | let split_info_slice = SplitInfoSlice::new(&mut split_info_vec); 57 | 58 | tree.fit( 59 | &bdata, 60 | data.index.to_owned(), 61 | &col_index, 62 | &mut g, 63 | h.as_deref_mut(), 64 | &splitter, 65 | &pool, 66 | Some(f32::MAX), 67 | &loss, 68 | &y, 69 | LogLoss::calc_loss, 70 | &yhat, 71 | None, 72 | None, 73 | false, 74 | &mut hist_tree, 75 | None, 76 | &split_info_slice, 77 | n_nodes_alloc, 78 | ); 79 | 80 | println!("{}", tree.nodes.len()); 81 | c.bench_function("Train Tree", |b| { 82 | b.iter(|| { 83 | let mut train_tree: Tree = Tree::new(); 84 | 85 | train_tree.fit( 86 | black_box(&bdata), 87 | black_box(data.index.to_owned()), 88 | black_box(&col_index), 89 | black_box(&mut g), 90 | black_box(h.as_deref_mut()), 91 | black_box(&splitter), 92 | black_box(&pool), 93 | Some(f32::MAX), 94 | black_box(&loss), 95 | black_box(&y), 96 | black_box(LogLoss::calc_loss), 97 | black_box(&yhat), 98 | None, 99 | None, 100 | false, 101 | black_box(&mut hist_tree), 102 | None, 103 | black_box(&split_info_slice), 104 | n_nodes_alloc, 105 | ); 106 | }) 107 | }); 108 | c.bench_function("Train Tree - column subset", |b| { 109 | b.iter(|| { 110 | let mut train_tree: Tree = Tree::new(); 111 | 112 | train_tree.fit( 113 | black_box(&bdata), 114 | black_box(data.index.to_owned()), 115 | black_box(&[1, 3, 4]), 116 | black_box(&mut g), 117 | black_box(h.as_deref_mut()), 118 | black_box(&splitter), 119 | black_box(&pool), 120 | Some(f32::MAX), 121 | black_box(&loss), 122 | black_box(&y), 123 | black_box(LogLoss::calc_loss), 124 | black_box(&yhat), 125 | None, 126 | None, 127 | false, 128 | black_box(&mut hist_tree), 129 | None, 130 | black_box(&split_info_slice), 131 | n_nodes_alloc, 132 | ); 133 | }) 134 | }); 135 | c.bench_function("Tree Predict (Single Threaded)", |b| { 136 | b.iter(|| tree.predict(black_box(&data), black_box(false), black_box(&f64::NAN))) 137 | }); 138 | c.bench_function("Tree Predict (Multi Threaded)", |b| { 139 | b.iter(|| tree.predict(black_box(&data), black_box(true), black_box(&f64::NAN))) 140 | }); 141 | 142 | // Gradient Booster 143 | // Bench building 144 | let mut booster_train = c.benchmark_group("train_booster"); 145 | booster_train.warm_up_time(Duration::from_secs(10)); 146 | booster_train.sample_size(50); 147 | // booster_train.sampling_mode(SamplingMode::Linear); 148 | booster_train.bench_function("train_booster_default", |b| { 149 | b.iter(|| { 150 | let mut booster = PerpetualBooster::default().set_budget(0.3); 151 | booster.fit(black_box(&data), black_box(&y), black_box(None)).unwrap(); 152 | }) 153 | }); 154 | booster_train.bench_function("train_booster_with_column_sampling", |b| { 155 | b.iter(|| { 156 | let mut booster = PerpetualBooster::default().set_budget(0.3); 157 | booster.fit(black_box(&data), black_box(&y), black_box(None)).unwrap(); 158 | }) 159 | }); 160 | let mut booster = PerpetualBooster::default().set_budget(0.1); 161 | booster.fit(&data, &y, None).unwrap(); 162 | booster_train.bench_function("Predict Booster", |b| { 163 | b.iter(|| booster.predict(black_box(&data), false)) 164 | }); 165 | } 166 | 167 | criterion_group!(benches, tree_benchmarks); 168 | criterion_main!(benches); 169 | -------------------------------------------------------------------------------- /examples/cal_housing.rs: -------------------------------------------------------------------------------- 1 | //! 
An example using the `california housing` dataset 2 | 3 | // cargo run --release --example cal_housing 1.0 1 4 | 5 | // cargo build --release --example cal_housing 6 | // hyperfine --runs 3 ./target/release/examples/cal_housing 7 | // hyperfine --runs 3 .\target\release\examples\cal_housing 8 | // hyperfine --runs 11 'cargo run --release --example cal_housing 0.1 0.3 2' 9 | // hyperfine --runs 11 'cargo run --release --example cal_housing 2.0' 10 | 11 | // cargo flamegraph --example cal_housing 12 | 13 | use perpetual::{objective_functions::Objective, Matrix, PerpetualBooster}; 14 | use polars::prelude::*; 15 | use std::env; 16 | use std::error::Error; 17 | use std::time::SystemTime; 18 | 19 | pub fn mse(y_test: &[f64], y_pred: &[f64]) -> f32 { 20 | let mut error = 0.0; 21 | for i in 0..y_test.len() { 22 | error += (y_test[i] - y_pred[i]) * (y_test[i] - y_pred[i]); 23 | } 24 | let e = error / y_test.len() as f64; 25 | e as f32 26 | } 27 | 28 | fn main() -> Result<(), Box> { 29 | let args: Vec = env::args().collect(); 30 | let budget = &args[1].parse::().unwrap_or(1.0); 31 | let num_threads = &args[2].parse::().unwrap_or(1); 32 | 33 | let all_names = [ 34 | "MedInc".to_string(), 35 | "HouseAge".to_string(), 36 | "AveRooms".to_string(), 37 | "AveBedrms".to_string(), 38 | "Population".to_string(), 39 | "AveOccup".to_string(), 40 | "Latitude".to_string(), 41 | "Longitude".to_string(), 42 | "MedHouseVal".to_string(), 43 | ]; 44 | 45 | let feature_names = [ 46 | "MedInc".to_string(), 47 | "HouseAge".to_string(), 48 | "AveRooms".to_string(), 49 | "AveBedrms".to_string(), 50 | "Population".to_string(), 51 | "AveOccup".to_string(), 52 | "Latitude".to_string(), 53 | "Longitude".to_string(), 54 | ]; 55 | 56 | let column_names_train = Arc::new(all_names.clone()); 57 | let column_names_test = Arc::new(all_names.clone()); 58 | 59 | let df_train = CsvReadOptions::default() 60 | .with_has_header(true) 61 | .with_columns(Some(column_names_train)) 62 | .try_into_reader_with_file_path(Some("resources/cal_housing_train.csv".into()))? 63 | .finish() 64 | .unwrap(); 65 | 66 | let df_test = CsvReadOptions::default() 67 | .with_has_header(true) 68 | .with_columns(Some(column_names_test)) 69 | .try_into_reader_with_file_path(Some("resources/cal_housing_test.csv".into()))? 70 | .finish() 71 | .unwrap(); 72 | 73 | // Get data in column major format... 74 | let id_vars_train: Vec<&str> = Vec::new(); 75 | let mdf_train = df_train.unpivot(feature_names.clone(), &id_vars_train)?; 76 | let id_vars_test: Vec<&str> = Vec::new(); 77 | let mdf_test = df_test.unpivot(feature_names, &id_vars_test)?; 78 | 79 | let data_train = Vec::from_iter( 80 | mdf_train 81 | .select_at_idx(1) 82 | .expect("Invalid column") 83 | .f64()? 84 | .into_iter() 85 | .map(|v| v.unwrap_or(f64::NAN)), 86 | ); 87 | let data_test = Vec::from_iter( 88 | mdf_test 89 | .select_at_idx(1) 90 | .expect("Invalid column") 91 | .f64()? 92 | .into_iter() 93 | .map(|v| v.unwrap_or(f64::NAN)), 94 | ); 95 | 96 | let y_train = Vec::from_iter( 97 | df_train 98 | .column("MedHouseVal")? 99 | .cast(&DataType::Float64)? 100 | .f64()? 101 | .into_iter() 102 | .map(|v| v.unwrap_or(f64::NAN)), 103 | ); 104 | let y_test = Vec::from_iter( 105 | df_test 106 | .column("MedHouseVal")? 107 | .cast(&DataType::Float64)? 108 | .f64()? 109 | .into_iter() 110 | .map(|v| v.unwrap_or(f64::NAN)), 111 | ); 112 | 113 | // Create Matrix from ndarray. 
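// Note: the flattened vectors built above are column-major (one feature column after another), so Matrix::new receives the values, the row count (length of the target vector), and the column count (8 features here).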
114 | let matrix_train = Matrix::new(&data_train, y_train.len(), 8); 115 | let matrix_test = Matrix::new(&data_test, y_test.len(), 8); 116 | 117 | // Create booster. 118 | // To provide parameters generate a default booster, and then use 119 | // the relevant `set_` methods for any parameters you would like to 120 | // adjust. 121 | let mut model = PerpetualBooster::default() 122 | .set_objective(Objective::SquaredLoss) 123 | .set_num_threads(Some(*num_threads)) 124 | .set_budget(*budget); 125 | 126 | let now = SystemTime::now(); 127 | model.fit(&matrix_train, &y_train, None)?; 128 | println!("now.elapsed: {:?}", now.elapsed().unwrap().as_secs_f32()); 129 | 130 | let trees = model.get_prediction_trees(); 131 | println!("n_rounds: {:?}", trees.len()); 132 | 133 | let n_leaves: usize = trees.iter().map(|t| (t.nodes.len() + 1) / 2).sum(); 134 | println!("n_leaves: {:?}", n_leaves); 135 | 136 | let y_pred = model.predict(&matrix_train, true); 137 | let error = mse(&y_train, &y_pred); 138 | println!("mse_train: {:?}", error); 139 | 140 | let y_pred = model.predict(&matrix_test, true); 141 | let error = mse(&y_test, &y_pred); 142 | println!("mse_test: {:?}", error); 143 | 144 | println!("tree:"); 145 | for t in trees { 146 | println!("{}", t); 147 | } 148 | 149 | Ok(()) 150 | } 151 | -------------------------------------------------------------------------------- /examples/cover_types.rs: -------------------------------------------------------------------------------- 1 | //! An example using the `cover types` dataset 2 | 3 | // cargo run --release --example cover_types 1.0 4 | 5 | // cargo build --release --example cover_types 6 | // hyperfine --runs 3 ./target/release/examples/cover_types 7 | // hyperfine --runs 3 .\target\release\examples\cover_types 1.0 8 | // hyperfine --runs 3 'cargo run --release --example cover_types 1.0' 9 | 10 | // cargo flamegraph --example cover_types 11 | 12 | use perpetual::{objective_functions::Objective, Matrix, PerpetualBooster}; 13 | use polars::prelude::*; 14 | use std::env; 15 | use std::error::Error; 16 | 17 | pub fn mse(y_test: &[f64], y_pred: &[f64]) -> f32 { 18 | let mut error = 0.0; 19 | for i in 0..y_test.len() { 20 | error += (y_test[i] - y_pred[i]) * (y_test[i] - y_pred[i]); 21 | } 22 | let e = error / y_test.len() as f64; 23 | e as f32 24 | } 25 | 26 | pub fn multiclass_log_loss(y_true: &[f64], y_pred: &[Vec]) -> f64 { 27 | let mut losses = vec![0.0; y_true.len()]; 28 | let eps = 1e-11; 29 | for (i, y_p) in y_pred.iter().enumerate() { 30 | let y_p_exp = y_p.iter().map(|e| e.exp()).collect::>(); 31 | let y_p_exp_sum = y_p_exp.iter().sum::(); 32 | let probabilities = y_p_exp.iter().map(|e| e / y_p_exp_sum).collect::>(); 33 | let cls_idx = (y_true[i] - 1.0) as usize; 34 | let p = f64::max(eps, f64::min(1.0 - eps, probabilities[cls_idx])); 35 | losses[i] = -1.0 * p.ln(); 36 | } 37 | losses.iter().sum::() / losses.len() as f64 38 | } 39 | 40 | fn main() -> Result<(), Box> { 41 | let args: Vec = env::args().collect(); 42 | let budget = &args[1].parse::().unwrap_or(1.0); 43 | 44 | let mut features: Vec<&str> = [ 45 | "Elevation", 46 | "Aspect", 47 | "Slope", 48 | "Horizontal_Distance_To_Hydrology", 49 | "Vertical_Distance_To_Hydrology", 50 | "Horizontal_Distance_To_Roadways", 51 | "Hillshade_9am", 52 | "Hillshade_Noon", 53 | "Hillshade_3pm", 54 | "Horizontal_Distance_To_Fire_Points", 55 | "Wilderness_Area_0", 56 | "Wilderness_Area_1", 57 | "Wilderness_Area_2", 58 | "Wilderness_Area_3", 59 | ] 60 | .to_vec(); 61 | 62 | let soil_types = (0..40).map(|i| 
format!("{}_{}", "Soil_Type", i)).collect::>(); 63 | let s_types = soil_types.iter().map(|s| s.as_str()).collect::>(); 64 | features.extend(s_types); 65 | 66 | let mut features_and_target = features.clone(); 67 | features_and_target.push("Cover_Type"); 68 | 69 | let features_and_target_arc1 = features_and_target 70 | .iter() 71 | .map(|s| String::from(s.to_owned())) 72 | .collect::>() 73 | .into(); 74 | 75 | let features_and_target_arc2 = features_and_target 76 | .iter() 77 | .map(|s| String::from(s.to_owned())) 78 | .collect::>() 79 | .into(); 80 | 81 | let df_train = CsvReadOptions::default() 82 | .with_has_header(true) 83 | .with_columns(Some(features_and_target_arc1)) 84 | .try_into_reader_with_file_path(Some("resources/cover_types_train.csv".into()))? 85 | .finish() 86 | .unwrap(); 87 | 88 | let df_test = CsvReadOptions::default() 89 | .with_has_header(true) 90 | .with_columns(Some(features_and_target_arc2)) 91 | .try_into_reader_with_file_path(Some("resources/cover_types_test.csv".into()))? 92 | .finish() 93 | .unwrap(); 94 | 95 | // Get data in column major format... 96 | let id_vars_train: Vec<&str> = Vec::new(); 97 | let mdf_train = df_train.unpivot(&features, &id_vars_train)?; 98 | let id_vars_test: Vec<&str> = Vec::new(); 99 | let mdf_test = df_test.unpivot(&features, &id_vars_test)?; 100 | 101 | let data_train = Vec::from_iter( 102 | mdf_train 103 | .select_at_idx(1) 104 | .expect("Invalid column") 105 | .f64()? 106 | .into_iter() 107 | .map(|v| v.unwrap_or(f64::NAN)), 108 | ); 109 | let data_test = Vec::from_iter( 110 | mdf_test 111 | .select_at_idx(1) 112 | .expect("Invalid column") 113 | .f64()? 114 | .into_iter() 115 | .map(|v| v.unwrap_or(f64::NAN)), 116 | ); 117 | 118 | let y_train = Vec::from_iter( 119 | df_train 120 | .column("Cover_Type")? 121 | .cast(&DataType::Float64)? 122 | .f64()? 123 | .into_iter() 124 | .map(|v| v.unwrap_or(f64::NAN)), 125 | ); 126 | let y_test = Vec::from_iter( 127 | df_test 128 | .column("Cover_Type")? 129 | .cast(&DataType::Float64)? 130 | .f64()? 131 | .into_iter() 132 | .map(|v| v.unwrap_or(f64::NAN)), 133 | ); 134 | 135 | // Create Matrix from ndarray. 
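// 54 columns = 10 numeric features + 4 wilderness-area indicators + 40 one-hot soil types.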
136 | let matrix_train = Matrix::new(&data_train, y_train.len(), 54); 137 | let matrix_test = Matrix::new(&data_test, y_test.len(), 54); 138 | 139 | let mut raw_train_array = vec![vec![0.0; 7]; y_train.len()]; 140 | let mut raw_test_array = vec![vec![0.0; 7]; y_test.len()]; 141 | for i in 1..8 { 142 | println!(); 143 | 144 | let mut model = PerpetualBooster::default() 145 | .set_objective(Objective::LogLoss) 146 | .set_budget(*budget); 147 | 148 | let y_tr: Vec = y_train 149 | .iter() 150 | .map(|y| if (*y as i32) == i { 1.0 } else { 0.0 }) 151 | .collect(); 152 | 153 | model.fit(&matrix_train, &y_tr, None)?; 154 | println!("Completed fitting model number: {}", i); 155 | 156 | let trees = model.get_prediction_trees(); 157 | println!("n_rounds: {:?}", trees.len()); 158 | 159 | let n_leaves: usize = trees.iter().map(|t| (t.nodes.len() + 1) / 2).sum(); 160 | println!("n_leaves: {:?}", n_leaves); 161 | 162 | let y_pred_train = model.predict(&matrix_train, true); 163 | let y_pred_test = model.predict(&matrix_test, true); 164 | 165 | raw_train_array 166 | .iter_mut() 167 | .enumerate() 168 | .for_each(|(idx, raw)| raw[(i - 1) as usize] = y_pred_train[idx]); 169 | raw_test_array 170 | .iter_mut() 171 | .enumerate() 172 | .for_each(|(idx, raw)| raw[(i - 1) as usize] = y_pred_test[idx]); 173 | } 174 | 175 | let loss_train = multiclass_log_loss(&y_train, &raw_train_array); 176 | let loss_test = multiclass_log_loss(&y_test, &raw_test_array); 177 | 178 | println!("loss_train: {}", loss_train); 179 | println!("loss_test: {}", loss_test); 180 | 181 | Ok(()) 182 | } 183 | -------------------------------------------------------------------------------- /examples/titanic.rs: -------------------------------------------------------------------------------- 1 | //! An example using the `titanic` dataset 2 | use perpetual::objective_functions::Objective; 3 | use perpetual::{Matrix, PerpetualBooster}; 4 | use polars::prelude::*; 5 | use std::env; 6 | use std::error::Error; 7 | 8 | fn main() -> Result<(), Box> { 9 | let args: Vec = env::args().collect(); 10 | let budget = &args[1].parse::().unwrap(); 11 | 12 | let features_and_target = ["survived", "pclass", "age", "sibsp", "parch", "fare"]; 13 | 14 | let features_and_target_arc = features_and_target 15 | .iter() 16 | .map(|s| String::from(s.to_owned())) 17 | .collect::>() 18 | .into(); 19 | 20 | let df = CsvReadOptions::default() 21 | .with_has_header(true) 22 | .with_columns(Some(features_and_target_arc)) 23 | .try_into_reader_with_file_path(Some("resources/titanic.csv".into()))? 24 | .finish() 25 | .unwrap(); 26 | 27 | // Get data in column major format... 28 | let id_vars: Vec<&str> = Vec::new(); 29 | let mdf = df.unpivot(["pclass", "age", "sibsp", "parch", "fare"], id_vars)?; 30 | 31 | let data = Vec::from_iter( 32 | mdf.select_at_idx(1) 33 | .expect("Invalid column") 34 | .f64()? 35 | .into_iter() 36 | .map(|v| v.unwrap_or(f64::NAN)), 37 | ); 38 | let y = Vec::from_iter( 39 | df.column("survived")? 40 | .cast(&DataType::Float64)? 41 | .f64()? 42 | .into_iter() 43 | .map(|v| v.unwrap_or(f64::NAN)), 44 | ); 45 | 46 | // Create Matrix from ndarray. 47 | let matrix = Matrix::new(&data, y.len(), 5); 48 | 49 | // Create booster. 50 | // To provide parameters generate a default booster, and then use 51 | // the relevant `set_` methods for any parameters you would like to 52 | // adjust. 
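// For example, `.set_num_threads(...)` is chained this way in examples/cal_housing.rs.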
53 | let mut model = PerpetualBooster::default() 54 | .set_objective(Objective::LogLoss) 55 | .set_budget(*budget); 56 | model.fit(&matrix, &y, None)?; 57 | 58 | println!("Model prediction: {:?} ...", &model.predict(&matrix, true)[0..10]); 59 | 60 | Ok(()) 61 | } 62 | -------------------------------------------------------------------------------- /python-package/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | .pytest_cache/ 6 | *.py[cod] 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | .venv/ 14 | env/ 15 | bin/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | include/ 26 | man/ 27 | venv/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | pip-selfcheck.json 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | 45 | # Translations 46 | *.mo 47 | 48 | # Mr Developer 49 | .mr.developer.cfg 50 | .project 51 | .pydevproject 52 | 53 | # Rope 54 | .ropeproject 55 | 56 | # Django stuff: 57 | *.log 58 | *.pot 59 | 60 | .DS_Store 61 | 62 | # Sphinx documentation 63 | docs/_build/ 64 | 65 | # PyCharm 66 | .idea/ 67 | 68 | # VSCode 69 | .vscode/ 70 | 71 | # Pyenv 72 | .python-version -------------------------------------------------------------------------------- /python-package/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "py-perpetual" 3 | version = "0.9.3" 4 | edition = "2021" 5 | authors = ["Mutlu Simsek "] 6 | homepage = "https://perpetual-ml.com" 7 | description = "A self-generalizing gradient boosting machine that doesn't need hyperparameter optimization" 8 | license-file = "LICENSE" 9 | readme = "README.md" 10 | repository = "https://github.com/perpetual-ml/perpetual" 11 | 12 | keywords = ["machine-learning", "perpetual", "ai", "ml"] 13 | categories = ["algorithms", "mathematics", "science"] 14 | 15 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 16 | [lib] 17 | name = "perpetual" 18 | crate-type = ["cdylib", "rlib"] 19 | 20 | [dependencies] 21 | pyo3 = { version = "0.24.1", features = ["extension-module"] } 22 | perpetual_rs = {package="perpetual", version = "0.9.3", path = "../" } 23 | numpy = "0.24.0" 24 | ndarray = "0.16.1" 25 | serde_plain = { version = "1.0.2" } 26 | serde = { version = "1.0.219" } 27 | pyo3-log = "0.12.3" 28 | -------------------------------------------------------------------------------- /python-package/docs/index.md: -------------------------------------------------------------------------------- 1 | # Perpetual 2 | 3 | ## Python API Reference 4 | 5 | PyPI - Version 6 | 7 | Crates.io Version 8 | 9 | The `PerpetualBooster` class is currently the only public facing class in the package, and can be used to train gradient boosted decision tree ensembles with multiple objective functions. 10 | 11 | ::: perpetual.PerpetualBooster 12 | 13 | ## Logging output 14 | 15 | Info is logged while the model is being trained if the `log_iterations` parameter is set to a value greater than `0` while fitting the booster. The logs can be printed to stdout while training like so. 
16 | 17 | ```python 18 | import logging 19 | logging.basicConfig() 20 | logging.getLogger().setLevel(logging.INFO) 21 | 22 | model = PerpetualBooster(log_iterations=1) 23 | model.fit(X, y) 24 | 25 | # INFO:perpetual.perpetualbooster:Completed iteration 0 of 10 26 | # INFO:perpetual.perpetualbooster:Completed iteration 1 of 10 27 | # INFO:perpetual.perpetualbooster:Completed iteration 2 of 10 28 | ``` 29 | 30 | The log output can also be captured in a file also using the `logging.basicConfig()` `filename` option. 31 | 32 | ```python 33 | import logging 34 | logging.basicConfig(filename="training-info.log") 35 | logging.getLogger().setLevel(logging.INFO) 36 | 37 | model = PerpetualBooster(log_iterations=10) 38 | model.fit(X, y) 39 | ``` 40 | -------------------------------------------------------------------------------- /python-package/examples/benchmark_lgbm.py: -------------------------------------------------------------------------------- 1 | import optuna 2 | import numpy as np 3 | from time import process_time, time 4 | from functools import partial 5 | from lightgbm import LGBMRegressor, LGBMClassifier 6 | from sklearn.metrics import mean_squared_error, log_loss 7 | from sklearn.datasets import fetch_covtype, fetch_california_housing 8 | from sklearn.model_selection import train_test_split, cross_validate 9 | 10 | 11 | def prepare_data(cal_housing, seed): 12 | if cal_housing: 13 | data, target = fetch_california_housing(return_X_y=True, as_frame=True) 14 | scoring = "neg_mean_squared_error" 15 | metric_function = mean_squared_error 16 | metric_name = "mse" 17 | LGBMBooster = LGBMRegressor 18 | else: 19 | data, target = fetch_covtype(return_X_y=True, as_frame=True) 20 | scoring = "neg_log_loss" 21 | metric_function = log_loss 22 | metric_name = "log_loss" 23 | LGBMBooster = LGBMClassifier 24 | X_train, X_test, y_train, y_test = train_test_split( 25 | data, target, test_size=0.2248, random_state=seed 26 | ) 27 | return ( 28 | X_train, 29 | X_test, 30 | y_train, 31 | y_test, 32 | scoring, 33 | metric_function, 34 | metric_name, 35 | LGBMBooster, 36 | ) 37 | 38 | 39 | best_cv_results = None 40 | cv_results = None 41 | 42 | 43 | def save_best_cv_results(study, trial): 44 | global best_cv_results 45 | if study.best_trial.number == trial.number: 46 | best_cv_results = cv_results 47 | 48 | 49 | def objective_function( 50 | trial, seed, n_estimators, LGBMBooster, X_train, y_train, scoring 51 | ): 52 | global cv_results 53 | params = { 54 | "seed": seed, 55 | "verbosity": -1, 56 | "n_estimators": n_estimators, 57 | "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.5, log=True), 58 | "min_split_gain": trial.suggest_float("min_split_gain", 1e-6, 1.0, log=True), 59 | "reg_alpha": trial.suggest_float("reg_alpha", 1e-6, 1.0, log=True), 60 | "reg_lambda": trial.suggest_float("reg_lambda", 1e-6, 1.0, log=True), 61 | "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0), 62 | "subsample": trial.suggest_float("subsample", 0.2, 1.0), 63 | "subsample_freq": trial.suggest_int("subsample_freq", 1, 10), 64 | "max_depth": trial.suggest_int("max_depth", 3, 33), 65 | "num_leaves": trial.suggest_int("num_leaves", 2, 1024), 66 | "min_child_samples": trial.suggest_int("min_child_samples", 1, 100), 67 | } 68 | model = LGBMBooster(**params) 69 | cv_results = cross_validate( 70 | model, 71 | X_train, 72 | y_train, 73 | cv=5, 74 | scoring=scoring, 75 | return_train_score=True, 76 | return_estimator=True, 77 | ) 78 | return -1 * np.mean(cv_results["test_score"]) 79 | 80 | 81 | if __name__ 
== "__main__": 82 | optuna.logging.set_verbosity(optuna.logging.WARNING) 83 | cal_housing = True # True -> California Housing, False -> Cover Types 84 | n_estimators = 100 85 | n_trials = 100 86 | cpu_times = [] 87 | wall_times = [] 88 | metrics = [] 89 | 90 | for seed in range(5): 91 | ( 92 | X_train, 93 | X_test, 94 | y_train, 95 | y_test, 96 | scoring, 97 | metric_function, 98 | metric_name, 99 | LGBMBooster, 100 | ) = prepare_data(cal_housing, seed) 101 | 102 | sampler = optuna.samplers.TPESampler(seed=seed) 103 | study = optuna.create_study(direction="minimize", sampler=sampler) 104 | 105 | obj = partial( 106 | objective_function, 107 | seed=seed, 108 | n_estimators=n_estimators, 109 | LGBMBooster=LGBMBooster, 110 | X_train=X_train, 111 | y_train=y_train, 112 | scoring=scoring, 113 | ) 114 | 115 | start = process_time() 116 | tick = time() 117 | study.optimize(obj, n_trials=n_trials, callbacks=[save_best_cv_results]) 118 | stop = process_time() 119 | cpu_times.append(stop - start) 120 | wall_times.append(time() - tick) 121 | 122 | models = best_cv_results["estimator"] 123 | if metric_name == "log_loss": 124 | y_pred = np.mean([model.predict_proba(X_test) for model in models], axis=0) 125 | else: 126 | y_pred = np.mean([model.predict(X_test) for model in models], axis=0) 127 | metric = metric_function(y_test, y_pred) 128 | metrics.append(metric) 129 | 130 | print(f"seed: {seed}, cpu time: {stop - start}, {metric_name}: {metric}") 131 | 132 | print(f"avg cpu time: {np.mean(cpu_times)}, avg {metric_name}: {np.mean(metrics)}") 133 | print(f"avg wall time: {np.mean(wall_times)}") 134 | print(f"cpu time / wall time: {(np.mean(cpu_times)/np.mean(wall_times)):.1f}") 135 | -------------------------------------------------------------------------------- /python-package/examples/benchmark_perpetual.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from time import process_time, time 3 | from perpetual import PerpetualBooster 4 | from sklearn.model_selection import train_test_split 5 | from sklearn.metrics import mean_squared_error, log_loss 6 | from sklearn.datasets import fetch_covtype, fetch_california_housing 7 | from importlib.metadata import version 8 | 9 | 10 | def prepare_data(cal_housing, seed): 11 | if cal_housing: 12 | data, target = fetch_california_housing(return_X_y=True, as_frame=True) 13 | metric_function = mean_squared_error 14 | metric_name = "mse" 15 | objective = "SquaredLoss" 16 | else: 17 | data, target = fetch_covtype(return_X_y=True, as_frame=True) 18 | metric_function = log_loss 19 | metric_name = "log_loss" 20 | objective = "LogLoss" 21 | X_train, X_test, y_train, y_test = train_test_split( 22 | data, target, test_size=0.2248, random_state=seed 23 | ) 24 | return X_train, X_test, y_train, y_test, metric_function, metric_name, objective 25 | 26 | 27 | if __name__ == "__main__": 28 | print(f"perpetual: {version('perpetual')}") 29 | budget = 1.0 30 | num_threads = 2 31 | cal_housing = True # True -> California Housing, False -> Cover Types 32 | cpu_times = [] 33 | wall_times = [] 34 | metrics = [] 35 | 36 | for seed in range(5): 37 | X_train, X_test, y_train, y_test, metric_function, metric_name, objective = ( 38 | prepare_data(cal_housing, seed) 39 | ) 40 | 41 | model = PerpetualBooster( 42 | objective=objective, num_threads=num_threads, log_iterations=0 43 | ) 44 | 45 | start = process_time() 46 | tick = time() 47 | model.fit(X_train, y_train, budget=budget) 48 | stop = process_time() 49 | cpu_times.append(stop 
- start) 50 | wall_times.append(time() - tick) 51 | 52 | if metric_name == "log_loss": 53 | y_pred = model.predict_proba(X_test) 54 | else: 55 | y_pred = model.predict(X_test) 56 | metric = metric_function(y_test, y_pred) 57 | metrics.append(metric) 58 | 59 | print(f"seed: {seed}, cpu time: {stop - start}, {metric_name}: {metric}") 60 | 61 | print(f"avg cpu time: {np.mean(cpu_times)}, avg {metric_name}: {np.mean(metrics)}") 62 | print(f"avg wall time: {np.mean(wall_times)}") 63 | print(f"cpu time / wall time: {(np.mean(cpu_times)/np.mean(wall_times)):.1f}") 64 | -------------------------------------------------------------------------------- /python-package/examples/categorical_data_titanic.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import seaborn as sns\n", 12 | "from scipy.special import expit\n", 13 | "from lightgbm import LGBMClassifier\n", 14 | "from sklearn.metrics import log_loss, accuracy_score\n", 15 | "from sklearn.model_selection import train_test_split\n", 16 | "from perpetual import PerpetualBooster" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "pd.set_option('display.max_rows', 1000)" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "!python --version" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "from importlib.metadata import version\n", 44 | "\n", 45 | "print(f\"numpy: {version('numpy')}\")\n", 46 | "print(f\"optuna: {version('optuna')}\")\n", 47 | "print(f\"lightgbm: {version('lightgbm')}\")\n", 48 | "print(f\"scikit-learn: {version('scikit-learn')}\")\n", 49 | "print(f\"perpetual: {version('perpetual')}\")" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 5, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "df = sns.load_dataset(\"titanic\")" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 6, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "df.drop(columns=[\"alive\"], inplace=True)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 7, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "X = df.drop(columns=[\"survived\"])\n", 77 | "y = df[\"survived\"]" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "X.shape" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "X.dtypes" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "X.nunique()" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "X.head()" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "X[\"sex\"] = pd.get_dummies(X[\"sex\"], drop_first=True, dtype=float).to_numpy()\n", 123 | "X[\"adult_male\"] = 
pd.get_dummies(X[\"adult_male\"], drop_first=True, dtype=float).to_numpy()\n", 124 | "# X[\"alive\"] = pd.get_dummies(X[\"alive\"], drop_first=True, dtype=float).to_numpy()\n", 125 | "X[\"alone\"] = pd.get_dummies(X[\"alone\"], drop_first=True, dtype=float).to_numpy()\n", 126 | "cols = ['pclass', 'sibsp', 'parch', 'embarked', 'class', 'who', 'deck', 'embark_town', 'age', 'fare']\n", 127 | "X[cols] = X[cols].astype('category')\n", 128 | "X.head()" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 13, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "seed = 42\n", 138 | "n_estimators = 100\n", 139 | "n_trials = 1" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 14, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "scoring = \"neg_log_loss\"\n", 149 | "metric_function = log_loss\n", 150 | "metric_name = \"log_loss\"\n", 151 | "objective_type = \"LogLoss\"" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)\n", 161 | "\n", 162 | "print(f\"X_train.shape: {X_train.shape}\")\n", 163 | "print(f\"X_test.shape: {X_test.shape}\")" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "X_train.head()" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "set(X_train[\"who\"])" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "model_lgbm = LGBMClassifier(objective=\"binary\")\n", 191 | "model_lgbm.fit(X_train, y_train)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "model = PerpetualBooster(objective=\"LogLoss\")\n", 201 | "model.fit(X_train, y_train, budget=0.1)" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "y_pred = np.round(expit(model.predict(X_test)))\n", 211 | "print(accuracy_score(y_test, y_pred))" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "y_pred = np.round(expit(model.predict(X_train)))\n", 221 | "print(accuracy_score(y_train, y_pred))" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "if metric_name == \"log_loss\":\n", 231 | " y_pred = expit(model.predict(X_test))\n", 232 | "else:\n", 233 | " y_pred = np.round(expit(model.predict(X_test)))\n", 234 | "print(f\"Test {metric_name}: {round(metric_function(y_test, y_pred), 6)}\")" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 23, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "df_trees = model.trees_to_dataframe()" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "df_trees.head(10)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | 
"source": [ 261 | "model_lgbm.booster_.trees_to_dataframe().head(10)" 262 | ] 263 | } 264 | ], 265 | "metadata": { 266 | "kernelspec": { 267 | "display_name": "py311", 268 | "language": "python", 269 | "name": "python3" 270 | }, 271 | "language_info": { 272 | "codemirror_mode": { 273 | "name": "ipython", 274 | "version": 3 275 | }, 276 | "file_extension": ".py", 277 | "mimetype": "text/x-python", 278 | "name": "python", 279 | "nbconvert_exporter": "python", 280 | "pygments_lexer": "ipython3", 281 | "version": "3.11.9" 282 | } 283 | }, 284 | "nbformat": 4, 285 | "nbformat_minor": 2 286 | } 287 | -------------------------------------------------------------------------------- /python-package/examples/fetch_openml.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import logging\n", 10 | "from perpetual import PerpetualBooster\n", 11 | "from sklearn.datasets import fetch_openml\n", 12 | "from importlib.metadata import version" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "logging.basicConfig(level=logging.INFO)" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "print(f\"perpetual: {version('perpetual')}\")" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "data, target = fetch_openml(data_id=45667, return_X_y=True, as_frame=True)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "data.shape" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "data.dtypes" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "model = PerpetualBooster(objective=\"SquaredLoss\", log_iterations=1)\n", 67 | "model.fit(data, target, budget=0.5)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "model.number_of_trees" 77 | ] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "py311", 83 | "language": "python", 84 | "name": "python3" 85 | }, 86 | "language_info": { 87 | "codemirror_mode": { 88 | "name": "ipython", 89 | "version": 3 90 | }, 91 | "file_extension": ".py", 92 | "mimetype": "text/x-python", 93 | "name": "python", 94 | "nbconvert_exporter": "python", 95 | "pygments_lexer": "ipython3", 96 | "version": "3.11.9" 97 | } 98 | }, 99 | "nbformat": 4, 100 | "nbformat_minor": 2 101 | } 102 | -------------------------------------------------------------------------------- /python-package/examples/lgbm_openml_sensory.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from sklearn.datasets import fetch_openml\n", 11 | "from lightgbm import LGBMRegressor" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": 
[ 20 | "pd.set_option('display.max_rows', 500)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "df = fetch_openml(data_id=546)\n", 30 | "X = df.data\n", 31 | "y = df.target" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "model = LGBMRegressor(n_estimators=10, max_depth=1, learning_rate=0.1, max_cat_to_onehot=1, cat_l2=0.0, cat_smooth=0.0, min_data_per_group=1, max_cat_threshold=1000)\n", 41 | "model.fit(X, y)" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "X.dtypes" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "list(X.columns)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "df = model.booster_.trees_to_dataframe()\n", 69 | "df_mod = df.loc[df[\"weight\"] == 0]\n", 70 | "df_mod.head(10)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "list(df_mod[\"split_gain\"])" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "model.booster_.trees_to_dataframe()" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "from xgboost import XGBRegressor" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "xgb = XGBRegressor(n_estimators=10, learning_rate=0.1, max_depth=1, enable_categorical=True, max_cat_to_onehot=1, max_cat_threshold=100, reg_alpha=0.0, reg_lambda=0.0)\n", 107 | "xgb.fit(X, y)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "xgb.get_booster().trees_to_dataframe()" 117 | ] 118 | } 119 | ], 120 | "metadata": { 121 | "kernelspec": { 122 | "display_name": "py311", 123 | "language": "python", 124 | "name": "python3" 125 | }, 126 | "language_info": { 127 | "codemirror_mode": { 128 | "name": "ipython", 129 | "version": 3 130 | }, 131 | "file_extension": ".py", 132 | "mimetype": "text/x-python", 133 | "name": "python", 134 | "nbconvert_exporter": "python", 135 | "pygments_lexer": "ipython3", 136 | "version": "3.11.9" 137 | } 138 | }, 139 | "nbformat": 4, 140 | "nbformat_minor": 2 141 | } 142 | -------------------------------------------------------------------------------- /python-package/examples/openml.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from scipy.io import arff\n", 12 | "from perpetual import PerpetualBooster" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "from importlib.metadata import version\n", 22 | "\n", 23 | "print(f\"numpy: {version('numpy')}\")\n", 24 | "print(f\"pandas: {version('pandas')}\")\n", 25 | "print(f\"scipy: 
{version('scipy')}\")\n", 26 | "print(f\"perpetual: {version('perpetual')}\")" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "arff_file = arff.loadarff('../../resources/christine.arff')\n", 36 | "df = pd.DataFrame(arff_file[0])" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "df.head()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "df.shape" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "y = df.pop('class')\n", 64 | "y" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "y = np.array(y).astype(int)\n", 74 | "y" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "df.shape" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "df.dtypes" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "idx = [i for i, e in enumerate(list(df.dtypes)) if e==np.dtype('O')]\n", 102 | "cat_features = np.array(df.columns)[idx]\n", 103 | "print(cat_features)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 11, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "df[cat_features] = df[cat_features].astype(\"category\")" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "from perpetual.utils import convert_input_frame\n", 122 | "\n", 123 | "features_, df_flat, rows, cols, categorical_features_, cat_mapping = convert_input_frame(df, \"auto\", 1000)\n", 124 | "\n", 125 | "pd.Series(df_flat).to_csv(\"../../resources/christine_flat.csv\", index=False, header=False)\n", 126 | "pd.Series(y).to_csv(\"../../resources/christine_y.csv\", index=False, header=False)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 13, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "model = PerpetualBooster(log_iterations=1)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "model.fit(df, y, budget=0.1)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "model.number_of_trees" 154 | ] 155 | } 156 | ], 157 | "metadata": { 158 | "kernelspec": { 159 | "display_name": "py311", 160 | "language": "python", 161 | "name": "python3" 162 | }, 163 | "language_info": { 164 | "codemirror_mode": { 165 | "name": "ipython", 166 | "version": 3 167 | }, 168 | "file_extension": ".py", 169 | "mimetype": "text/x-python", 170 | "name": "python", 171 | "nbconvert_exporter": "python", 172 | "pygments_lexer": "ipython3", 173 | "version": "3.11.9" 174 | } 175 | }, 176 | "nbformat": 4, 177 | "nbformat_minor": 2 178 | } 179 | -------------------------------------------------------------------------------- /python-package/examples/openml_mnist.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from perpetual import PerpetualBooster" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import logging\n", 20 | "\n", 21 | "logging.basicConfig()\n", 22 | "logging.getLogger().setLevel(logging.DEBUG)" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "X_train = pd.read_csv(\"../../resources/fashion_train_flat.csv\", index_col=False, header=None).to_numpy().reshape(63000, -1)\n", 32 | "X_train.shape" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "y_train = pd.read_csv(\"../../resources/fashion_train_y.csv\", index_col=False, header=None).to_numpy().flatten()\n", 42 | "y_train.shape" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "model = PerpetualBooster(log_iterations=1)\n", 52 | "model.fit(X_train, y_train, budget=1.0, timeout=360)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "model.number_of_trees" 62 | ] 63 | } 64 | ], 65 | "metadata": { 66 | "kernelspec": { 67 | "display_name": "py311", 68 | "language": "python", 69 | "name": "python3" 70 | }, 71 | "language_info": { 72 | "codemirror_mode": { 73 | "name": "ipython", 74 | "version": 3 75 | }, 76 | "file_extension": ".py", 77 | "mimetype": "text/x-python", 78 | "name": "python", 79 | "nbconvert_exporter": "python", 80 | "pygments_lexer": "ipython3", 81 | "version": "3.11.9" 82 | } 83 | }, 84 | "nbformat": 4, 85 | "nbformat_minor": 2 86 | } 87 | -------------------------------------------------------------------------------- /python-package/examples/santander.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "967af9d9", 7 | "metadata": { 8 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 9 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", 10 | "execution": { 11 | "iopub.execute_input": "2024-10-21T07:01:07.130508Z", 12 | "iopub.status.busy": "2024-10-21T07:01:07.130061Z", 13 | "iopub.status.idle": "2024-10-21T07:01:08.048111Z", 14 | "shell.execute_reply": "2024-10-21T07:01:08.046970Z" 15 | }, 16 | "papermill": { 17 | "duration": 0.926499, 18 | "end_time": "2024-10-21T07:01:08.050965", 19 | "exception": false, 20 | "start_time": "2024-10-21T07:01:07.124466", 21 | "status": "completed" 22 | }, 23 | "tags": [] 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "import numpy as np\n", 28 | "import pandas as pd\n", 29 | "from perpetual import PerpetualBooster\n", 30 | "from sklearn.metrics import roc_auc_score\n", 31 | "from sklearn.model_selection import KFold\n", 32 | "from autogluon.tabular import TabularPredictor" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "id": "c257f8fc", 39 | "metadata": { 40 | "execution": { 41 | "iopub.execute_input": "2024-10-21T07:01:28.223537Z", 42 | "iopub.status.busy": "2024-10-21T07:01:28.222764Z", 43 | "iopub.status.idle": 
"2024-10-21T07:01:34.667262Z", 44 | "shell.execute_reply": "2024-10-21T07:01:34.666013Z" 45 | }, 46 | "papermill": { 47 | "duration": 6.453134, 48 | "end_time": "2024-10-21T07:01:34.670004", 49 | "exception": false, 50 | "start_time": "2024-10-21T07:01:28.216870", 51 | "status": "completed" 52 | }, 53 | "tags": [] 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "X_train = pd.read_csv('../../resources/santander-train.csv', index_col=0)\n", 58 | "y_train = X_train.pop('TARGET')" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "id": "e0590d0a", 65 | "metadata": { 66 | "execution": { 67 | "iopub.execute_input": "2024-10-21T07:01:34.680697Z", 68 | "iopub.status.busy": "2024-10-21T07:01:34.680290Z", 69 | "iopub.status.idle": "2024-10-21T07:01:34.689412Z", 70 | "shell.execute_reply": "2024-10-21T07:01:34.688210Z" 71 | }, 72 | "papermill": { 73 | "duration": 0.017414, 74 | "end_time": "2024-10-21T07:01:34.691792", 75 | "exception": false, 76 | "start_time": "2024-10-21T07:01:34.674378", 77 | "status": "completed" 78 | }, 79 | "tags": [] 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "X_train.shape" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 4, 89 | "id": "22eba1d7", 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "cv = KFold(shuffle=True, random_state=42)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "id": "921f491f", 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "scores = []\n", 104 | "for train, test in cv.split(X_train, y_train):\n", 105 | " model = PerpetualBooster(objective=\"LogLoss\")\n", 106 | " model.fit(X_train.iloc[train], y_train.iloc[train], budget=1.0)\n", 107 | " probabilities = model.predict_proba(X_train.iloc[test])\n", 108 | " score = roc_auc_score(y_train.iloc[test], probabilities[:, 1])\n", 109 | " scores.append(score)\n", 110 | " print(model.number_of_trees)\n", 111 | "print(np.mean(scores))" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "id": "0a04e569", 118 | "metadata": { 119 | "execution": { 120 | "iopub.execute_input": "2024-10-21T07:01:34.702317Z", 121 | "iopub.status.busy": "2024-10-21T07:01:34.701880Z", 122 | "iopub.status.idle": "2024-10-21T07:02:04.983918Z", 123 | "shell.execute_reply": "2024-10-21T07:02:04.982720Z" 124 | }, 125 | "papermill": { 126 | "duration": 30.294535, 127 | "end_time": "2024-10-21T07:02:04.990727", 128 | "exception": false, 129 | "start_time": "2024-10-21T07:01:34.696192", 130 | "status": "completed" 131 | }, 132 | "tags": [] 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "scores = []\n", 137 | "X_train['TARGET'] = y_train\n", 138 | "for train, test in cv.split(X_train, y_train):\n", 139 | " model = TabularPredictor(label=\"TARGET\", verbosity=0)\n", 140 | " model.fit(X_train.iloc[train])\n", 141 | " probabilities = model.predict_proba(X_train.iloc[test])\n", 142 | " score = roc_auc_score(y_train.iloc[test], probabilities.to_numpy()[:, 1])\n", 143 | " print(score)\n", 144 | " scores.append(score)\n", 145 | "print(np.mean(scores))" 146 | ] 147 | } 148 | ], 149 | "metadata": { 150 | "kaggle": { 151 | "accelerator": "none", 152 | "dataSources": [ 153 | { 154 | "databundleVersionId": 860641, 155 | "sourceId": 4986, 156 | "sourceType": "competition" 157 | } 158 | ], 159 | "dockerImageVersionId": 30786, 160 | "isGpuEnabled": false, 161 | "isInternetEnabled": true, 162 | "language": "python", 163 | "sourceType": "notebook" 164 | }, 165 | "kernelspec": { 166 | 
"display_name": "py311", 167 | "language": "python", 168 | "name": "python3" 169 | }, 170 | "language_info": { 171 | "codemirror_mode": { 172 | "name": "ipython", 173 | "version": 3 174 | }, 175 | "file_extension": ".py", 176 | "mimetype": "text/x-python", 177 | "name": "python", 178 | "nbconvert_exporter": "python", 179 | "pygments_lexer": "ipython3", 180 | "version": "3.11.9" 181 | }, 182 | "papermill": { 183 | "default_parameters": {}, 184 | "duration": 62.454609, 185 | "end_time": "2024-10-21T07:02:06.520206", 186 | "environment_variables": {}, 187 | "exception": null, 188 | "input_path": "__notebook__.ipynb", 189 | "output_path": "__notebook__.ipynb", 190 | "parameters": {}, 191 | "start_time": "2024-10-21T07:01:04.065597", 192 | "version": "2.6.0" 193 | } 194 | }, 195 | "nbformat": 4, 196 | "nbformat_minor": 5 197 | } 198 | -------------------------------------------------------------------------------- /python-package/examples/toy_datasets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import time\n", 10 | "import logging\n", 11 | "import pandas as pd\n", 12 | "from sklearn.datasets import load_breast_cancer, load_iris\n", 13 | "from sklearn.ensemble import RandomForestClassifier\n", 14 | "from sklearn.model_selection import train_test_split\n", 15 | "from sklearn.metrics import accuracy_score, log_loss\n", 16 | "from perpetual import PerpetualBooster" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "from importlib.metadata import version\n", 26 | "\n", 27 | "print(f\"scikit-learn: {version('scikit-learn')}\")\n", 28 | "print(f\"perpetual: {version('perpetual')}\")" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "logging.basicConfig(level=logging.INFO)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "def evaluate(model, X_train, y_train, X_test, y_test, budget=None):\n", 47 | " start = time.time()\n", 48 | " model.fit(X_train, y_train, budget=budget) if budget else model.fit(X_train, y_train)\n", 49 | " if budget:\n", 50 | " print(model.number_of_trees)\n", 51 | " duration = time.time() - start\n", 52 | " return duration, accuracy_score(y_test, model.predict(X_test)), log_loss(y_test, model.predict_proba(X_test))\n", 53 | "\n", 54 | "datasets = {\"Breast Cancer\": load_breast_cancer(return_X_y=True), \"Binary Iris\": (load_iris(return_X_y=True)[0][load_iris().target!=2], load_iris(return_X_y=True)[1][load_iris().target!=2])}\n", 55 | "results = pd.DataFrame(columns=[\"Dataset\", \"Model\", \"Budget\", \"Time\", \"Accuracy\", \"Log Loss\"])\n", 56 | "\n", 57 | "for name, (X, y) in datasets.items():\n", 58 | " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", 59 | " pb = PerpetualBooster(objective=\"LogLoss\", log_iterations=1, stopping_rounds=1, iteration_limit=1000)\n", 60 | " rf = RandomForestClassifier()\n", 61 | " results = pd.concat([results,\n", 62 | " pd.DataFrame([[name, \"Perpetual\", \"0.1\", *evaluate(pb, X_train, y_train, X_test, y_test, budget=0.1)]], columns=results.columns),\n", 63 | " #pd.DataFrame([[name, \"Perpetual\", \"1.0\", *evaluate(pb, X_train, y_train, X_test, 
y_test, budget=1.0)]], columns=results.columns),\n", 64 | " #pd.DataFrame([[name, \"Perpetual\", \"2.0\", *evaluate(pb, X_train, y_train, X_test, y_test, budget=2.0)]], columns=results.columns),\n", 65 | " #pd.DataFrame([[name, \"RF\", \"-\", *evaluate(rf, X_train, y_train, X_test, y_test)]], columns=results.columns),\n", 66 | " ],\n", 67 | " ignore_index=True)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "results" 77 | ] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "py311", 83 | "language": "python", 84 | "name": "python3" 85 | }, 86 | "language_info": { 87 | "codemirror_mode": { 88 | "name": "ipython", 89 | "version": 3 90 | }, 91 | "file_extension": ".py", 92 | "mimetype": "text/x-python", 93 | "name": "python", 94 | "nbconvert_exporter": "python", 95 | "pygments_lexer": "ipython3", 96 | "version": "3.11.9" 97 | } 98 | }, 99 | "nbformat": 4, 100 | "nbformat_minor": 2 101 | } 102 | -------------------------------------------------------------------------------- /python-package/mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: perpetual 2 | repo_name: perpetual 3 | repo_url: https://github.com/perpetual-ml/perpetual/ 4 | theme: 5 | name: material 6 | icon: 7 | repo: fontawesome/brands/github 8 | logo: material/forest-outline 9 | palette: 10 | # Palette toggle for light mode 11 | - scheme: default 12 | primary: deep purple 13 | toggle: 14 | icon: material/toggle-switch 15 | name: Switch to dark mode 16 | # Palette toggle for dark mode 17 | - scheme: slate 18 | primary: deep purple 19 | toggle: 20 | icon: material/toggle-switch-off-outline 21 | name: Switch to light mode 22 | 23 | markdown_extensions: 24 | - pymdownx.highlight: 25 | anchor_linenums: true 26 | line_spans: __span 27 | pygments_lang_class: true 28 | - pymdownx.inlinehilite 29 | - pymdownx.snippets 30 | - pymdownx.superfences 31 | 32 | plugins: 33 | - search 34 | - autorefs 35 | - mkdocstrings: 36 | handlers: 37 | python: 38 | options: 39 | heading_level: 2 40 | docstring_section_style: list 41 | members_order: source 42 | show_root_heading: true 43 | show_root_full_path: false 44 | separate_signature: true 45 | show_source: false 46 | show_signature_annotations: true 47 | merge_init_into_class: true 48 | -------------------------------------------------------------------------------- /python-package/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin>=1.0,<2.0"] 3 | build-backend = "maturin" 4 | 5 | [project] 6 | name = "perpetual" 7 | version = "0.9.3" 8 | description = "A self-generalizing gradient boosting machine that doesn't need hyperparameter optimization" 9 | keywords = [ 10 | "rust", 11 | "perpetual", 12 | "machine learning", 13 | "tree model", 14 | "decision tree", 15 | "gradient boosted decision tree", 16 | "gradient boosting machine" 17 | ] 18 | authors = [{ name = "Mutlu Simsek" }] 19 | dependencies = ["numpy", "typing-extensions"] 20 | requires-python = ">=3.9" 21 | classifiers = [ 22 | "Programming Language :: Rust", 23 | "Programming Language :: Python :: 3", 24 | "Programming Language :: Python :: 3.9", 25 | "Programming Language :: Python :: 3.10", 26 | "Programming Language :: Python :: 3.11", 27 | "Programming Language :: Python :: 3.12", 28 | "Programming Language :: Python :: 3.13", 29 | ] 30 | 31 | [project.optional-dependencies] 32 | 
dev = ["black", "pandas", "polars", "pyarrow", "maturin", "pytest", "seaborn", "scikit-learn", "mkdocs-material", "mkdocstrings[python]", "mkdocs-autorefs", "ruff"] 33 | 34 | [tool.maturin] 35 | sdist-include = ["LICENSE", "README.md"] 36 | python-source = "python" 37 | module-name = "perpetual.perpetual" 38 | 39 | [tool.ruff] 40 | # Never enforce `E501` (line length violations). 41 | ignore = ["E501"] 42 | 43 | [tool.isort] 44 | profile = "black" 45 | -------------------------------------------------------------------------------- /python-package/python/perpetual/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from perpetual.booster import PerpetualBooster 4 | 5 | 6 | __all__ = ["PerpetualBooster"] 7 | -------------------------------------------------------------------------------- /python-package/python/perpetual/data.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Iterable, Optional, Union 3 | 4 | 5 | @dataclass 6 | class Node: 7 | """Dataclass representation of a node, this represents all of the fields present in a tree node.""" 8 | 9 | num: int 10 | weight_value: float 11 | hessian_sum: float 12 | depth: int 13 | split_value: float 14 | split_feature: Union[str, int] 15 | split_gain: float 16 | missing_node: int 17 | left_child: int 18 | right_child: int 19 | is_leaf: bool 20 | node_type: str 21 | parent_node: int 22 | generalization: Optional[float] 23 | left_cats: Optional[Iterable] 24 | right_cats: Optional[Iterable] 25 | count: int 26 | -------------------------------------------------------------------------------- /python-package/python/perpetual/serialize.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | from abc import ABC, abstractmethod 5 | from ast import literal_eval 6 | from dataclasses import dataclass 7 | from typing import Dict, Generic, List, Tuple, TypeVar, Union 8 | 9 | import numpy as np 10 | import numpy.typing as npt 11 | 12 | T = TypeVar("T") 13 | 14 | 15 | class BaseSerializer(ABC, Generic[T]): 16 | @abstractmethod 17 | def serialize(self, obj: T) -> str: 18 | """serialize method - should take an object and return a string""" 19 | 20 | @abstractmethod 21 | def deserialize(self, obj_repr: str) -> T: 22 | """deserialize method - should take a string and return original object""" 23 | 24 | 25 | Scaler = Union[int, float, str] 26 | 27 | 28 | class ScalerSerializer(BaseSerializer[Scaler]): 29 | def serialize(self, obj: Scaler) -> str: 30 | if isinstance(obj, str): 31 | obj_ = f"'{obj}'" 32 | else: 33 | obj_ = str(obj) 34 | return obj_ 35 | 36 | def deserialize(self, obj_repr: str) -> Scaler: 37 | return literal_eval(node_or_string=obj_repr) 38 | 39 | 40 | ObjectItem = Union[ 41 | List[Scaler], 42 | Dict[str, Scaler], 43 | Scaler, 44 | ] 45 | 46 | 47 | class ObjectSerializer(BaseSerializer[ObjectItem]): 48 | def serialize(self, obj: ObjectItem) -> str: 49 | return json.dumps(obj) 50 | 51 | def deserialize(self, obj_repr: str) -> ObjectItem: 52 | return json.loads(obj_repr) 53 | 54 | 55 | @dataclass 56 | class NumpyData: 57 | array: Union[List[float], List[int]] 58 | dtype: str 59 | shape: Tuple[int, ...] 
60 | 61 | 62 | class NumpySerializer(BaseSerializer[npt.NDArray]): 63 | def serialize(self, obj: npt.NDArray) -> str: 64 | return json.dumps( 65 | {"array": obj.tolist(), "dtype": str(obj.dtype), "shape": obj.shape} 66 | ) 67 | 68 | def deserialize(self, obj_repr: str) -> npt.NDArray: 69 | data = NumpyData(**json.loads(obj_repr)) 70 | a = np.array(data.array, dtype=data.dtype) # type: ignore 71 | if len(data.shape) == 1: 72 | return a 73 | else: 74 | return a.reshape(data.shape) 75 | -------------------------------------------------------------------------------- /python-package/python/perpetual/types.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing_extensions import Self 3 | from typing import Any, Dict, Iterable, Protocol, Set 4 | 5 | 6 | class BoosterType(Protocol): 7 | monotone_constraints: Dict[int, int] 8 | terminate_missing_features: Set[int] 9 | number_of_trees: int 10 | base_score: float 11 | 12 | def fit( 13 | self, 14 | flat_data: np.ndarray, 15 | rows: int, 16 | cols: int, 17 | y: np.ndarray, 18 | budget: float, 19 | sample_weight: np.ndarray, 20 | parallel: bool = False, 21 | ): 22 | """Fit method""" 23 | 24 | def predict( 25 | self, 26 | flat_data: np.ndarray, 27 | rows: int, 28 | cols: int, 29 | parallel: bool = True, 30 | ) -> np.ndarray: 31 | """predict method""" 32 | 33 | def predict_proba( 34 | self, 35 | flat_data: np.ndarray, 36 | rows: int, 37 | cols: int, 38 | parallel: bool = True, 39 | ) -> np.ndarray: 40 | """predict probabilities method""" 41 | 42 | def predict_contributions( 43 | self, 44 | flat_data: np.ndarray, 45 | rows: int, 46 | cols: int, 47 | method: str, 48 | parallel: bool = True, 49 | ) -> np.ndarray: 50 | """method""" 51 | 52 | def value_partial_dependence( 53 | self, 54 | feature: int, 55 | value: float, 56 | ) -> float: 57 | """pass""" 58 | 59 | def calculate_feature_importance( 60 | self, 61 | method: str, 62 | normalize: bool, 63 | ) -> Dict[int, float]: 64 | """pass""" 65 | 66 | def text_dump(self) -> Iterable[str]: 67 | """pass""" 68 | 69 | @classmethod 70 | def load_booster(cls, path: str) -> Self: 71 | """pass""" 72 | 73 | def save_booster(self, path: str): 74 | """pass""" 75 | 76 | @classmethod 77 | def from_json(cls, json_str: str) -> Self: 78 | """pass""" 79 | 80 | def json_dump(self) -> str: 81 | """pass""" 82 | 83 | def get_params(self) -> Dict[str, Any]: 84 | """pass""" 85 | 86 | def insert_metadata(self, key: str, value: str) -> None: 87 | """pass""" 88 | 89 | def get_metadata(self, key: str) -> str: 90 | """pass""" 91 | 92 | 93 | class MultiOutputBoosterType(Protocol): 94 | monotone_constraints: Dict[int, int] 95 | terminate_missing_features: Set[int] 96 | number_of_trees: Iterable[int] 97 | base_score: Iterable[float] 98 | 99 | def fit( 100 | self, 101 | flat_data: np.ndarray, 102 | rows: int, 103 | cols: int, 104 | y: np.ndarray, 105 | budget: float, 106 | sample_weight: np.ndarray, 107 | parallel: bool = False, 108 | ): 109 | """Fit method""" 110 | 111 | def predict( 112 | self, 113 | flat_data: np.ndarray, 114 | rows: int, 115 | cols: int, 116 | parallel: bool = True, 117 | ) -> np.ndarray: 118 | """predict method""" 119 | 120 | def predict_proba( 121 | self, 122 | flat_data: np.ndarray, 123 | rows: int, 124 | cols: int, 125 | parallel: bool = True, 126 | ) -> np.ndarray: 127 | """predict probabilities method""" 128 | 129 | @classmethod 130 | def load_booster(cls, path: str) -> Self: 131 | """pass""" 132 | 133 | def save_booster(self, path: str): 134 | 
"""pass""" 135 | 136 | @classmethod 137 | def from_json(cls, json_str: str) -> Self: 138 | """pass""" 139 | 140 | def json_dump(self) -> str: 141 | """pass""" 142 | 143 | def get_params(self) -> Dict[str, Any]: 144 | """pass""" 145 | 146 | def insert_metadata(self, key: str, value: str) -> None: 147 | """pass""" 148 | 149 | def get_metadata(self, key: str) -> str: 150 | """pass""" 151 | -------------------------------------------------------------------------------- /python-package/python/perpetual/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import numpy as np 3 | from typing import Dict, Iterable, List, Optional, Tuple 4 | 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | def type_df(df): 10 | library_name = type(df).__module__.split(".")[0] 11 | if type(df).__name__ == "DataFrame": 12 | if library_name == "pandas": 13 | return "pandas_df" 14 | elif library_name == "polars": 15 | return "polars_df" 16 | elif library_name == "numpy": 17 | return "numpy" 18 | else: 19 | return "" 20 | 21 | 22 | def type_series(y): 23 | library_name = type(y).__module__.split(".")[0] 24 | if type(y).__name__ == "Series": 25 | if library_name == "pandas": 26 | return "pandas_series" 27 | elif library_name == "polars": 28 | return "polars_series" 29 | elif library_name == "numpy": 30 | return "numpy" 31 | else: 32 | return "" 33 | 34 | 35 | def convert_input_array(x, objective, is_target=False) -> np.ndarray: 36 | classes_ = [] 37 | 38 | if type(x).__module__.split(".")[0] == "numpy": 39 | if len(x.shape) == 2: 40 | classes_, x_, *_ = convert_input_frame(x, None, 1000) 41 | else: 42 | x_ = x 43 | elif type_series(x) == "pandas_series": 44 | x_ = x.to_numpy() 45 | elif type_series(x) == "polars_series": 46 | x_ = x.to_numpy(allow_copy=False) 47 | elif type_df(x) == "polars_df" or type_df(x) == "pandas_df": 48 | classes_, x_, *_ = convert_input_frame(x, None, 1000) 49 | else: 50 | x_ = x.to_numpy() 51 | 52 | if is_target and objective == "LogLoss" and len(x_.shape) == 1: 53 | classes_ = np.unique(x_) 54 | x_index = np.array([np.where(classes_ == i) for i in x_]) 55 | if len(classes_) > 2: 56 | x_ = np.squeeze(np.eye(len(classes_))[x_index]) 57 | 58 | if not np.issubdtype(x_.dtype, "float64"): 59 | x_ = x_.astype(dtype="float64", copy=False) 60 | 61 | if len(x_.shape) == 2: 62 | x_ = x_.ravel(order="F") 63 | 64 | return x_, classes_ 65 | 66 | 67 | def convert_input_frame( 68 | X, 69 | categorical_features, 70 | max_cat, 71 | ) -> Tuple[List[str], np.ndarray, int, int, Optional[Iterable[int]], Optional[Dict]]: 72 | """Convert data to format needed by booster. 
73 | 74 | Returns: 75 | Tuple[List[str], np.ndarray, int, int, Optional[Iterable[int]], Optional[Dict]]: Return column names, the flat data, number of rows, the number of columns, cat_index, cat_mapping 76 | """ 77 | categorical_features_ = None 78 | if type_df(X) == "pandas_df": 79 | X_ = X.to_numpy() 80 | features_ = X.columns.to_list() 81 | if categorical_features == "auto": 82 | categorical_columns = X.select_dtypes(include=["category"]).columns.tolist() 83 | categorical_features_ = [ 84 | features_.index(c) for c in categorical_columns 85 | ] or None 86 | elif type_df(X) == "polars_df": 87 | import polars.selectors as cs 88 | 89 | try: 90 | X_ = X.to_numpy(allow_copy=False) 91 | except RuntimeError: 92 | X_ = X.to_numpy(allow_copy=True) 93 | 94 | features_ = X.columns 95 | if categorical_features == "auto": 96 | categorical_columns = X.select(cs.categorical()).columns 97 | categorical_features_ = [ 98 | features_.index(c) for c in categorical_columns 99 | ] or None 100 | else: 101 | # Assume it's a numpy array. 102 | X_ = X 103 | features_ = list(map(str, range(X_.shape[1]))) 104 | 105 | if ( 106 | categorical_features 107 | and all(isinstance(s, int) for s in categorical_features) 108 | and isinstance(categorical_features, list) 109 | ): 110 | categorical_features_ = categorical_features 111 | elif ( 112 | categorical_features 113 | and all(isinstance(s, str) for s in categorical_features) 114 | and isinstance(categorical_features, list) 115 | ): 116 | categorical_features_ = [features_.index(c) for c in categorical_features] 117 | 118 | cat_mapping = {} # key: feature_name, value: ordered category names 119 | cat_to_num = [] 120 | if categorical_features_: 121 | for i in categorical_features_: 122 | categories, inversed = np.unique(X_[:, i].astype(str), return_inverse=True) 123 | 124 | categories = list(categories) 125 | if "nan" in categories: 126 | categories.remove("nan") 127 | categories.insert(0, "nan") 128 | 129 | inversed = inversed + 1.0 130 | 131 | if len(categories) > max_cat: 132 | cat_to_num.append(i) 133 | logger.warning( 134 | f"Feature {features_[i]} will be treated as numerical since the number of categories ({len(categories)}) exceeds max_cat ({max_cat}) threshold." 135 | ) 136 | 137 | feature_name = features_[i] 138 | cat_mapping[feature_name] = categories 139 | ind_nan = len(categories) 140 | inversed[inversed == ind_nan] = np.nan 141 | X_[:, i] = inversed 142 | 143 | categorical_features_ = [ 144 | x for x in categorical_features_ if x not in cat_to_num 145 | ] 146 | 147 | logger.info(f"Categorical features: {categorical_features_}") 148 | logger.info(f"Mapping of categories: {cat_mapping}") 149 | 150 | if not np.issubdtype(X_.dtype, "float64"): 151 | X_ = X_.astype(dtype="float64", copy=False) 152 | flat_data = X_.ravel(order="F") 153 | rows, cols = X_.shape 154 | 155 | if isinstance(categorical_features_, list): 156 | categorical_features_ = set(categorical_features_) 157 | 158 | return features_, flat_data, rows, cols, categorical_features_, cat_mapping 159 | 160 | 161 | def transform_input_frame(X, cat_mapping) -> Tuple[List[str], np.ndarray, int, int]: 162 | """Convert data to format needed by booster. 
163 | 164 | Returns: 165 | Tuple[List[str], np.ndarray, int, int]: Return column names, the flat data, number of rows, the number of columns 166 | """ 167 | if type_df(X) == "pandas_df": 168 | X_ = X.to_numpy() 169 | features_ = X.columns.to_list() 170 | elif type_df(X) == "polars_df": 171 | try: 172 | X_ = X.to_numpy(allow_copy=False) 173 | except RuntimeError: 174 | X_ = X.to_numpy(allow_copy=True) 175 | features_ = X.columns 176 | else: 177 | # Assume it's a numpy array. 178 | X_ = X 179 | features_ = list(map(str, range(X_.shape[1]))) 180 | 181 | if cat_mapping: 182 | for feature_name, categories in cat_mapping.items(): 183 | feature_index = features_.index(feature_name) 184 | cats = categories.copy() 185 | cats.remove("nan") 186 | x_enc = np.searchsorted(cats, X_[:, feature_index].astype(str)) 187 | x_enc = x_enc + 1.0 188 | ind_nan = len(categories) 189 | x_enc[x_enc == ind_nan] = np.nan 190 | X_[:, feature_index] = x_enc 191 | 192 | if not np.issubdtype(X_.dtype, "float64"): 193 | X_ = X_.astype(dtype="float64", copy=False) 194 | flat_data = X_.ravel(order="F") 195 | rows, cols = X_.shape 196 | 197 | return features_, flat_data, rows, cols 198 | 199 | 200 | CONTRIBUTION_METHODS = { 201 | "weight": "Weight", 202 | "Weight": "Weight", 203 | "average": "Average", 204 | "Average": "Average", 205 | "branch-difference": "BranchDifference", 206 | "branchdifference": "BranchDifference", 207 | "BranchDifference": "BranchDifference", 208 | "midpoint-difference": "MidpointDifference", 209 | "midpointdifference": "MidpointDifference", 210 | "MidpointDifference": "MidpointDifference", 211 | "mode-difference": "ModeDifference", 212 | "modedifference": "ModeDifference", 213 | "ModeDifference": "ModeDifference", 214 | "ProbabilityChange": "ProbabilityChange", 215 | "probabilitychange": "ProbabilityChange", 216 | "probability-change": "ProbabilityChange", 217 | } 218 | -------------------------------------------------------------------------------- /python-package/src/lib.rs: -------------------------------------------------------------------------------- 1 | mod booster; 2 | mod multi_output; 3 | mod utils; 4 | 5 | use crate::booster::PerpetualBooster; 6 | use crate::multi_output::MultiOutputBooster; 7 | use crate::utils::percentiles; 8 | use crate::utils::print_matrix; 9 | use pyo3::prelude::*; 10 | 11 | #[pymodule] 12 | fn perpetual(_py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { 13 | pyo3_log::init(); 14 | 15 | m.add_function(wrap_pyfunction!(print_matrix, m)?)?; 16 | m.add_function(wrap_pyfunction!(percentiles, m)?)?; 17 | 18 | m.add_class::()?; 19 | m.add_class::()?; 20 | 21 | Ok(()) 22 | } 23 | -------------------------------------------------------------------------------- /python-package/src/utils.rs: -------------------------------------------------------------------------------- 1 | use numpy::IntoPyArray; 2 | use numpy::PyArray1; 3 | use numpy::PyReadonlyArray1; 4 | use perpetual_rs::constraints::{Constraint, ConstraintMap}; 5 | use perpetual_rs::data::Matrix; 6 | use perpetual_rs::utils::percentiles as crate_percentiles; 7 | use pyo3::exceptions::PyValueError; 8 | use pyo3::prelude::*; 9 | use std::collections::HashMap; 10 | 11 | pub fn int_map_to_constraint_map(int_map: HashMap) -> PyResult { 12 | let mut constraints: ConstraintMap = HashMap::new(); 13 | for (f, c) in int_map.iter() { 14 | let c_ = match c { 15 | -1 => Ok(Constraint::Negative), 16 | 1 => Ok(Constraint::Positive), 17 | 0 => Ok(Constraint::Unconstrained), 18 | _ => Err(PyValueError::new_err(format!( 19 | 
"Valid monotone constraints are -1, 1 or 0, but '{}' was provided for feature number {}.", 20 | c, f 21 | ))), 22 | }?; 23 | constraints.insert(*f, c_); 24 | } 25 | Ok(constraints) 26 | } 27 | 28 | pub fn to_value_error(value: Result) -> Result { 29 | match value { 30 | Ok(v) => Ok(v), 31 | Err(e) => Err(PyValueError::new_err(e.to_string())), 32 | } 33 | } 34 | #[pyfunction] 35 | pub fn print_matrix(x: PyReadonlyArray1, rows: usize, cols: usize) -> PyResult<()> { 36 | let m = Matrix::new(x.as_slice()?, rows, cols); 37 | println!("{}", m); 38 | Ok(()) 39 | } 40 | 41 | #[pyfunction] 42 | pub fn percentiles<'py>( 43 | py: Python<'py>, 44 | v: PyReadonlyArray1, 45 | sample_weight: PyReadonlyArray1, 46 | percentiles: PyReadonlyArray1, 47 | ) -> PyResult>> { 48 | let v_ = v.as_slice()?; 49 | let sample_weight_ = sample_weight.as_slice()?; 50 | let percentiles_ = percentiles.as_slice()?; 51 | let p = crate_percentiles(v_, sample_weight_, percentiles_); 52 | Ok(p.into_pyarray_bound(py)) 53 | } 54 | -------------------------------------------------------------------------------- /python-package/tests/test_multi_output.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from perpetual import PerpetualBooster 4 | 5 | 6 | def test_multi_output(): 7 | X = pd.read_csv("../resources/cover_types_train.csv", index_col=False) 8 | X = X.sample(n=10000, random_state=0) 9 | y = np.array(X.pop("Cover_Type")) 10 | X_test = pd.read_csv("../resources/cover_types_test.csv", index_col=False) 11 | y_test = np.array(X_test.pop("Cover_Type")) 12 | model = PerpetualBooster(iteration_limit=40, memory_limit=1.0) 13 | model.fit(X, y) 14 | pred_test = model.predict(X_test) 15 | proba_test = model.predict_proba(X_test) 16 | log_odds_test = model.predict_log_proba(X_test) 17 | assert not np.isnan(pred_test).any() 18 | assert not np.isnan(proba_test).any() 19 | assert not np.isnan(log_odds_test).any() 20 | assert np.allclose(np.sum(proba_test, axis=1), np.ones(proba_test.shape[0])) 21 | assert np.allclose(proba_test.shape, (len(X_test), len(np.unique(y_test)))) 22 | assert set(y_test) == set(pred_test) 23 | -------------------------------------------------------------------------------- /python-package/tests/test_save_load.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | import pandas as pd 4 | import pytest 5 | import perpetual 6 | from perpetual import PerpetualBooster 7 | 8 | 9 | def X_y_so(): 10 | df = pd.read_csv("../resources/titanic.csv") 11 | X = df.select_dtypes("number").drop(columns="survived").reset_index(drop=True) 12 | y = df["survived"] 13 | return X, y 14 | 15 | 16 | def X_y_mo(): 17 | X = pd.read_csv("../resources/cover_types_train.csv", index_col=False) 18 | X = X.sample(n=1000, random_state=0) 19 | X.dropna(inplace=True) 20 | X = X.loc[:, (X != X.iloc[0]).any()] 21 | y = X.pop("Cover_Type") 22 | return X, y 23 | 24 | 25 | def pickle_booster(model: PerpetualBooster, path: str) -> None: 26 | with open(path, "wb") as file: 27 | pickle.dump(model, file) 28 | 29 | 30 | def unpickle_booster(path: str) -> PerpetualBooster: 31 | with open(path, "rb") as file: 32 | return pickle.load(file) 33 | 34 | 35 | def save_booster(model: PerpetualBooster, path: str) -> None: 36 | model.save_booster(path) 37 | 38 | 39 | def load_booster(path: str) -> PerpetualBooster: 40 | return PerpetualBooster.load_booster(path) 41 | 42 | 43 | @pytest.mark.parametrize("X_y", 
[X_y_mo, X_y_so]) 44 | @pytest.mark.parametrize( 45 | "load_func,save_func", 46 | [(unpickle_booster, pickle_booster), (load_booster, save_booster)], 47 | ) 48 | class TestSaveLoadFunctions: 49 | def test_booster_metadata(self, X_y, tmp_path, load_func, save_func): 50 | f64_model_path = tmp_path / "modelf64_sl.json" 51 | X, y = X_y() 52 | model = PerpetualBooster( 53 | objective="SquaredLoss", iteration_limit=10, memory_limit=1.0 54 | ) 55 | save_func(model, f64_model_path) 56 | model.json_dump() 57 | model.fit(X, y) 58 | preds = model.predict(X) 59 | save_func(model, f64_model_path) 60 | model.insert_metadata("test-info", "some-info") 61 | assert model.get_metadata("test-info") == "some-info" 62 | save_func(model, f64_model_path) 63 | 64 | loaded = load_func(f64_model_path) 65 | assert loaded.get_metadata("test-info") == "some-info" 66 | 67 | with pytest.raises(KeyError): 68 | loaded.get_metadata("No-key") 69 | 70 | loaded_dict = loaded.__dict__ 71 | model_dict = model.__dict__ 72 | 73 | assert sorted(loaded_dict.keys()) == sorted(model_dict.keys()) 74 | for k, v in loaded_dict.items(): 75 | c_v = model_dict[k] 76 | if isinstance(v, float): 77 | if np.isnan(v): 78 | assert np.isnan(c_v) 79 | else: 80 | assert np.allclose(v, c_v) 81 | elif isinstance(v, perpetual.booster.CratePerpetualBooster) or isinstance( 82 | v, perpetual.booster.CrateMultiOutputBooster 83 | ): 84 | assert isinstance( 85 | c_v, perpetual.booster.CratePerpetualBooster 86 | ) or isinstance(v, perpetual.booster.CrateMultiOutputBooster) 87 | else: 88 | print("else_block:") 89 | print(k) 90 | print(v) 91 | print(c_v) 92 | assert v == c_v, k 93 | loaded_preds = loaded.predict(X) 94 | assert np.allclose(preds, loaded_preds) 95 | 96 | def test_booster_saving(self, X_y, tmp_path, load_func, save_func): 97 | # SquaredLoss 98 | f64_model_path = tmp_path / "modelf64_sl.json" 99 | X, y = X_y() 100 | X = X 101 | model = PerpetualBooster( 102 | objective="SquaredLoss", iteration_limit=10, memory_limit=1.0 103 | ) 104 | model.fit(X, y) 105 | preds = model.predict(X) 106 | save_func(model, f64_model_path) 107 | model_loaded = load_func(f64_model_path) 108 | assert all(preds == model_loaded.predict(X)) 109 | 110 | # LogLoss 111 | f64_model_path = tmp_path / "modelf64_ll.json" 112 | X, y = X_y() 113 | model = PerpetualBooster( 114 | objective="LogLoss", iteration_limit=10, memory_limit=1.0 115 | ) 116 | model.fit(X, y) 117 | preds = model.predict(X) 118 | save_func(model, f64_model_path) 119 | model_loaded = load_func(f64_model_path) 120 | assert model_loaded.feature_names_in_ == model.feature_names_in_ 121 | assert model_loaded.feature_names_in_ == X.columns.to_list() 122 | assert all(preds == model_loaded.predict(X)) 123 | 124 | def test_booster_saving_with_monotone_constraints( 125 | self, X_y, tmp_path, load_func, save_func 126 | ): 127 | # squared loss 128 | f64_model_path = tmp_path / "modelf64_sl.json" 129 | X, y = X_y() 130 | 131 | def calculate_monotonicity(x, y): 132 | correlation = x.corr(y) 133 | if np.isnan(correlation): 134 | return 0 # Or another appropriate default value 135 | else: 136 | return int(np.sign(correlation)) 137 | 138 | mono_ = X.apply(lambda x: calculate_monotonicity(x, y)).to_dict() 139 | 140 | model = PerpetualBooster( 141 | objective="SquaredLoss", 142 | monotone_constraints=mono_, 143 | iteration_limit=10, 144 | memory_limit=1.0, 145 | ) 146 | model.fit(X, y) 147 | preds = model.predict(X) 148 | save_func(model, f64_model_path) 149 | model_loaded = load_func(f64_model_path) 150 | assert 
model_loaded.feature_names_in_ == model.feature_names_in_ 151 | assert model_loaded.feature_names_in_ == X.columns.to_list() 152 | assert all(preds == model_loaded.predict(X)) 153 | assert all( 154 | [ 155 | model.monotone_constraints[ft] == model_loaded.monotone_constraints[ft] 156 | for ft in model_loaded.feature_names_in_ 157 | ] 158 | ) 159 | assert all( 160 | [ 161 | model.monotone_constraints[ft] == model_loaded.monotone_constraints[ft] 162 | for ft in model.feature_names_in_ 163 | ] 164 | ) 165 | assert all( 166 | [ 167 | model.monotone_constraints[ft] == model_loaded.monotone_constraints[ft] 168 | for ft in mono_.keys() 169 | ] 170 | ) 171 | 172 | # LogLoss 173 | f64_model_path = tmp_path / "modelf64_ll.json" 174 | X, y = X_y() 175 | X = X 176 | model = PerpetualBooster( 177 | objective="LogLoss", 178 | monotone_constraints=mono_, 179 | iteration_limit=10, 180 | memory_limit=1.0, 181 | ) 182 | model.fit(X, y) 183 | preds = model.predict(X) 184 | save_func(model, f64_model_path) 185 | model_loaded = load_func(f64_model_path) 186 | assert all(preds == model_loaded.predict(X)) 187 | -------------------------------------------------------------------------------- /python-package/tests/test_serialize.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import numpy as np 4 | import pytest 5 | 6 | from perpetual.serialize import ( 7 | NumpySerializer, 8 | ObjectItem, 9 | ObjectSerializer, 10 | Scaler, 11 | ScalerSerializer, 12 | ) 13 | 14 | scaler_values = [ 15 | 1, 16 | 1.0, 17 | 1.00101, 18 | "a string", 19 | True, 20 | False, 21 | None, 22 | ] 23 | 24 | 25 | @pytest.mark.parametrize("value", scaler_values) 26 | def test_scaler(value: Scaler): 27 | serializer = ScalerSerializer() 28 | r = serializer.serialize(value) 29 | assert isinstance(r, str) 30 | assert value == serializer.deserialize(r) 31 | 32 | 33 | object_values = [ 34 | [1, 2, 3], 35 | [1.0, 4.0], 36 | ["a", "b", "c"], 37 | {"a": 1.0, "b": 2.0}, 38 | {"a": "test", "b": "what"}, 39 | *scaler_values, 40 | ] 41 | 42 | 43 | @pytest.mark.parametrize("value", object_values) 44 | def test_object(value: ObjectItem): 45 | serializer = ObjectSerializer() 46 | r = serializer.serialize(value) 47 | assert isinstance(r, str) 48 | assert value == serializer.deserialize(r) 49 | 50 | 51 | numpy_values = [ 52 | np.array([1.0, 2.23]), 53 | np.array([1, 2, 3, 4, 5, 6]).reshape((2, 3)), 54 | np.array([1, 2, 3, 4, 5, 6], dtype="int").reshape((2, 3)), 55 | ] 56 | 57 | 58 | @pytest.mark.parametrize("value", numpy_values) 59 | def test_numpy(value: np.ndarray): 60 | serializer = NumpySerializer() 61 | r = serializer.serialize(value) 62 | assert isinstance(r, str) 63 | assert np.array_equal(value, serializer.deserialize(r)) 64 | -------------------------------------------------------------------------------- /resources/perp_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/perpetual-ml/perpetual/581262534205b6bc8fd85694359a33c8983e8918/resources/perp_logo.png -------------------------------------------------------------------------------- /rust-toolchain: -------------------------------------------------------------------------------- 1 | nightly 2 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | max_width = 120 
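A note on the constraint helper shown in python-package/src/utils.rs near the top of this section: it converts the per-feature integers passed from Python (-1, 0, 1) into the crate's Constraint enum. Below is a minimal standalone sketch of that mapping, assuming the enum and map type defined later in src/constraints.rs; the helper name and plain-String error handling here are illustrative, not the actual PyO3 wrapper:

use std::collections::HashMap;

// Mirrors src/constraints.rs: the direction a feature is allowed to push the prediction.
#[derive(Debug, Clone, Copy)]
enum Constraint {
    Positive,
    Negative,
    Unconstrained,
}

type ConstraintMap = HashMap<usize, Constraint>;

// Hypothetical helper: turn {feature_index: -1 | 0 | 1} into a ConstraintMap,
// rejecting anything else (the real binding raises a PyValueError instead).
fn to_constraint_map(raw: &HashMap<usize, i8>) -> Result<ConstraintMap, String> {
    let mut constraints = ConstraintMap::new();
    for (feature, value) in raw {
        let c = match *value {
            -1 => Constraint::Negative,
            0 => Constraint::Unconstrained,
            1 => Constraint::Positive,
            other => {
                return Err(format!(
                    "Valid monotone constraints are -1, 1 or 0, but '{}' was provided for feature number {}.",
                    other, feature
                ))
            }
        };
        constraints.insert(*feature, c);
    }
    Ok(constraints)
}

fn main() {
    let raw = HashMap::from([(0usize, 1i8), (3, -1), (5, 0)]);
    println!("{:?}", to_constraint_map(&raw).unwrap());
}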
-------------------------------------------------------------------------------- /scripts/make_resources.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import seaborn as sns 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.datasets import fetch_california_housing, fetch_covtype, fetch_openml 5 | from perpetual.utils import convert_input_frame, transform_input_frame 6 | 7 | if __name__ == "__main__": 8 | df = sns.load_dataset("titanic") 9 | df.to_csv("resources/titanic.csv", index=False) 10 | 11 | X = df.select_dtypes("number").drop(columns=["survived"]).astype(float) 12 | y = df["survived"].astype(float) 13 | 14 | pd.Series(X.fillna(0).to_numpy().ravel(order="F")).to_csv( 15 | "resources/contiguous_no_missing.csv", 16 | index=False, 17 | header=False, 18 | ) 19 | 20 | pd.Series(X.to_numpy().ravel(order="F")).to_csv( 21 | "resources/contiguous_with_missing.csv", 22 | index=False, 23 | header=False, 24 | ) 25 | 26 | y.to_csv( 27 | "resources/performance.csv", 28 | index=False, 29 | header=False, 30 | ) 31 | 32 | X.fare.to_csv( 33 | "resources/performance-fare.csv", 34 | index=False, 35 | header=False, 36 | ) 37 | 38 | dfb = df.sample( 39 | 100_000, 40 | random_state=0, 41 | replace=True, 42 | ).reset_index(drop=True) 43 | 44 | Xb = dfb.select_dtypes("number").drop(columns=["survived"]).astype(float) 45 | yb = dfb["survived"].astype(float) 46 | 47 | pd.Series(Xb.fillna(0).to_numpy().ravel(order="F")).to_csv( 48 | "resources/contiguous_no_missing_100k_samp_seed0.csv", 49 | index=False, 50 | header=False, 51 | ) 52 | 53 | yb.to_csv( 54 | "resources/performance_100k_samp_seed0.csv", 55 | index=False, 56 | header=False, 57 | ) 58 | 59 | data = fetch_california_housing(as_frame=True) 60 | data_train, data_test = train_test_split(data.frame, test_size=0.2, random_state=42) 61 | data_train.to_csv("resources/cal_housing_train.csv", index=False) 62 | data_test.to_csv("resources/cal_housing_test.csv", index=False) 63 | 64 | data = fetch_covtype(as_frame=True) 65 | data_train, data_test = train_test_split(data.frame, test_size=0.2, random_state=42) 66 | data_train.to_csv("resources/cover_types_train.csv", index=False) 67 | data_test.to_csv("resources/cover_types_test.csv", index=False) 68 | 69 | 70 | 71 | 72 | 73 | X = df.drop(columns=["survived"]) 74 | y = df["survived"] 75 | 76 | X["sex"] = pd.get_dummies(X["sex"], drop_first=True, dtype=float).to_numpy() 77 | X["adult_male"] = pd.get_dummies(X["adult_male"], drop_first=True, dtype=float).to_numpy() 78 | X.drop(columns=["alive"], inplace=True) 79 | X["alone"] = pd.get_dummies(X["alone"], drop_first=True, dtype=float).to_numpy() 80 | cols = ['pclass', 'sibsp', 'parch', 'embarked', 'class', 'who', 'deck', 'embark_town'] 81 | X[cols] = X[cols].astype('category') 82 | 83 | data_train, data_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 84 | 85 | features_, titanic_train_flat, rows, cols, categorical_features_, cat_mapping = convert_input_frame(data_train, "auto", 1000) 86 | features_, titanic_test_flat, rows, cols = transform_input_frame(data_test, cat_mapping) 87 | 88 | data_test.to_csv("resources/titanic_test_df.csv", index=False) 89 | 90 | pd.Series(titanic_train_flat).to_csv("resources/titanic_train_flat.csv", index=False, header=False) 91 | pd.Series(titanic_test_flat).to_csv("resources/titanic_test_flat.csv", index=False, header=False) 92 | pd.Series(y_train).to_csv("resources/titanic_train_y.csv", index=False, header=False) 93 
| pd.Series(y_test).to_csv("resources/titanic_test_y.csv", index=False, header=False) 94 | 95 | 96 | # https://www.openml.org/search?type=data&id=546&sort=runs&status=active 97 | df = fetch_openml(data_id=546) 98 | X = df.data 99 | y = df.target 100 | features_, sensory_flat, rows, cols, categorical_features_, cat_mapping = convert_input_frame(X, "auto", 1000) 101 | pd.Series(sensory_flat).to_csv("resources/sensory_flat.csv", index=False, header=False) 102 | pd.Series(y).to_csv("resources/sensory_y.csv", index=False, header=False) 103 | -------------------------------------------------------------------------------- /scripts/remove-optional-deps.py: -------------------------------------------------------------------------------- 1 | # Building with polars is sooo slow. 2 | # It's only there for the example, so let's remove it 3 | # in the regular build process. 4 | # Requires toml package 5 | import shutil 6 | 7 | import toml 8 | 9 | ct = toml.load("Cargo.toml") 10 | 11 | del ct["dev-dependencies"] 12 | del ct["bench"] 13 | 14 | with open("Cargo.toml", "w") as file: 15 | toml.dump(ct, file) 16 | 17 | # Also delete the rust example. 18 | shutil.rmtree("examples") 19 | -------------------------------------------------------------------------------- /scripts/run-python-tests.ps1: -------------------------------------------------------------------------------- 1 | Set-Location python-package 2 | python -m black python/perpetual/ 3 | python -m black tests/ 4 | python -m black examples/ 5 | maturin develop --release 6 | pytest . 7 | Set-Location .. -------------------------------------------------------------------------------- /scripts/run-python-tests.sh: -------------------------------------------------------------------------------- 1 | cd python-package 2 | python -m black python/perpetual/ 3 | python -m black tests/ 4 | python -m black examples/ 5 | maturin develop --release 6 | pytest . 7 | cd .. -------------------------------------------------------------------------------- /scripts/run-single-python-test.ps1: -------------------------------------------------------------------------------- 1 | Set-Location python-package 2 | python -m black python/perpetual/ 3 | python -m black tests/ 4 | python -m black examples/ 5 | maturin develop --release 6 | pytest tests/test_booster.py::test_predict_nodes -s 7 | Set-Location .. -------------------------------------------------------------------------------- /scripts/uv_script.ps1: -------------------------------------------------------------------------------- 1 | Set-Location python-package 2 | uv sync 3 | .venv\Scripts\activate 4 | uv pip install pip 5 | uv pip install -r pyproject.toml --extra dev 6 | Set-Location .. 7 | -------------------------------------------------------------------------------- /scripts/uv_script.sh: -------------------------------------------------------------------------------- 1 | cd python-package 2 | uv sync 3 | source .venv\Scripts\activate 4 | uv pip install pip 5 | uv pip install -r pyproject.toml --extra dev 6 | cd .. 7 | -------------------------------------------------------------------------------- /src/bin.rs: -------------------------------------------------------------------------------- 1 | use std::{cell::UnsafeCell, cmp::Ordering}; 2 | 3 | use crate::data::FloatData; 4 | 5 | use serde::{Deserialize, Serialize}; 6 | 7 | /// Struct to hold the information of a given bin. 
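// Aside: scripts/make_resources.py above flattens each feature frame in column-major
// ("F") order, which is exactly the layout Matrix::new expects. A minimal usage sketch
// of reading one of those flat files back and binning it, mirroring the binning test
// further below; it assumes the perpetual crate is available and the program is run
// from the repository root, and the 891 rows x 5 columns come from the titanic resource:

use std::fs;

use perpetual::binning::bin_matrix;
use perpetual::Matrix;

fn main() {
    // one value per line, column after column, as written by the script above
    let file = fs::read_to_string("resources/contiguous_no_missing.csv").unwrap();
    let data_vec: Vec<f64> = file.lines().map(|x| x.parse::<f64>().unwrap()).collect();

    // 891 rows and 5 numeric titanic features, stored contiguously column by column
    let data = Matrix::new(&data_vec, 891, 5);

    // bin every column into at most 10 buckets; NaN marks missing values and
    // no columns are treated as categorical here
    let binned = bin_matrix(&data, None, 10, f64::NAN, None).unwrap();
    println!("cut values per column: {:?}", binned.nunique);
}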
8 | #[derive(Debug, Deserialize, Serialize, Clone)] 9 | pub struct Bin { 10 | pub num: u16, 11 | pub cut_value: f64, 12 | pub g_folded: [f32; 5], 13 | pub h_folded: Option<[f32; 5]>, 14 | pub counts: [usize; 5], 15 | } 16 | 17 | impl Bin { 18 | pub fn empty_const_hess(num: u16, cut_value: f64) -> Self { 19 | Bin { 20 | num, 21 | cut_value, 22 | g_folded: [f32::ZERO; 5], 23 | h_folded: None, 24 | counts: [0; 5], 25 | } 26 | } 27 | pub fn empty(num: u16, cut_value: f64) -> Self { 28 | Bin { 29 | num, 30 | cut_value, 31 | g_folded: [f32::ZERO; 5], 32 | h_folded: Some([f32::ZERO; 5]), 33 | counts: [0; 5], 34 | } 35 | } 36 | 37 | pub fn from_parent_child(root_bin: *mut Bin, child_bin: *mut Bin, update_bin: *mut Bin) { 38 | let rb = unsafe { root_bin.as_ref().unwrap() }; 39 | let cb = unsafe { child_bin.as_ref().unwrap() }; 40 | let ub = unsafe { update_bin.as_mut().unwrap() }; 41 | for ((z, a), b) in ub.g_folded.iter_mut().zip(rb.g_folded).zip(cb.g_folded) { 42 | *z = a - b; 43 | } 44 | for ((z, a), b) in ub.counts.iter_mut().zip(rb.counts).zip(cb.counts) { 45 | *z = a - b; 46 | } 47 | 48 | match rb.h_folded { 49 | Some(_h_folded) => { 50 | let h_f_iter = ub.h_folded.as_mut().unwrap().iter_mut(); 51 | for ((zval, aval), bval) in h_f_iter.zip(rb.h_folded.unwrap()).zip(cb.h_folded.unwrap()) { 52 | *zval = aval - bval; 53 | } 54 | } 55 | None => { 56 | ub.h_folded = None; 57 | } 58 | }; 59 | } 60 | 61 | pub fn from_parent_two_children( 62 | root_bin: *mut Bin, 63 | first_bin: *mut Bin, 64 | second_bin: *mut Bin, 65 | update_bin: *mut Bin, 66 | ) { 67 | let rb = unsafe { root_bin.as_ref().unwrap() }; 68 | let fb = unsafe { first_bin.as_ref().unwrap() }; 69 | let sb = unsafe { second_bin.as_ref().unwrap() }; 70 | let ub = unsafe { update_bin.as_mut().unwrap() }; 71 | for (((z, a), b), c) in ub 72 | .g_folded 73 | .iter_mut() 74 | .zip(rb.g_folded) 75 | .zip(fb.g_folded) 76 | .zip(sb.g_folded) 77 | { 78 | *z = a - b - c; 79 | } 80 | for (((z, a), b), c) in ub.counts.iter_mut().zip(rb.counts).zip(fb.counts).zip(sb.counts) { 81 | *z = a - b - c; 82 | } 83 | 84 | match rb.h_folded { 85 | Some(_h_folded) => { 86 | let h_f_iter = ub.h_folded.as_mut().unwrap().iter_mut(); 87 | for (((z, a), b), c) in h_f_iter 88 | .zip(rb.h_folded.unwrap()) 89 | .zip(fb.h_folded.unwrap()) 90 | .zip(sb.h_folded.unwrap()) 91 | { 92 | *z = a - b - c; 93 | } 94 | } 95 | None => { 96 | ub.h_folded = None; 97 | } 98 | }; 99 | } 100 | } 101 | 102 | pub fn sort_cat_bins_by_num(histogram: &mut [&UnsafeCell]) { 103 | unsafe { 104 | histogram.sort_unstable_by_key(|bin| bin.get().as_ref().unwrap().num); 105 | } 106 | } 107 | 108 | pub fn sort_cat_bins_by_stat(histogram: &mut [&UnsafeCell], is_const_hess: bool) { 109 | unsafe { 110 | if is_const_hess { 111 | histogram.sort_unstable_by(|bin1, bin2| { 112 | let b1 = bin1.get().as_ref().unwrap(); 113 | let b2 = bin2.get().as_ref().unwrap(); 114 | if b1.num == 0 { 115 | return Ordering::Less; 116 | } else if b2.num == 0 { 117 | return Ordering::Greater; 118 | } 119 | let div1: f32 = b1.g_folded.iter().sum::() / b1.counts.iter().sum::() as f32; 120 | let div2: f32 = b2.g_folded.iter().sum::() / b2.counts.iter().sum::() as f32; 121 | div2.partial_cmp(&div1).unwrap_or(Ordering::Less) 122 | }); 123 | } else { 124 | histogram.sort_unstable_by(|bin1, bin2| { 125 | let b1 = bin1.get().as_ref().unwrap(); 126 | let b2 = bin2.get().as_ref().unwrap(); 127 | if b1.num == 0 { 128 | return Ordering::Less; 129 | } else if b2.num == 0 { 130 | return Ordering::Greater; 131 | } 132 | let div1: f32 = 
b1.g_folded.iter().sum::() / b1.h_folded.unwrap().iter().sum::(); 133 | let div2: f32 = b2.g_folded.iter().sum::() / b2.h_folded.unwrap().iter().sum::(); 134 | div2.partial_cmp(&div1).unwrap_or(Ordering::Less) 135 | }); 136 | } 137 | } 138 | } 139 | 140 | #[cfg(test)] 141 | mod tests { 142 | use super::*; 143 | 144 | #[test] 145 | fn test_bin() { 146 | let mut root_bin = Bin::empty_const_hess(0, 0.0); 147 | root_bin.counts = [10, 10, 10, 10, 10]; 148 | let mut child_bin = Bin::empty_const_hess(1, 0.0); 149 | child_bin.counts = [9, 8, 7, 6, 5]; 150 | let mut update_bin = Bin::empty_const_hess(2, 0.0); 151 | Bin::from_parent_child( 152 | &mut root_bin as *mut Bin, 153 | &mut child_bin as *mut Bin, 154 | &mut update_bin as *mut Bin, 155 | ); 156 | assert!(update_bin.counts == [1, 2, 3, 4, 5]); 157 | } 158 | } 159 | -------------------------------------------------------------------------------- /src/binning.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashSet; 2 | 3 | use crate::data::{FloatData, JaggedMatrix, Matrix}; 4 | use crate::errors::PerpetualError; 5 | use crate::utils::{is_missing, map_bin, percentiles}; 6 | 7 | /// If there are fewer unique values than their are 8 | /// percentiles, just return the unique values of the 9 | /// vectors. 10 | /// 11 | /// * `v` - A numeric slice to calculate percentiles for. 12 | /// * `sample_weight` - Instance weights for each row in the data. 13 | fn percentiles_or_value(v: &[T], sample_weight: &[T], pcts: &[T]) -> Vec 14 | where 15 | T: FloatData, 16 | { 17 | let mut v_u = v.to_owned(); 18 | v_u.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap()); 19 | v_u.dedup(); 20 | if v_u.len() <= pcts.len() + 1 { 21 | v_u 22 | } else { 23 | percentiles(v, sample_weight, pcts) 24 | } 25 | } 26 | 27 | // We want to be able to bin our dataset into discrete buckets. 28 | // First we will calculate percentiles and the number of unique values 29 | // for each feature. 30 | // Then we will bucket them into bins from 0 to N + 1 where N is the number 31 | // of unique bin values created from the percentiles, and the very last 32 | // bin is missing values. 33 | // For now, we will just use usize, although, it would be good to see if 34 | // we can use something smaller, u8 for instance. 35 | // If we generated these cuts: 36 | // [0.0, 7.8958, 14.4542, 31.0, 512.3292, inf] 37 | // We would have a number with bins 0 (missing), 1 [MIN, 0.0), 2 (0.0, 7], 3 [], 4, 5 38 | // a split that is [feature < 5] would translate to [feature < 31.0 ] 39 | #[derive(Debug)] 40 | pub struct BinnedData { 41 | pub binned_data: Vec, 42 | pub cuts: JaggedMatrix, 43 | pub nunique: Vec, 44 | } 45 | 46 | /// Convert a matrix of data, into a binned matrix. 47 | /// 48 | /// * `data` - Numeric data to be binned. 49 | /// * `cuts` - A slice of Vectors, where the vectors are the corresponding 50 | /// cut values for each of the columns. 51 | fn bin_matrix_from_cuts>(data: &Matrix, cuts: &JaggedMatrix, missing: &T) -> Vec { 52 | // loop through the matrix, binning the data. 53 | // We will determine the column we are in, by 54 | // using the modulo operator, on the record value. 55 | data.data 56 | .iter() 57 | .enumerate() 58 | .map(|(i, v)| { 59 | let col = i / data.rows; 60 | // This will always be smaller than u16::MAX so we 61 | // are good to just unwrap here. 62 | map_bin(cuts.get_col(col), v, missing).unwrap() 63 | }) 64 | .collect() 65 | } 66 | 67 | /// Bin a numeric matrix. 
68 | /// 69 | /// * `data` - A numeric matrix, of data to be binned. 70 | /// * `sample_weight` - Instance weights for each row of the data. 71 | /// * `nbins` - The number of bins each column should be binned into. 72 | /// * `missing` - Float value to consider as missing. 73 | pub fn bin_matrix( 74 | data: &Matrix, 75 | sample_weight: Option<&[f64]>, 76 | nbins: u16, 77 | missing: f64, 78 | cat_index: Option<&HashSet>, 79 | ) -> Result, PerpetualError> { 80 | let mut pcts = Vec::new(); 81 | let nbins_ = f64::from_u16(nbins); 82 | for i in 0..nbins { 83 | let v = f64::from_u16(i) / nbins_; 84 | pcts.push(v); 85 | } 86 | 87 | let s_w = vec![1.0; data.rows]; 88 | let weight = match sample_weight { 89 | Some(sample_weight) => sample_weight, 90 | None => &s_w, 91 | }; 92 | 93 | let to_remove = match cat_index { 94 | Some(cat_index) => HashSet::from_iter(cat_index), 95 | None => HashSet::new(), 96 | }; 97 | let mut num_index: Vec = (0..data.cols).collect(); 98 | num_index.retain(|e| !to_remove.contains(&(*e))); 99 | let num_index_set: HashSet = HashSet::from_iter(num_index); 100 | 101 | // First we need to generate the bins for each of the columns. 102 | // We will loop through all of the columns, and generate the cuts. 103 | let mut cuts = JaggedMatrix::new(); 104 | let mut nunique = Vec::new(); 105 | for i in 0..data.cols { 106 | let (no_miss, w): (Vec, Vec) = data 107 | .get_col(i) 108 | .iter() 109 | .zip(weight.iter()) 110 | // It is unrecoverable if they have provided missing values in 111 | // the data other than the specificized missing. 112 | .filter(|(v, _)| !is_missing(v, &missing)) 113 | .unzip(); 114 | assert_eq!(no_miss.len(), w.len()); 115 | 116 | if num_index_set.contains(&i) { 117 | let mut col_cuts = percentiles_or_value(&no_miss, &w, &pcts); 118 | col_cuts.push(f64::MAX); 119 | col_cuts.dedup(); 120 | // if col_cuts.len() < 2 { 121 | // return Err(PerpetualError::NoVariance(i)); 122 | // } 123 | // There will be one less bins, then there are cuts. 124 | // The first value will be for missing. 125 | nunique.push(col_cuts.len()); 126 | let l = col_cuts.len(); 127 | cuts.data.extend(col_cuts); 128 | let e = match cuts.ends.last() { 129 | Some(v) => v + l, 130 | None => l, 131 | }; 132 | cuts.ends.push(e); 133 | } else { 134 | // There will be number of bins as many as number of categories. Number of bins for categorical features is not limited currently. 
135 | let col_categories: HashSet = HashSet::from_iter(no_miss.iter().map(|&e| e as u16)); 136 | let mut col_cuts: Vec = col_categories.iter().map(|&e| e as f64).collect(); 137 | col_cuts.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap()); 138 | col_cuts.push(f64::MAX); 139 | nunique.push(col_cuts.len()); 140 | let l = col_cuts.len(); 141 | cuts.data.extend(col_cuts); 142 | let e = match cuts.ends.last() { 143 | Some(v) => v + l, 144 | None => l, 145 | }; 146 | cuts.ends.push(e); 147 | } 148 | } 149 | 150 | cuts.cols = cuts.ends.len(); 151 | cuts.n_records = cuts.ends.iter().sum(); 152 | 153 | let binned_data = bin_matrix_from_cuts(data, &cuts, &missing); 154 | 155 | Ok(BinnedData { 156 | binned_data, 157 | cuts, 158 | nunique, 159 | }) 160 | } 161 | 162 | #[cfg(test)] 163 | mod tests { 164 | use super::*; 165 | use std::fs; 166 | #[test] 167 | fn test_bin_data() { 168 | let file = 169 | fs::read_to_string("resources/contiguous_no_missing.csv").expect("Something went wrong reading the file"); 170 | let data_vec: Vec = file.lines().map(|x| x.parse::().unwrap()).collect(); 171 | let data = Matrix::new(&data_vec, 891, 5); 172 | let b = bin_matrix(&data, None, 10, f64::NAN, None).unwrap(); 173 | let bdata = Matrix::new(&b.binned_data, data.rows, data.cols); 174 | for column in 0..data.cols { 175 | let mut b_compare = 1; 176 | for cuts in b.cuts.get_col(column).windows(2) { 177 | let c1 = cuts[0]; 178 | let c2 = cuts[1]; 179 | let mut n_v = 0; 180 | let mut n_b = 0; 181 | for (bin, value) in bdata.get_col(column).iter().zip(data.get_col(column)) { 182 | if *bin == b_compare { 183 | n_b += 1; 184 | } 185 | if (c1 <= *value) && (*value < c2) { 186 | n_v += 1; 187 | } 188 | } 189 | assert_eq!(n_v, n_b); 190 | b_compare += 1; 191 | } 192 | } 193 | println!("{:?}", b); 194 | } 195 | 196 | #[test] 197 | fn test_bin_data_categorical() { 198 | let file = 199 | fs::read_to_string("resources/titanic_train_flat.csv").expect("Something went wrong reading the file"); 200 | let n_rows = 712; 201 | let n_columns = 13; 202 | let n_lines = n_columns * n_rows; 203 | let data_vec: Vec = file 204 | .lines() 205 | .take(n_lines) 206 | .map(|x| x.trim().parse::().unwrap_or(f64::NAN)) 207 | .collect(); 208 | let data = Matrix::new(&data_vec, n_rows, n_columns); 209 | let cat_index = HashSet::from([0, 3, 4, 6, 7, 8, 10, 11]); 210 | 211 | let b = bin_matrix(&data, None, 256, f64::NAN, Some(&cat_index)).unwrap(); 212 | let bdata = Matrix::new(&b.binned_data, data.rows, data.cols); 213 | 214 | println!("{:?}", b.cuts); 215 | println!("{:?}", b.nunique); 216 | 217 | for column in 0..data.cols { 218 | let mut b_compare = 1; 219 | for cuts in b.cuts.get_col(column).windows(2) { 220 | let c1 = cuts[0]; 221 | let c2 = cuts[1]; 222 | let mut n_v = 0; 223 | let mut n_b = 0; 224 | for (bin, value) in bdata.get_col(column).iter().zip(data.get_col(column)) { 225 | if *bin == b_compare { 226 | n_b += 1; 227 | } 228 | if (c1 <= *value) && (*value < c2) { 229 | n_v += 1; 230 | } 231 | } 232 | assert_eq!(n_v, n_b); 233 | b_compare += 1; 234 | } 235 | } 236 | } 237 | } 238 | -------------------------------------------------------------------------------- /src/booster/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod booster; 2 | pub mod multi_output; 3 | pub mod predict; 4 | pub mod setters; 5 | -------------------------------------------------------------------------------- /src/booster/setters.rs: -------------------------------------------------------------------------------- 1 | 
use super::booster::MissingNodeTreatment; 2 | use crate::{constraints::ConstraintMap, objective_functions::Objective, PerpetualBooster}; 3 | use std::collections::HashSet; 4 | 5 | impl PerpetualBooster { 6 | // Set methods for paramters 7 | 8 | /// Set the objective on the booster. 9 | /// * `objective` - The objective type of the booster. 10 | pub fn set_objective(mut self, objective: Objective) -> Self { 11 | self.objective = objective; 12 | self 13 | } 14 | 15 | /// Set the budget on the booster. 16 | /// * `budget` - Budget to fit the booster. 17 | pub fn set_budget(mut self, budget: f32) -> Self { 18 | self.budget = budget; 19 | self 20 | } 21 | 22 | /// Set the base_score on the booster. 23 | /// * `base_score` - The base score of the booster. 24 | pub fn set_base_score(mut self, base_score: f64) -> Self { 25 | self.base_score = base_score; 26 | self 27 | } 28 | 29 | /// Set the number of bins on the booster. 30 | /// * `max_bin` - Number of bins to calculate to partition the data. Setting this to 31 | /// a smaller number, will result in faster training time, while potentially sacrificing 32 | /// accuracy. If there are more bins, than unique values in a column, all unique values 33 | /// will be used. 34 | pub fn set_max_bin(mut self, max_bin: u16) -> Self { 35 | self.max_bin = max_bin; 36 | self 37 | } 38 | 39 | /// Set the number of threads on the booster. 40 | /// * `num_threads` - Set the number of threads to be used during training. 41 | pub fn set_num_threads(mut self, num_threads: Option) -> Self { 42 | self.num_threads = num_threads; 43 | self 44 | } 45 | 46 | /// Set the monotone_constraints on the booster. 47 | /// * `monotone_constraints` - The monotone constraints of the booster. 48 | pub fn set_monotone_constraints(mut self, monotone_constraints: Option) -> Self { 49 | self.monotone_constraints = monotone_constraints; 50 | self 51 | } 52 | 53 | /// Set the force_children_to_bound_parent on the booster. 54 | /// * `force_children_to_bound_parent` - Set force children to bound parent. 55 | pub fn set_force_children_to_bound_parent(mut self, force_children_to_bound_parent: bool) -> Self { 56 | self.force_children_to_bound_parent = force_children_to_bound_parent; 57 | self 58 | } 59 | 60 | /// Set missing value of the booster 61 | /// * `missing` - Float value to consider as missing. 62 | pub fn set_missing(mut self, missing: f64) -> Self { 63 | self.missing = missing; 64 | self 65 | } 66 | 67 | /// Set the allow_missing_splits on the booster. 68 | /// * `allow_missing_splits` - Set if missing splits are allowed for the booster. 69 | pub fn set_allow_missing_splits(mut self, allow_missing_splits: bool) -> Self { 70 | self.allow_missing_splits = allow_missing_splits; 71 | self 72 | } 73 | 74 | /// Set create missing value of the booster 75 | /// * `create_missing_branch` - Bool specifying if missing should get it's own 76 | /// branch. 77 | pub fn set_create_missing_branch(mut self, create_missing_branch: bool) -> Self { 78 | self.create_missing_branch = create_missing_branch; 79 | self 80 | } 81 | 82 | /// Set the features where whose missing nodes should 83 | /// always be terminated. 84 | /// * `terminate_missing_features` - Hashset of the feature indices for the 85 | /// features that should always terminate the missing node, if create_missing_branch 86 | /// is true. 
87 | pub fn set_terminate_missing_features(mut self, terminate_missing_features: HashSet) -> Self { 88 | self.terminate_missing_features = terminate_missing_features; 89 | self 90 | } 91 | 92 | /// Set the missing_node_treatment on the booster. 93 | /// * `missing_node_treatment` - The missing node treatment of the booster. 94 | pub fn set_missing_node_treatment(mut self, missing_node_treatment: MissingNodeTreatment) -> Self { 95 | self.missing_node_treatment = missing_node_treatment; 96 | self 97 | } 98 | 99 | /// Set the log iterations on the booster. 100 | /// * `log_iterations` - The number of log iterations of the booster. 101 | pub fn set_log_iterations(mut self, log_iterations: usize) -> Self { 102 | self.log_iterations = log_iterations; 103 | self 104 | } 105 | 106 | /// Set the log iterations on the booster. 107 | /// * `log_iterations` - The number of log iterations of the booster. 108 | pub fn set_ref_log_iterations(mut self, log_iterations: usize) -> Self { 109 | self.log_iterations = log_iterations; 110 | self 111 | } 112 | 113 | /// Set the seed on the booster. 114 | /// * `seed` - Integer value used to see any randomness used in the algorithm. 115 | pub fn set_seed(mut self, seed: u64) -> Self { 116 | self.seed = seed; 117 | self 118 | } 119 | 120 | /// Set the quantile on the booster. 121 | /// * `quantile` - used only in quantile regression. 122 | pub fn set_quantile(mut self, quantile: Option) -> Self { 123 | self.quantile = quantile; 124 | self 125 | } 126 | 127 | /// Set the reset on the booster. 128 | /// * `reset` - Reset the model or continue training. 129 | pub fn set_reset(mut self, reset: Option) -> Self { 130 | self.reset = reset; 131 | self 132 | } 133 | 134 | /// Set the categorical features on the booster. 135 | /// * `categorical_features` - categorical features. 136 | pub fn set_categorical_features(mut self, categorical_features: Option>) -> Self { 137 | self.categorical_features = categorical_features; 138 | self 139 | } 140 | 141 | /// Set the timeout on the booster. 142 | /// * `timeout` - fit timeout limit in seconds. 143 | pub fn set_timeout(mut self, timeout: Option) -> Self { 144 | self.timeout = timeout; 145 | self 146 | } 147 | 148 | /// Set the iteration limit on the booster. 149 | /// * `iteration_limit` - optional limit for the number of boosting rounds. 150 | pub fn set_iteration_limit(mut self, iteration_limit: Option) -> Self { 151 | self.iteration_limit = iteration_limit; 152 | self 153 | } 154 | 155 | /// Set the memory limit on the booster. 156 | /// * `memory_limit` - optional limit for memory allocation. 157 | pub fn set_memory_limit(mut self, memory_limit: Option) -> Self { 158 | self.memory_limit = memory_limit; 159 | self 160 | } 161 | 162 | /// Set the stopping rounds on the booster. 163 | /// * `stopping_rounds` - optional limit for auto stopping rounds. 
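// Taken together, the setters in this file form a builder-style API: each one consumes
// `self` and returns it, so configuration can be chained before fitting. A short usage
// sketch with arbitrary, illustrative values (the conformal test further below uses the
// same pattern):

use perpetual::objective_functions::Objective;
use perpetual::PerpetualBooster;

fn main() {
    let booster = PerpetualBooster::default()
        .set_objective(Objective::SquaredLoss)
        .set_max_bin(128)
        .set_budget(0.5)
        .set_seed(42);
    // `booster` is now configured and ready for a call to fit(...).
    let _ = booster;
}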
164 | pub fn set_stopping_rounds(mut self, stopping_rounds: Option) -> Self { 165 | self.stopping_rounds = stopping_rounds; 166 | self 167 | } 168 | } 169 | -------------------------------------------------------------------------------- /src/conformal/cqr.rs: -------------------------------------------------------------------------------- 1 | use crate::{errors::PerpetualError, objective_functions::Objective, utils::percentiles, Matrix, PerpetualBooster}; 2 | use std::collections::HashMap; 3 | 4 | pub type CalData<'a> = (Matrix<'a, f64>, &'a [f64], &'a [f64]); // (x_flat_data, rows, cols), y, alpha 5 | 6 | impl PerpetualBooster { 7 | /// Calibrate models to get prediction intervals 8 | /// * `alpha` - Alpha list to train calibration models for 9 | pub fn calibrate( 10 | &mut self, 11 | data: &Matrix, 12 | y: &[f64], 13 | sample_weight: Option<&[f64]>, 14 | data_cal: CalData, 15 | ) -> Result<(), PerpetualError> { 16 | let (x_cal, y_cal, alpha) = data_cal; 17 | 18 | for alpha_ in alpha { 19 | let lower_quantile = Some(alpha_ / 2.0); 20 | let mut model_lower = PerpetualBooster::default() 21 | .set_objective(Objective::QuantileLoss) 22 | .set_quantile(lower_quantile); 23 | model_lower.fit(&data, &y, sample_weight)?; 24 | 25 | let upper_quantile = Some(1.0 - alpha_ / 2.0); 26 | let mut model_upper = PerpetualBooster::default() 27 | .set_objective(Objective::QuantileLoss) 28 | .set_quantile(upper_quantile); 29 | model_upper.fit(&data, &y, sample_weight)?; 30 | 31 | let y_cal_pred_lower = model_lower.predict(&x_cal, true); 32 | let y_cal_pred_upper = model_upper.predict(&x_cal, true); 33 | let mut scores: Vec = Vec::with_capacity(y_cal.len()); 34 | for i in 0..y_cal.len() { 35 | scores.push(f64::max(y_cal_pred_lower[i] - y_cal[i], y_cal[i] - y_cal_pred_upper[i])); 36 | } 37 | let perc = (1.0 - (*alpha_ as f64)) * (1.0 + 1.0 * (1.0 / (scores.len() as f64))); 38 | let score = percentiles(&scores, &vec![1.0; scores.len()], &vec![perc])[0]; 39 | self.cal_models 40 | .insert(alpha_.to_string(), [(model_lower, -score), (model_upper, score)]); 41 | } 42 | Ok(()) 43 | } 44 | 45 | pub fn predict_intervals(&self, data: &Matrix, parallel: bool) -> HashMap>> { 46 | let mut intervals = HashMap::new(); 47 | for (alpha, value) in &self.cal_models { 48 | let (model_lower, score_lower) = &value[0]; 49 | let (model_upper, score_upper) = &value[1]; 50 | let lower_preds = model_lower 51 | .predict(data, parallel) 52 | .iter() 53 | .map(|p| p + score_lower) 54 | .collect(); 55 | let upper_preds = model_upper 56 | .predict(data, parallel) 57 | .iter() 58 | .map(|p| p + score_upper) 59 | .collect(); 60 | intervals.insert(alpha.to_string(), vec![lower_preds, upper_preds]); 61 | } 62 | intervals 63 | } 64 | } 65 | 66 | #[cfg(test)] 67 | mod tests { 68 | use super::*; 69 | use crate::objective_functions::Objective; 70 | use polars::io::SerReader; 71 | use polars::prelude::{CsvReadOptions, DataType}; 72 | use std::error::Error; 73 | use std::sync::Arc; 74 | 75 | #[test] 76 | fn test_cqr() -> Result<(), Box> { 77 | let all_names = [ 78 | "MedInc".to_string(), 79 | "HouseAge".to_string(), 80 | "AveRooms".to_string(), 81 | "AveBedrms".to_string(), 82 | "Population".to_string(), 83 | "AveOccup".to_string(), 84 | "Latitude".to_string(), 85 | "Longitude".to_string(), 86 | "MedHouseVal".to_string(), 87 | ]; 88 | 89 | let feature_names = [ 90 | "MedInc".to_string(), 91 | "HouseAge".to_string(), 92 | "AveRooms".to_string(), 93 | "AveBedrms".to_string(), 94 | "Population".to_string(), 95 | "AveOccup".to_string(), 96 | 
"Latitude".to_string(), 97 | "Longitude".to_string(), 98 | ]; 99 | 100 | let column_names_train = Arc::new(all_names.clone()); 101 | let column_names_test = Arc::new(all_names.clone()); 102 | 103 | let df_train = CsvReadOptions::default() 104 | .with_has_header(true) 105 | .with_columns(Some(column_names_train)) 106 | .try_into_reader_with_file_path(Some("resources/cal_housing_train.csv".into()))? 107 | .finish() 108 | .unwrap(); 109 | 110 | let df_test = CsvReadOptions::default() 111 | .with_has_header(true) 112 | .with_columns(Some(column_names_test)) 113 | .try_into_reader_with_file_path(Some("resources/cal_housing_test.csv".into()))? 114 | .finish() 115 | .unwrap(); 116 | 117 | // Get data in column major format... 118 | let id_vars_train: Vec<&str> = Vec::new(); 119 | let mdf_train = df_train.unpivot(feature_names.clone(), &id_vars_train)?; 120 | let id_vars_test: Vec<&str> = Vec::new(); 121 | let mdf_test = df_test.unpivot(feature_names, &id_vars_test)?; 122 | 123 | let data_train = Vec::from_iter( 124 | mdf_train 125 | .select_at_idx(1) 126 | .expect("Invalid column") 127 | .f64()? 128 | .into_iter() 129 | .map(|v| v.unwrap_or(f64::NAN)), 130 | ); 131 | let data_test = Vec::from_iter( 132 | mdf_test 133 | .select_at_idx(1) 134 | .expect("Invalid column") 135 | .f64()? 136 | .into_iter() 137 | .map(|v| v.unwrap_or(f64::NAN)), 138 | ); 139 | 140 | let y_train = Vec::from_iter( 141 | df_train 142 | .column("MedHouseVal")? 143 | .cast(&DataType::Float64)? 144 | .f64()? 145 | .into_iter() 146 | .map(|v| v.unwrap_or(f64::NAN)), 147 | ); 148 | let y_test = Vec::from_iter( 149 | df_test 150 | .column("MedHouseVal")? 151 | .cast(&DataType::Float64)? 152 | .f64()? 153 | .into_iter() 154 | .map(|v| v.unwrap_or(f64::NAN)), 155 | ); 156 | 157 | // Create Matrix from ndarray. 
158 | let matrix_train = Matrix::new(&data_train, y_train.len(), 8); 159 | let matrix_test = Matrix::new(&data_test, y_test.len(), 8); 160 | 161 | let mut model = PerpetualBooster::default() 162 | .set_objective(Objective::SquaredLoss) 163 | .set_max_bin(10) 164 | .set_budget(0.1); 165 | 166 | model.fit(&matrix_train, &y_train, None)?; 167 | 168 | let alpha = vec![0.1]; 169 | let data_cal = (matrix_test, y_test.as_slice(), alpha.as_slice()); 170 | 171 | model.calibrate(&matrix_train, &y_train, None, data_cal)?; 172 | 173 | let matrix_test = Matrix::new(&data_test, y_test.len(), 8); 174 | let _intervals = model.predict_intervals(&matrix_test, true); 175 | 176 | Ok(()) 177 | } 178 | } 179 | -------------------------------------------------------------------------------- /src/conformal/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod cqr; 2 | -------------------------------------------------------------------------------- /src/constants.rs: -------------------------------------------------------------------------------- 1 | pub const STOPPING_ROUNDS: usize = 3; 2 | pub const FREE_MEM_ALLOC_FACTOR: f32 = 0.9; 3 | pub const N_NODES_ALLOC_MIN: usize = 100; 4 | pub const N_NODES_ALLOC_MAX: usize = 10000; 5 | pub const ITER_LIMIT: usize = 1000; 6 | pub const GENERALIZATION_THRESHOLD: f32 = 1.0; 7 | pub const GENERALIZATION_THRESHOLD_RELAXED: f32 = 0.99; 8 | pub const MIN_COL_AMOUNT: usize = 40; 9 | pub const HESSIAN_EPS: f32 = 1e-3; 10 | -------------------------------------------------------------------------------- /src/constraints.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | use std::collections::HashMap; 3 | 4 | #[derive(Debug, Deserialize, Serialize, Clone, Copy)] 5 | pub enum Constraint { 6 | Positive, 7 | Negative, 8 | Unconstrained, 9 | } 10 | 11 | pub type ConstraintMap = HashMap; 12 | -------------------------------------------------------------------------------- /src/errors.rs: -------------------------------------------------------------------------------- 1 | use thiserror::Error; 2 | 3 | #[derive(Debug, Error)] 4 | pub enum PerpetualError { 5 | #[error("Feature number {0} has no variance, when missing values are excluded.")] 6 | NoVariance(usize), 7 | #[error("Unable to write model to file: {0}")] 8 | UnableToWrite(String), 9 | #[error("Unable to read model from a file {0}")] 10 | UnableToRead(String), 11 | #[error("The value {0} is set to missing, but a NaN value was found in the data.")] 12 | NANVAlueFound(f64), 13 | #[error("Invalid value {0} passed for {1}, expected one of {2}.")] 14 | ParseString(String, String, String), 15 | /// First value is the name of the parameter, second is expected, third is what was passed. 
16 | #[error("Invalid parameter value passed for {0}, expected {1} but {2} provided.")] 17 | InvalidParameter(String, String, String), 18 | } 19 | -------------------------------------------------------------------------------- /src/grower.rs: -------------------------------------------------------------------------------- 1 | use serde::Deserialize; 2 | use serde::Serialize; 3 | 4 | use crate::node::SplittableNode; 5 | use std::collections::BinaryHeap; 6 | use std::collections::VecDeque; 7 | 8 | pub trait Grower { 9 | fn add_node(&mut self, node: SplittableNode); 10 | fn get_next_node(&mut self) -> SplittableNode; 11 | fn is_empty(&self) -> bool; 12 | } 13 | 14 | impl Grower for BinaryHeap { 15 | fn add_node(&mut self, node: SplittableNode) { 16 | self.push(node); 17 | } 18 | 19 | fn get_next_node(&mut self) -> SplittableNode { 20 | self.pop().expect("Grower should not be empty") 21 | } 22 | 23 | fn is_empty(&self) -> bool { 24 | self.is_empty() 25 | } 26 | } 27 | 28 | impl Grower for VecDeque { 29 | fn add_node(&mut self, node: SplittableNode) { 30 | self.push_front(node); 31 | } 32 | 33 | fn get_next_node(&mut self) -> SplittableNode { 34 | self.pop_back().expect("Grower should not be empty") 35 | } 36 | 37 | fn is_empty(&self) -> bool { 38 | self.is_empty() 39 | } 40 | } 41 | 42 | #[derive(Serialize, Deserialize)] 43 | pub enum GrowPolicy { 44 | DepthWise, 45 | LossGuide, 46 | } 47 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![feature(array_ptr_get)] 2 | 3 | mod node; 4 | mod partial_dependence; 5 | mod shapley; 6 | 7 | // Modules 8 | pub mod objective_functions; 9 | pub mod bin; 10 | pub mod binning; 11 | pub mod booster; 12 | pub mod conformal; 13 | pub mod constants; 14 | pub mod constraints; 15 | pub mod data; 16 | pub mod errors; 17 | pub mod grower; 18 | pub mod histogram; 19 | pub mod metrics; 20 | pub mod prune; 21 | pub mod sampler; 22 | pub mod splitter; 23 | pub mod tree; 24 | pub mod utils; 25 | 26 | // Individual classes, and functions 27 | pub use booster::booster::PerpetualBooster; 28 | pub use booster::multi_output::MultiOutputBooster; 29 | pub use data::Matrix; 30 | -------------------------------------------------------------------------------- /src/metrics/classification/metrics.rs: -------------------------------------------------------------------------------- 1 | use crate::metrics::*; 2 | 3 | pub struct LogLossMetric {} 4 | impl EvaluationMetric for LogLossMetric { 5 | fn calculate_metric(y: &[f64], yhat: &[f64], sample_weight: &[f64], _alpha: Option) -> f64 { 6 | log_loss(y, yhat, sample_weight) 7 | } 8 | fn maximize() -> bool { 9 | false 10 | } 11 | } 12 | 13 | pub struct AUCMetric {} 14 | impl EvaluationMetric for AUCMetric { 15 | fn calculate_metric(y: &[f64], yhat: &[f64], sample_weight: &[f64], _alpha: Option) -> f64 { 16 | roc_auc_score(y, yhat, sample_weight) 17 | } 18 | fn maximize() -> bool { 19 | true 20 | } 21 | } 22 | 23 | pub fn log_loss(y: &[f64], yhat: &[f64], sample_weight: &[f64]) -> f64 { 24 | let mut w_sum = 0.; 25 | let res = y 26 | .iter() 27 | .zip(yhat) 28 | .zip(sample_weight) 29 | .map(|((y_, yhat_), w_)| { 30 | w_sum += *w_; 31 | let yhat_ = f64::ONE / (f64::ONE + (-*yhat_).exp()); 32 | -(*y_ * yhat_.ln() + (f64::ONE - *y_) * ((f64::ONE - yhat_).ln())) * *w_ 33 | }) 34 | .sum::(); 35 | res / w_sum 36 | } 37 | 38 | fn trapezoid_area(x0: f64, x1: f64, y0: f64, y1: f64) -> f64 { 39 | (x0 - x1).abs() * 
(y0 + y1) * 0.5 40 | } 41 | 42 | pub fn roc_auc_score(y: &[f64], yhat: &[f64], sample_weight: &[f64]) -> f64 { 43 | let mut indices = (0..y.len()).collect::>(); 44 | indices.sort_unstable_by(|&a, &b| yhat[b].total_cmp(&yhat[a])); 45 | let mut auc: f64 = 0.0; 46 | 47 | let mut label = y[indices[0]]; 48 | let mut w = sample_weight[indices[0]]; 49 | let mut fp = (1.0 - label) * w; 50 | let mut tp: f64 = label * w; 51 | let mut tp_prev: f64 = 0.0; 52 | let mut fp_prev: f64 = 0.0; 53 | 54 | for i in 1..indices.len() { 55 | if yhat[indices[i]] != yhat[indices[i - 1]] { 56 | auc += trapezoid_area(fp_prev, fp, tp_prev, tp); 57 | tp_prev = tp; 58 | fp_prev = fp; 59 | } 60 | label = y[indices[i]]; 61 | w = sample_weight[indices[i]]; 62 | fp += (1.0 - label) * w; 63 | tp += label * w; 64 | } 65 | 66 | auc += trapezoid_area(fp_prev, fp, tp_prev, tp); 67 | if fp <= 0.0 || tp <= 0.0 { 68 | auc = 0.0; 69 | fp = 0.0; 70 | tp = 0.0; 71 | } 72 | 73 | auc / (tp * fp) 74 | } 75 | 76 | -------------------------------------------------------------------------------- /src/metrics/classification/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod metrics; 2 | pub use metrics::*; -------------------------------------------------------------------------------- /src/metrics/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod classification; 2 | pub mod regression; 3 | 4 | use crate::data::FloatData; 5 | use crate::errors::PerpetualError; 6 | use crate::utils::items_to_strings; 7 | use serde::{Deserialize, Serialize}; 8 | use std::str::FromStr; 9 | 10 | pub type MetricFn = fn(&[f64], &[f64], &[f64], Option) -> f64; 11 | 12 | /// Compare to metric values, determining if b is better. 13 | /// If one of them is NaN favor the non NaN value. 14 | /// If both are NaN, consider the first value to be better. 15 | pub fn is_comparison_better(value: f64, comparison: f64, maximize: bool) -> bool { 16 | match (value.is_nan(), comparison.is_nan()) { 17 | // Both nan, comparison is not better, 18 | // Or comparison is nan, also not better 19 | (true, true) | (false, true) => false, 20 | // comparison is not Nan, it's better 21 | (true, false) => true, 22 | // Perform numerical comparison. 23 | (false, false) => { 24 | // If we are maximizing is the comparison 25 | // greater, than the current value 26 | if maximize { 27 | value < comparison 28 | // If we are minimizing is the comparison 29 | // less than the current value. 
30 | } else { 31 | value > comparison 32 | } 33 | } 34 | } 35 | } 36 | 37 | #[derive(Debug, Deserialize, Serialize, Clone, Copy)] 38 | pub enum Metric { 39 | AUC, 40 | LogLoss, 41 | RootMeanSquaredLogError, 42 | RootMeanSquaredError, 43 | QuantileLoss, 44 | } 45 | 46 | impl FromStr for Metric { 47 | type Err = PerpetualError; 48 | 49 | fn from_str(s: &str) -> Result { 50 | match s { 51 | "AUC" => Ok(Metric::AUC), 52 | "LogLoss" => Ok(Metric::LogLoss), 53 | "RootMeanSquaredLogError" => Ok(Metric::RootMeanSquaredLogError), 54 | "RootMeanSquaredError" => Ok(Metric::RootMeanSquaredError), 55 | 56 | _ => Err(PerpetualError::ParseString( 57 | s.to_string(), 58 | "Metric".to_string(), 59 | items_to_strings(vec![ 60 | "AUC", 61 | "LogLoss", 62 | "RootMeanSquaredLogError", 63 | "RootMeanSquaredError", 64 | ]), 65 | )), 66 | } 67 | } 68 | } 69 | 70 | pub fn metric_callables(metric_type: &Metric) -> (MetricFn, bool) { 71 | match metric_type { 72 | Metric::AUC => ( 73 | classification::AUCMetric::calculate_metric, 74 | classification::AUCMetric::maximize(), 75 | ), 76 | Metric::LogLoss => ( 77 | classification::LogLossMetric::calculate_metric, 78 | classification::LogLossMetric::maximize(), 79 | ), 80 | Metric::RootMeanSquaredLogError => ( 81 | regression::RootMeanSquaredLogErrorMetric::calculate_metric, 82 | regression::RootMeanSquaredLogErrorMetric::maximize(), 83 | ), 84 | Metric::RootMeanSquaredError => ( 85 | regression::RootMeanSquaredErrorMetric::calculate_metric, 86 | regression::RootMeanSquaredErrorMetric::maximize(), 87 | ), 88 | Metric::QuantileLoss => ( 89 | regression::QuantileLossMetric::calculate_metric, 90 | regression::QuantileLossMetric::maximize(), 91 | ), 92 | } 93 | } 94 | 95 | pub trait EvaluationMetric { 96 | fn calculate_metric(y: &[f64], yhat: &[f64], sample_weight: &[f64], alpha: Option) -> f64; 97 | fn maximize() -> bool; 98 | } 99 | 100 | #[cfg(test)] 101 | mod tests { 102 | use crate::metrics::classification::*; 103 | use crate::metrics::regression::*; 104 | use crate::utils::precision_round; 105 | #[test] 106 | fn test_root_mean_squared_log_error() { 107 | let y = vec![1., 3., 4., 5., 2., 4., 6.]; 108 | let yhat = vec![3., 2., 3., 4., 4., 4., 4.]; 109 | let sample_weight = vec![1., 1., 1., 1., 1., 2., 2.]; 110 | let res = root_mean_squared_log_error(&y, &yhat, &sample_weight); 111 | assert_eq!(precision_round(res, 4), 0.3549); 112 | } 113 | #[test] 114 | fn test_root_mean_squared_error() { 115 | let y = vec![1., 3., 4., 5., 2., 4., 6.]; 116 | let yhat = vec![3., 2., 3., 4., 4., 4., 4.]; 117 | let sample_weight = vec![1., 1., 1., 1., 1., 2., 2.]; 118 | let res = root_mean_squared_error(&y, &yhat, &sample_weight); 119 | assert_eq!(precision_round(res, 6), 1.452966); 120 | } 121 | 122 | #[test] 123 | fn test_log_loss() { 124 | let y = vec![1., 0., 1., 0., 0., 0., 0.]; 125 | let yhat = vec![0.5, 0.01, -0., 1.05, 0., -4., 0.]; 126 | let sample_weight = vec![1., 1., 1., 1., 1., 2., 2.]; 127 | let res = log_loss(&y, &yhat, &sample_weight); 128 | assert_eq!(precision_round(res, 5), 0.59235); 129 | } 130 | 131 | #[test] 132 | fn test_auc_real_data() { 133 | let y = vec![1., 0., 1., 0., 0., 0., 0.]; 134 | let yhat = vec![0.5, 0.01, -0., 1.05, 0., -4., 0.]; 135 | let sample_weight = vec![1., 1., 1., 1., 1., 2., 2.]; 136 | let res = roc_auc_score(&y, &yhat, &sample_weight); 137 | assert_eq!(precision_round(res, 5), 0.67857); 138 | } 139 | 140 | #[test] 141 | fn test_auc_generc() { 142 | let sample_weight: Vec = vec![1.; 2]; 143 | 144 | let y: Vec = vec![0., 1.]; 145 | let yhat: Vec = 
vec![0., 1.]; 146 | let auc_score = roc_auc_score(&y, &yhat, &sample_weight); 147 | assert_eq!(auc_score, 1.); 148 | 149 | let y: Vec = vec![0., 1.]; 150 | let yhat: Vec = vec![1., 0.]; 151 | let auc_score = roc_auc_score(&y, &yhat, &sample_weight); 152 | assert_eq!(auc_score, 0.); 153 | 154 | let y: Vec = vec![1., 0.]; 155 | let yhat: Vec = vec![1., 1.]; 156 | let auc_score = roc_auc_score(&y, &yhat, &sample_weight); 157 | assert_eq!(auc_score, 0.5); 158 | 159 | let y: Vec = vec![1., 0.]; 160 | let yhat: Vec = vec![1., 0.]; 161 | let auc_score = roc_auc_score(&y, &yhat, &sample_weight); 162 | assert_eq!(auc_score, 1.0); 163 | 164 | let y: Vec = vec![1., 0.]; 165 | let yhat: Vec = vec![0.5, 0.5]; 166 | let auc_score = roc_auc_score(&y, &yhat, &sample_weight); 167 | assert_eq!(auc_score, 0.5); 168 | 169 | let y: Vec = vec![0., 0.]; 170 | let yhat: Vec = vec![0.25, 0.75]; 171 | let auc_score = roc_auc_score(&y, &yhat, &sample_weight); 172 | assert!(auc_score.is_nan()); 173 | 174 | let y: Vec = vec![1., 1.]; 175 | let yhat: Vec = vec![0.25, 0.75]; 176 | let auc_score = roc_auc_score(&y, &yhat, &sample_weight); 177 | assert!(auc_score.is_nan()); 178 | } 179 | } 180 | -------------------------------------------------------------------------------- /src/metrics/regression/metrics.rs: -------------------------------------------------------------------------------- 1 | use crate::metrics::*; 2 | 3 | pub struct QuantileLossMetric {} 4 | impl EvaluationMetric for QuantileLossMetric { 5 | fn calculate_metric(y: &[f64], yhat: &[f64], sample_weight: &[f64], alpha: Option) -> f64 { 6 | quantile_loss(y, yhat, sample_weight, alpha) 7 | } 8 | fn maximize() -> bool { 9 | false 10 | } 11 | } 12 | 13 | pub struct RootMeanSquaredLogErrorMetric {} 14 | impl EvaluationMetric for RootMeanSquaredLogErrorMetric { 15 | fn calculate_metric(y: &[f64], yhat: &[f64], sample_weight: &[f64], _alpha: Option) -> f64 { 16 | root_mean_squared_log_error(y, yhat, sample_weight) 17 | } 18 | fn maximize() -> bool { 19 | false 20 | } 21 | } 22 | 23 | pub struct RootMeanSquaredErrorMetric {} 24 | impl EvaluationMetric for RootMeanSquaredErrorMetric { 25 | fn calculate_metric(y: &[f64], yhat: &[f64], sample_weight: &[f64], _alpha: Option) -> f64 { 26 | root_mean_squared_error(y, yhat, sample_weight) 27 | } 28 | fn maximize() -> bool { 29 | false 30 | } 31 | } 32 | 33 | pub fn quantile_loss(y: &[f64], yhat: &[f64], sample_weight: &[f64], alpha: Option) -> f64 { 34 | let mut w_sum = 0.; 35 | let res = y 36 | .iter() 37 | .zip(yhat) 38 | .zip(sample_weight) 39 | .map(|((y_, yhat_), w_)| { 40 | w_sum += *w_; 41 | let _alpha = alpha.unwrap() as f64; 42 | let s = *y_ - *yhat_; 43 | let l = if s >= 0.0 { _alpha * s } else { (1.0 - _alpha) * s }; 44 | l * *w_ 45 | }) 46 | .sum::(); 47 | res / w_sum 48 | } 49 | 50 | pub fn root_mean_squared_log_error(y: &[f64], yhat: &[f64], sample_weight: &[f64]) -> f64 { 51 | let mut w_sum = 0.; 52 | let res = y 53 | .iter() 54 | .zip(yhat) 55 | .zip(sample_weight) 56 | .map(|((y_, yhat_), w_)| { 57 | w_sum += *w_; 58 | (y_.ln_1p() - yhat_.ln_1p()).powi(2) * *w_ 59 | }) 60 | .sum::(); 61 | (res / w_sum).sqrt() 62 | } 63 | 64 | pub fn root_mean_squared_error(y: &[f64], yhat: &[f64], sample_weight: &[f64]) -> f64 { 65 | let mut w_sum = 0.; 66 | let res = y 67 | .iter() 68 | .zip(yhat) 69 | .zip(sample_weight) 70 | .map(|((y_, yhat_), w_)| { 71 | w_sum += *w_; 72 | (y_ - yhat_).powi(2) * *w_ 73 | }) 74 | .sum::(); 75 | (res / w_sum).sqrt() 76 | } 
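To make the sample-weight handling in the regression metrics above concrete, here is a small standalone check; it is only a sketch that recomputes by hand the same toy case asserted in the metric tests earlier in src/metrics/mod.rs:

fn main() {
    let y = [1.0_f64, 3., 4., 5., 2., 4., 6.];
    let yhat = [3.0_f64, 2., 3., 4., 4., 4., 4.];
    let w = [1.0_f64, 1., 1., 1., 1., 2., 2.];

    // weighted mean of squared errors, then the square root
    let (mut num, mut den) = (0.0, 0.0);
    for i in 0..y.len() {
        num += (y[i] - yhat[i]).powi(2) * w[i];
        den += w[i];
    }
    let rmse = (num / den).sqrt();

    // agrees with the 1.452966 value asserted in test_root_mean_squared_error
    println!("weighted RMSE = {:.6}", rmse);
}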
-------------------------------------------------------------------------------- /src/metrics/regression/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod metrics; 2 | pub use metrics::*; -------------------------------------------------------------------------------- /src/objective_functions/adaptive_huber_loss.rs: -------------------------------------------------------------------------------- 1 | use super::ObjectiveFunction; 2 | use crate::metrics::Metric; 3 | 4 | /// Adaptive Huber Loss 5 | #[derive(Default)] 6 | pub struct AdaptiveHuberLoss {} 7 | impl ObjectiveFunction for AdaptiveHuberLoss { 8 | fn calc_loss(y: &[f64], yhat: &[f64], sample_weight: Option<&[f64]>, quantile: Option) -> Vec { 9 | // default alpha: 0.5 10 | // if not passed explicitly 11 | let alpha = quantile.unwrap_or(0.5); 12 | let n = y.len(); 13 | 14 | let mut abs_res = y 15 | .iter() 16 | .zip(yhat.iter()) 17 | .map(|(&yi, &yh)| (yi - yh).abs()) 18 | .collect::>(); 19 | abs_res.sort_by(|a, b| a.partial_cmp(b).unwrap()); 20 | 21 | let idx = ((n as f64) * alpha).floor() as usize; 22 | let delta = abs_res[idx.min(n - 1)]; 23 | 24 | match sample_weight { 25 | Some(weights) => y 26 | .iter() 27 | .zip(yhat.iter()) 28 | .enumerate() 29 | .map(|(i, (&yi, &yh))| { 30 | let r = yi - yh; 31 | let ar = r.abs(); 32 | let base = if ar <= delta { 33 | 0.5 * r * r 34 | } else { 35 | delta * (ar - 0.5 * delta) 36 | }; 37 | (base * weights[i]) as f32 38 | }) 39 | .collect(), 40 | None => y 41 | .iter() 42 | .zip(yhat.iter()) 43 | .map(|(&yi, &yh)| { 44 | let r = yi - yh; 45 | let ar = r.abs(); 46 | let loss = if ar <= delta { 47 | 0.5 * r * r 48 | } else { 49 | delta * (ar - 0.5 * delta) 50 | }; 51 | loss as f32 52 | }) 53 | .collect(), 54 | } 55 | } 56 | 57 | fn calc_grad_hess( 58 | y: &[f64], 59 | yhat: &[f64], 60 | sample_weight: Option<&[f64]>, 61 | quantile: Option, 62 | ) -> (Vec, Option>) { 63 | // default alpha: 0.5 64 | // if not passed explicitly 65 | let alpha = quantile.unwrap_or(0.5); 66 | let n = y.len(); 67 | 68 | let mut abs_res = y 69 | .iter() 70 | .zip(yhat.iter()) 71 | .map(|(&yi, &yh)| (yi - yh).abs()) 72 | .collect::>(); 73 | abs_res.sort_by(|a, b| a.partial_cmp(b).unwrap()); 74 | let idx = ((n as f64) * alpha).floor() as usize; 75 | let delta = abs_res[idx.min(n - 1)]; 76 | 77 | match sample_weight { 78 | Some(weights) => { 79 | let (grad, hess): (Vec, Vec) = y 80 | .iter() 81 | .zip(yhat.iter()) 82 | .enumerate() 83 | .map(|(i, (&yi, &yh))| { 84 | let r = yi - yh; 85 | let ar = r.abs(); 86 | let sign = (yh - yi).signum(); 87 | let g = if ar <= delta { 88 | (yh - yi) * weights[i] 89 | } else { 90 | delta * sign * weights[i] 91 | }; 92 | let h = if ar <= delta { weights[i] } else { 0.0 }; 93 | (g as f32, h as f32) 94 | }) 95 | .unzip(); 96 | (grad, Some(hess)) 97 | } 98 | None => { 99 | let (grad, hess): (Vec, Vec) = y 100 | .iter() 101 | .zip(yhat.iter()) 102 | .map(|(&yi, &yh)| { 103 | let r = yi - yh; 104 | let ar = r.abs(); 105 | let sign = (yh - yi).signum(); 106 | let g = if ar <= delta { yh - yi } else { delta * sign }; 107 | let h = if ar <= delta { 1.0 } else { 0.0 }; 108 | (g as f32, h as f32) 109 | }) 110 | .unzip(); 111 | (grad, Some(hess)) 112 | } 113 | } 114 | } 115 | 116 | fn calc_init(y: &[f64], sample_weight: Option<&[f64]>, _quantile: Option) -> f64 { 117 | let mut idxs = (0..y.len()).collect::>(); 118 | idxs.sort_by(|&i, &j| y[i].partial_cmp(&y[j]).unwrap()); 119 | 120 | let total_w = sample_weight.map(|w| 
w.iter().sum::()).unwrap_or(y.len() as f64); 121 | let target = total_w * 0.5; 122 | 123 | // find weighted median via scan() 124 | let median = idxs 125 | .iter() 126 | .scan(0.0, |cum, &i| { 127 | *cum += sample_weight.map_or(1.0, |w| w[i]); 128 | Some((i, *cum)) 129 | }) 130 | .find(|&(_i, cum)| cum >= target) 131 | .map(|(i, _)| y[i]) 132 | .unwrap_or(y[idxs[y.len() / 2]]); 133 | 134 | median 135 | } 136 | 137 | fn default_metric() -> Metric { 138 | Metric::RootMeanSquaredError 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /src/objective_functions/huber_loss.rs: -------------------------------------------------------------------------------- 1 | use super::ObjectiveFunction; 2 | use crate::metrics::Metric; 3 | 4 | /// Huber Loss 5 | #[derive(Default)] 6 | pub struct HuberLoss {} 7 | impl ObjectiveFunction for HuberLoss { 8 | fn calc_loss(y: &[f64], yhat: &[f64], sample_weight: Option<&[f64]>, delta: Option) -> Vec { 9 | // Default delta value 10 | let delta = delta.unwrap_or(1.0); 11 | match sample_weight { 12 | Some(weights) => y 13 | .iter() 14 | .zip(yhat.iter()) 15 | .enumerate() 16 | .map(|(i, (&yi, &yh))| { 17 | let r = yi - yh; 18 | let ar = r.abs(); 19 | let base = if ar <= delta { 20 | 0.5 * r * r 21 | } else { 22 | delta * (ar - 0.5 * delta) 23 | }; 24 | (base * weights[i]) as f32 25 | }) 26 | .collect(), 27 | None => y 28 | .iter() 29 | .zip(yhat.iter()) 30 | .map(|(&yi, &yh)| { 31 | let r = yi - yh; 32 | let ar = r.abs(); 33 | let loss = if ar <= delta { 34 | 0.5 * r * r 35 | } else { 36 | delta * (ar - 0.5 * delta) 37 | }; 38 | loss as f32 39 | }) 40 | .collect(), 41 | } 42 | } 43 | 44 | fn calc_grad_hess( 45 | y: &[f64], 46 | yhat: &[f64], 47 | sample_weight: Option<&[f64]>, 48 | delta: Option, 49 | ) -> (Vec, Option>) { 50 | // default delta value 51 | let delta = delta.unwrap_or(1.0); 52 | 53 | match sample_weight { 54 | Some(weights) => { 55 | let (grad, hess): (Vec, Vec) = y 56 | .iter() 57 | .zip(yhat.iter()) 58 | .enumerate() 59 | .map(|(i, (&yi, &yh))| { 60 | let r = yi - yh; 61 | let ar = r.abs(); 62 | let sign = (yh - yi).signum(); 63 | let g = if ar <= delta { 64 | (yh - yi) * weights[i] 65 | } else { 66 | delta * sign * weights[i] 67 | }; 68 | let h = if ar <= delta { weights[i] } else { 0.0 }; 69 | (g as f32, h as f32) 70 | }) 71 | .unzip(); 72 | (grad, Some(hess)) 73 | } 74 | None => { 75 | let (grad, hess): (Vec, Vec) = y 76 | .iter() 77 | .zip(yhat.iter()) 78 | .map(|(&yi, &yh)| { 79 | let r = yi - yh; 80 | let ar = r.abs(); 81 | let sign = (yh - yi).signum(); 82 | let g = if ar <= delta { yh - yi } else { delta * sign }; 83 | let h = if ar <= delta { 1.0 } else { 0.0 }; 84 | (g as f32, h as f32) 85 | }) 86 | .unzip(); 87 | (grad, Some(hess)) 88 | } 89 | } 90 | } 91 | 92 | fn calc_init(y: &[f64], sample_weight: Option<&[f64]>, _quantile: Option) -> f64 { 93 | let mut idxs = (0..y.len()).collect::>(); 94 | idxs.sort_by(|&i, &j| y[i].partial_cmp(&y[j]).unwrap()); 95 | 96 | let total_w = sample_weight.map(|w| w.iter().sum::()).unwrap_or(y.len() as f64); 97 | let target = total_w * 0.5; 98 | 99 | let median = idxs 100 | .iter() 101 | .scan(0.0, |cum, &i| { 102 | *cum += sample_weight.map_or(1.0, |w| w[i]); 103 | Some((i, *cum)) 104 | }) 105 | .find(|&(_i, cum)| cum >= target) 106 | .map(|(i, _)| y[i]) 107 | .unwrap_or(y[idxs[y.len() / 2]]); 108 | 109 | median 110 | } 111 | 112 | fn default_metric() -> Metric { 113 | Metric::RootMeanSquaredError 114 | } 115 | } 116 | 
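Both Huber variants above initialise the model in calc_init at the weighted median of the target: sort the targets, then scan cumulative sample weight until half of the total weight is reached. A standalone sketch of that calculation follows; the helper name is illustrative:

// Weighted median: the smallest y whose cumulative weight reaches half of the total weight.
fn weighted_median(y: &[f64], w: &[f64]) -> f64 {
    let mut idx: Vec<usize> = (0..y.len()).collect();
    idx.sort_by(|&i, &j| y[i].partial_cmp(&y[j]).unwrap());

    let half = w.iter().sum::<f64>() * 0.5;
    let mut cum = 0.0;
    for &i in &idx {
        cum += w[i];
        if cum >= half {
            return y[i];
        }
    }
    y[idx[y.len() / 2]] // fallback, mirroring the unwrap_or in calc_init
}

fn main() {
    let y = [1.0, 5.0, 2.0, 8.0, 3.0];
    let w = [1.0, 1.0, 4.0, 1.0, 1.0];
    // the large weight on 2.0 pulls the initial prediction towards it
    println!("weighted median = {}", weighted_median(&y, &w)); // prints 2
}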
-------------------------------------------------------------------------------- /src/objective_functions/log_loss.rs: -------------------------------------------------------------------------------- 1 | use super::ObjectiveFunction; 2 | use crate::{data::FloatData, metrics::Metric, utils::fast_sum}; 3 | 4 | #[derive(Default)] 5 | pub struct LogLoss {} 6 | 7 | impl ObjectiveFunction for LogLoss { 8 | #[inline] 9 | fn calc_loss(y: &[f64], yhat: &[f64], sample_weight: Option<&[f64]>, _quantile: Option<f64>) -> Vec<f32> { 10 | match sample_weight { 11 | Some(sample_weight) => y 12 | .iter() 13 | .zip(yhat) 14 | .zip(sample_weight) 15 | .map(|((y_, yhat_), w_)| { 16 | let yhat_ = f64::ONE / (f64::ONE + (-*yhat_).exp()); 17 | (-(*y_ * yhat_.ln() + (f64::ONE - *y_) * ((f64::ONE - yhat_).ln())) * *w_) as f32 18 | }) 19 | .collect(), 20 | None => y 21 | .iter() 22 | .zip(yhat) 23 | .map(|(y_, yhat_)| { 24 | let yhat_ = f64::ONE / (f64::ONE + (-*yhat_).exp()); 25 | (-(*y_ * yhat_.ln() + (f64::ONE - *y_) * ((f64::ONE - yhat_).ln()))) as f32 26 | }) 27 | .collect(), 28 | } 29 | } 30 | 31 | fn calc_init(y: &[f64], sample_weight: Option<&[f64]>, _quantile: Option<f64>) -> f64 { 32 | match sample_weight { 33 | Some(sample_weight) => { 34 | let mut ytot: f64 = 0.; 35 | let mut ntot: f64 = 0.; 36 | for i in 0..y.len() { 37 | ytot += sample_weight[i] * y[i]; 38 | ntot += sample_weight[i]; 39 | } 40 | f64::ln(ytot / (ntot - ytot)) 41 | } 42 | None => { 43 | let ytot = fast_sum(y); 44 | let ntot = y.len() as f64; 45 | f64::ln(ytot / (ntot - ytot)) 46 | } 47 | } 48 | } 49 | 50 | #[inline] 51 | fn calc_grad_hess( 52 | y: &[f64], 53 | yhat: &[f64], 54 | sample_weight: Option<&[f64]>, 55 | _quantile: Option<f64>, 56 | ) -> (Vec<f32>, Option<Vec<f32>>) { 57 | match sample_weight { 58 | Some(sample_weight) => { 59 | let (g, h) = y 60 | .iter() 61 | .zip(yhat) 62 | .zip(sample_weight) 63 | .map(|((y_, yhat_), w_)| { 64 | let yhat_ = f64::ONE / (f64::ONE + (-*yhat_).exp()); 65 | (((yhat_ - *y_) * *w_) as f32, (yhat_ * (f64::ONE - yhat_) * *w_) as f32) 66 | }) 67 | .unzip(); 68 | (g, Some(h)) 69 | } 70 | None => { 71 | let (g, h) = y 72 | .iter() 73 | .zip(yhat) 74 | .map(|(y_, yhat_)| { 75 | let yhat_ = f64::ONE / (f64::ONE + (-*yhat_).exp()); 76 | ((yhat_ - *y_) as f32, (yhat_ * (f64::ONE - yhat_)) as f32) 77 | }) 78 | .unzip(); 79 | (g, Some(h)) 80 | } 81 | } 82 | } 83 | 84 | fn default_metric() -> Metric { 85 | Metric::LogLoss 86 | } 87 | } 88 |
-------------------------------------------------------------------------------- /src/objective_functions/mod.rs: -------------------------------------------------------------------------------- 1 | mod adaptive_huber_loss; 2 | mod huber_loss; 3 | mod log_loss; 4 | mod quantile_loss; 5 | mod squared_loss; 6 | 7 | pub use adaptive_huber_loss::AdaptiveHuberLoss; 8 | pub use huber_loss::HuberLoss; 9 | pub use log_loss::LogLoss; 10 | pub use quantile_loss::QuantileLoss; 11 | pub use squared_loss::SquaredLoss; 12 | 13 | use crate::metrics::Metric; 14 | use serde::{Deserialize, Serialize}; 15 | 16 | type ObjFn = fn(&[f64], &[f64], Option<&[f64]>, Option<f64>) -> (Vec<f32>, Option<Vec<f32>>); 17 | type LossFn = fn(&[f64], &[f64], Option<&[f64]>, Option<f64>) -> Vec<f32>; 18 | 19 | #[derive(Debug, Deserialize, Serialize, Clone)] 20 | pub enum Objective { 21 | LogLoss, 22 | SquaredLoss, 23 | QuantileLoss, 24 | AdaptiveHuberLoss, 25 | HuberLoss, 26 | } 27 | 28 | pub fn loss_callables(objective: &Objective) -> LossFn { 29 | match objective { 30 | Objective::LogLoss => LogLoss::calc_loss, 31 | Objective::SquaredLoss => SquaredLoss::calc_loss, 32 | Objective::QuantileLoss => QuantileLoss::calc_loss, 33 | Objective::AdaptiveHuberLoss => AdaptiveHuberLoss::calc_loss, 34 | Objective::HuberLoss => HuberLoss::calc_loss, 35 | } 36 | } 37 | 38 | pub fn gradient_hessian_callables(objective: &Objective) -> ObjFn { 39 | match objective { 40 | Objective::LogLoss => LogLoss::calc_grad_hess, 41 | Objective::SquaredLoss => SquaredLoss::calc_grad_hess, 42 | Objective::QuantileLoss => QuantileLoss::calc_grad_hess, 43 | Objective::AdaptiveHuberLoss => AdaptiveHuberLoss::calc_grad_hess, 44 | Objective::HuberLoss => HuberLoss::calc_grad_hess, 45 | } 46 | } 47 | 48 | pub fn calc_init_callables(objective: &Objective) -> fn(&[f64], Option<&[f64]>, Option<f64>) -> f64 { 49 | match objective { 50 | Objective::LogLoss => LogLoss::calc_init, 51 | Objective::SquaredLoss => SquaredLoss::calc_init, 52 | Objective::QuantileLoss => QuantileLoss::calc_init, 53 | Objective::AdaptiveHuberLoss => AdaptiveHuberLoss::calc_init, 54 | Objective::HuberLoss => HuberLoss::calc_init, 55 | } 56 | } 57 | 58 | pub trait ObjectiveFunction { 59 | fn calc_loss(y: &[f64], yhat: &[f64], sample_weight: Option<&[f64]>, quantile: Option<f64>) -> Vec<f32>; 60 | fn calc_grad_hess( 61 | y: &[f64], 62 | yhat: &[f64], 63 | sample_weight: Option<&[f64]>, 64 | quantile: Option<f64>, 65 | ) -> (Vec<f32>, Option<Vec<f32>>); 66 | fn calc_init(y: &[f64], sample_weight: Option<&[f64]>, quantile: Option<f64>) -> f64; 67 | fn default_metric() -> Metric; 68 | } 69 | 70 | #[cfg(test)] 71 | mod tests { 72 | use super::*; 73 | #[test] 74 | fn test_logloss_loss() { 75 | let y = vec![0.0, 0.0, 0.0, 1.0, 1.0, 1.0]; 76 | let yhat1 = vec![-1.0, -1.0, -1.0, 1.0, 1.0, 1.0]; 77 | let l1 = LogLoss::calc_loss(&y, &yhat1, None, None); 78 | let yhat2 = vec![0.0, 0.0, -1.0, 1.0, 0.0, 1.0]; 79 | let l2 = LogLoss::calc_loss(&y, &yhat2, None, None); 80 | assert!(l1.iter().sum::<f32>() < l2.iter().sum::<f32>()); 81 | } 82 | 83 | #[test] 84 | fn test_logloss_grad() { 85 | let y = vec![0.0, 0.0, 0.0, 1.0, 1.0, 1.0]; 86 | let yhat1 = vec![-1.0, -1.0, -1.0, 1.0, 1.0, 1.0]; 87 | let (g1, _) = LogLoss::calc_grad_hess(&y, &yhat1, None, None); 88 | let yhat2 = vec![0.0, 0.0, -1.0, 1.0, 0.0, 1.0]; 89 | let (g2, _) = LogLoss::calc_grad_hess(&y, &yhat2, None, None); 90 | assert!(g1.iter().sum::<f32>() < g2.iter().sum::<f32>()); 91 | } 92 | 93 | #[test] 94 | fn test_logloss_init() { 95 | let y = vec![0.0, 0.0, 0.0, 1.0, 1.0, 1.0]; 96 | let l1 = LogLoss::calc_init(&y, None, None); 97 | assert!(l1 == 0.); 98 | 99 | let y = vec![1.0; 6]; 100 | let l2 = LogLoss::calc_init(&y, None, None); 101 | assert!(l2 == f64::INFINITY); 102 | 103 | let y = vec![0.0; 6]; 104 | let l3 = LogLoss::calc_init(&y, None, None); 105 | assert!(l3 == f64::NEG_INFINITY); 106 | 107 | let y = vec![0., 0., 0., 0., 1., 1.]; 108 | let l4 = LogLoss::calc_init(&y, None, None); 109 | assert!(l4 == f64::ln(2. / 4.));
110 | } 111 | 112 | #[test] 113 | fn test_mse_init() { 114 | let y = vec![0.0, 0.0, 0.0, 1.0, 1.0, 1.0]; 115 | let l1 = SquaredLoss::calc_init(&y, None, None); 116 | assert!(l1 == 0.5); 117 | 118 | let y = vec![1.0, 1.0, 1.0, 1.0, 1.0, 1.0]; 119 | let l2 = SquaredLoss::calc_init(&y, None, None); 120 | assert!(l2 == 1.); 121 | 122 | let y = vec![-1.0, -1.0, -1.0, -1.0, -1.0, -1.0]; 123 | let l3 = SquaredLoss::calc_init(&y, None, None); 124 | assert!(l3 == -1.); 125 | 126 | let y = vec![-1.0, -1.0, -1.0, 1., 1., 1.]; 127 | let l4 = SquaredLoss::calc_init(&y, None, None); 128 | assert!(l4 == 0.); 129 | } 130 | 131 | #[test] 132 | fn test_quantile_init() { 133 | let y = vec![1.0, 2.0, 9.0, 3.2, 4.0]; 134 | let w = vec![0.0, 0.5, 1.0, 0.3, 0.5]; 135 | let l1 = QuantileLoss::calc_init(&y, Some(&w), Some(0.1)); 136 | println!("{}", l1); 137 | assert!(l1 == 2.0); 138 | 139 | let y = vec![1.0, 2.0, 9.0, 3.2, 4.0]; 140 | let w = vec![0.0, 0.5, 1.0, 0.3, 0.5]; 141 | let l2 = QuantileLoss::calc_init(&y, Some(&w), Some(0.9)); 142 | println!("{}", l2); 143 | assert!(l2 == 9.0); 144 | } 145 | 146 | #[test] 147 | fn test_adaptive_huberloss_loss() { 148 | let y = vec![0.0, 0.0, 0.0, 1.0, 1.0, 1.0]; 149 | let yhat1 = vec![-1.0, -1.0, -1.0, 1.0, 1.0, 1.0]; 150 | let l1 = AdaptiveHuberLoss::calc_loss(&y, &yhat1, None, Some(0.5)); 151 | let yhat2 = vec![0.0, 0.0, -1.0, 1.0, 0.0, 1.0]; 152 | let l2 = AdaptiveHuberLoss::calc_loss(&y, &yhat2, None, Some(0.5)); 153 | assert!(l1.iter().sum::<f32>() > l2.iter().sum::<f32>()); 154 | } 155 | 156 | #[test] 157 | fn test_adaptive_huberloss_grad() { 158 | let y = vec![0.0, 0.0, 0.0, 1.0, 1.0, 1.0]; 159 | let yhat1 = vec![-1.0, -1.0, -1.0, 1.0, 1.0, 1.0]; 160 | let (g1, _) = AdaptiveHuberLoss::calc_grad_hess(&y, &yhat1, None, Some(0.5)); 161 | let yhat2 = vec![0.0, 0.0, -1.0, 1.0, 0.0, 1.0]; 162 | let (g2, _) = AdaptiveHuberLoss::calc_grad_hess(&y, &yhat2, None, Some(0.5)); 163 | assert!(g1.iter().sum::<f32>() < g2.iter().sum::<f32>()); 164 | } 165 | } 166 |
-------------------------------------------------------------------------------- /src/objective_functions/quantile_loss.rs: -------------------------------------------------------------------------------- 1 | use super::ObjectiveFunction; 2 | use crate::metrics::Metric; 3 | 4 | #[derive(Default)] 5 | pub struct QuantileLoss {} 6 | 7 | impl ObjectiveFunction for QuantileLoss { 8 | #[inline] 9 | fn calc_loss(y: &[f64], yhat: &[f64], sample_weight: Option<&[f64]>, quantile: Option<f64>) -> Vec<f32> { 10 | match sample_weight { 11 | Some(sample_weight) => y 12 | .iter() 13 | .zip(yhat) 14 | .zip(sample_weight) 15 | .map(|((y_, yhat_), w_)| { 16 | let _quantile = quantile.unwrap(); 17 | let s = *y_ - *yhat_; 18 | let l = if s >= 0.0 { _quantile * s } else { (_quantile - 1.0) * s }; 19 | (l * *w_) as f32 20 | }) 21 | .collect(), 22 | None => y 23 | .iter() 24 | .zip(yhat) 25 | .map(|(y_, yhat_)| { 26 | let _quantile = quantile.unwrap(); 27 | let s = *y_ - *yhat_; 28 | let l = if s >= 0.0 { _quantile * s } else { (_quantile - 1.0) * s }; 29 | l as f32 30 | }) 31 | .collect(), 32 | } 33 | } 34 | 35 | fn calc_init(y: &[f64], sample_weight: Option<&[f64]>, quantile: Option<f64>) -> f64 { 36 | match sample_weight { 37 | Some(sample_weight) => { 38 | let mut indices = (0..y.len()).collect::<Vec<usize>>(); 39 | indices.sort_by(|&a, &b| y[a].total_cmp(&y[b])); 40 | let w_tot: f64 = sample_weight.iter().sum(); 41 | let w_target = w_tot * quantile.unwrap() as f64; 42 | let mut w_cum = 0.0_f64; 43 | let mut init_value = f64::NAN; 44 | for i in indices { 45 | w_cum += sample_weight[i]; 46 | if w_cum >= w_target { 47 | init_value = y[i]; 48 | break; 49 | } 50 | } 51 | init_value 52 | } 53 | None => { 54 | let mut indices = (0..y.len()).collect::<Vec<usize>>(); 55 | indices.sort_by(|&a, &b| y[a].total_cmp(&y[b])); 56 | let w_tot: f64 = y.len() as f64; 57 | let w_target = w_tot * quantile.unwrap() as f64; 58 | let mut w_cum = 0.0_f64; 59 | let mut init_value = f64::NAN; 60 | for i in indices { 61 | w_cum += 1.0; 62 | if w_cum >= w_target { 63 | init_value = y[i]; 64 | break; 65 | } 66 | } 67 | init_value 68 | } 69 | } 70 | } 71 | 72 | #[inline] 73 | fn calc_grad_hess( 74 | y: &[f64], 75 | yhat: &[f64], 76 | sample_weight: Option<&[f64]>, 77 | quantile: Option<f64>, 78 | ) -> (Vec<f32>, Option<Vec<f32>>) { 79 | match sample_weight { 80 | Some(sample_weight) => { 81 | let (g, h) = y 82 | .iter() 83 | .zip(yhat) 84 | .zip(sample_weight) 85 | .map(|((y_, yhat_), w_)| { 86 | let _quantile = quantile.unwrap(); 87 | let delta = yhat_ - *y_; 88 | let g = if delta >= 0.0 { 89 | (1.0 - _quantile) * w_ 90 | } else { 91 | -1.0 * _quantile * w_ 92 | }; 93 | (g as f32, *w_ as f32) 94 | }) 95 | .unzip(); 96 | (g, Some(h)) 97 | } 98 | None => { 99 | let g = y 100 | .iter() 101 | .zip(yhat) 102 | .map(|(y_, yhat_)| { 103 | let _quantile = quantile.unwrap(); 104 | let delta = yhat_ - *y_; 105 | let g = if delta >= 0.0 { 106 | 1.0 - _quantile 107 | } else { 108 | -1.0 * _quantile 109 | }; 110 | g as f32 111 | }) 112 | .collect(); 113 | (g, None) 114 | } 115 | } 116 | } 117 | 118 | fn default_metric() -> Metric { 119 | Metric::QuantileLoss 120 | } 121 | } 122 |
-------------------------------------------------------------------------------- /src/objective_functions/squared_loss.rs: -------------------------------------------------------------------------------- 1 | use super::ObjectiveFunction; 2 | use crate::{metrics::Metric, utils::fast_sum}; 3 | 4 | #[derive(Default)] 5 | pub struct SquaredLoss {} 6 | 7 | impl ObjectiveFunction for SquaredLoss { 8 | #[inline] 9 | fn calc_loss(y: &[f64], yhat: &[f64], sample_weight: Option<&[f64]>, _quantile: Option<f64>) -> Vec<f32> { 10 | match sample_weight { 11 | Some(sample_weight) => y 12 | .iter() 13 | .zip(yhat) 14 | .zip(sample_weight) 15 | .map(|((y_, yhat_), w_)| { 16 | let s = *y_ - *yhat_; 17 | (s * s * *w_) as f32 18 | }) 19 | .collect(), 20 | None => y 21 | .iter() 22 | .zip(yhat) 23 | .map(|(y_, yhat_)| { 24 | let s = *y_ - *yhat_; 25 | (s * s) as f32 26 | }) 27 | .collect(), 28 | } 29 | } 30 | 31 | fn calc_init(y: &[f64], sample_weight: Option<&[f64]>, _quantile: Option<f64>) -> f64 { 32 | match sample_weight { 33 | Some(sample_weight) => { 34 | let mut ytot: f64 = 0.; 35 | let mut ntot: f64 = 0.; 36 | for i in 0..y.len() { 37 | ytot += sample_weight[i] * y[i]; 38 | ntot += sample_weight[i]; 39 | } 40 | ytot / ntot 41 | } 42 | None => fast_sum(y) / y.len() as f64, 43 | } 44 | } 45 | 46 | #[inline] 47 | fn calc_grad_hess( 48 | y: &[f64], 49 | yhat: &[f64], 50 | sample_weight: Option<&[f64]>, 51 | _quantile: Option<f64>, 52 | ) -> (Vec<f32>, Option<Vec<f32>>) { 53 | match sample_weight { 54 | Some(sample_weight) => { 55 | let (g, h) = y 56 | .iter() 57 | .zip(yhat) 58 | .zip(sample_weight) 59 | .map(|((y_, yhat_), w_)| (((yhat_ - *y_) * *w_) as f32, *w_ as f32)) 60 | .unzip(); 61 | (g, Some(h)) 62 | } 63 | None => ( 64 | y.iter().zip(yhat).map(|(y_, yhat_)| (yhat_ - *y_) as f32).collect(), 65 | None, 66 | ), 67 | } 68 | } 69 | 70 | fn default_metric() -> Metric { 71 | Metric::RootMeanSquaredLogError 72 | } 73 | } 74 |
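The dispatch helpers in src/objective_functions/mod.rs above return plain function pointers, so an objective can be selected at runtime without trait objects. Below is a short sketch in the style of the crate's existing unit tests; it could sit in the tests module of mod.rs, uses only the items defined there, and the expected values follow directly from the implementations shown above.

#[test]
fn test_objective_dispatch() {
    let y = vec![1.0, 2.0, 3.0, 4.0, 5.0];
    // SquaredLoss initializes with the mean of the target.
    let init_mean = calc_init_callables(&Objective::SquaredLoss)(&y, None, None);
    assert!((init_mean - 3.0).abs() < 1e-12);
    // QuantileLoss initializes with the requested quantile (here the median).
    let init_median = calc_init_callables(&Objective::QuantileLoss)(&y, None, Some(0.5));
    assert!(init_median == 3.0);
    // At yhat == y the squared loss gradient is zero; its hessian is constant and returned as None.
    let (g, h) = gradient_hessian_callables(&Objective::SquaredLoss)(&y, &y, None, None);
    assert!(g.iter().all(|&gi| gi == 0.0));
    assert!(h.is_none());
}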
-------------------------------------------------------------------------------- /src/partial_dependence.rs: -------------------------------------------------------------------------------- 1 | use crate::{tree::tree::Tree, utils::is_missing}; 2 | 3 | /// Partial Dependence Calculator 4 | // struct PDCalculator { 5 | // partial_dependence: f32, 6 | // base_score: f64, 7 | // tree_prediction: f64, 8 | 9 | // } 10 | 11 | fn get_node_cover(tree: &Tree, node_idx: usize) -> f32 { 12 | tree.nodes[&node_idx].hessian_sum 13 | } 14 | 15 | pub fn tree_partial_dependence( 16 | tree: &Tree, 17 | node_idx: usize, 18 | feature: usize, 19 | value: f64, 20 | proportion: f32, 21 | missing: &f64, 22 | ) -> f64 { 23 | let n = &tree.nodes[&node_idx]; 24 | if n.is_leaf { 25 | f64::from(proportion * n.weight_value) 26 | } else if n.split_feature == feature { 27 | let child = if is_missing(&value, missing) { 28 | n.missing_node 29 | } else if value < n.split_value { 30 | n.left_child 31 | } else { 32 | n.right_child 33 | }; 34 | tree_partial_dependence(tree, child, feature, value, proportion, missing) 35 | } else { 36 | let left_cover = get_node_cover(tree, n.left_child); 37 | let right_cover = get_node_cover(tree, n.right_child); 38 | let missing_cover = if n.has_missing_branch() { 39 | get_node_cover(tree, n.missing_node) 40 | } else { 41 | 0.0 42 | }; 43 | let total_cover = left_cover + right_cover + missing_cover; 44 | let missing_pd = if n.has_missing_branch() { 45 | tree_partial_dependence( 46 | tree, 47 | n.missing_node, 48 | feature, 49 | value, 50 | proportion * (missing_cover / total_cover), 51 | missing, 52 | ) 53 | } else { 54 | 0. 55 | }; 56 | tree_partial_dependence( 57 | tree, 58 | n.left_child, 59 | feature, 60 | value, 61 | proportion * (left_cover / total_cover), 62 | missing, 63 | ) + tree_partial_dependence( 64 | tree, 65 | n.right_child, 66 | feature, 67 | value, 68 | proportion * (right_cover / total_cover), 69 | missing, 70 | ) + missing_pd 71 | } 72 | } 73 | 74 | #[cfg(test)] 75 | mod tests { 76 | 77 | use super::*; 78 | use crate::binning::bin_matrix; 79 | use crate::constraints::ConstraintMap; 80 | use crate::data::Matrix; 81 | use crate::histogram::{NodeHistogram, NodeHistogramOwned}; 82 | use crate::objective_functions::{LogLoss, ObjectiveFunction}; 83 | use crate::splitter::{MissingImputerSplitter, SplitInfo, SplitInfoSlice}; 84 | use crate::tree::tree::Tree; 85 | use std::fs; 86 | 87 | #[test] 88 | fn test_partial_dependence() { 89 | let is_const_hess = false; 90 | 91 | let file = 92 | fs::read_to_string("resources/contiguous_no_missing.csv").expect("Something went wrong reading the file"); 93 | let data_vec: Vec<f64> = file.lines().map(|x| x.parse::<f64>().unwrap()).collect(); 94 | let file = fs::read_to_string("resources/performance.csv").expect("Something went wrong reading the file"); 95 | let y: Vec<f64> = file.lines().map(|x| x.parse::<f64>().unwrap()).collect(); 96 | let yhat = vec![0.5; y.len()]; 97 | let (mut g, mut h) = LogLoss::calc_grad_hess(&y, &yhat, None, None); 98 | let loss = LogLoss::calc_loss(&y, &yhat, None, None); 99 | 100 | let data = Matrix::new(&data_vec, 891, 5); 101 | let splitter = MissingImputerSplitter::new(0.3, true, ConstraintMap::new()); 102 | let mut tree = Tree::new(); 103 | 104 | let b = bin_matrix(&data, None, 300, f64::NAN, None).unwrap(); 105 | let bdata = Matrix::new(&b.binned_data, data.rows, data.cols); 106 | let col_index: Vec<usize> = (0..data.cols).collect(); 107 | 108 | let n_nodes_alloc = 100; 109 | 110 | let mut hist_tree_owned: Vec<NodeHistogramOwned> = (0..n_nodes_alloc) 111 | .map(|_| NodeHistogramOwned::empty_from_cuts(&b.cuts, &col_index, is_const_hess, true)) 112 | .collect(); 113 | 114 | let mut hist_tree: Vec<NodeHistogram> = hist_tree_owned 115 | .iter_mut() 116 | .map(|node_hist| NodeHistogram::from_owned(node_hist)) 117 | .collect(); 118 | 119 | let pool = rayon::ThreadPoolBuilder::new().num_threads(2).build().unwrap(); 120 | 121 | let mut split_info_vec: Vec<SplitInfo> = (0..col_index.len()).map(|_| SplitInfo::default()).collect(); 122 | let split_info_slice = SplitInfoSlice::new(&mut split_info_vec); 123 | 124 | tree.fit( 125 | &bdata, 126 | data.index.to_owned(), 127 | &col_index, 128 | &mut g, 129 | h.as_deref_mut(), 130 | &splitter, 131 | &pool, 132 | Some(f32::MAX), 133 | &loss, 134 | &y, 135 | LogLoss::calc_loss, 136 | &yhat, 137 | None, 138 | None, 139 | false, 140 | &mut hist_tree, 141 | None, 142 | &split_info_slice, 143 | n_nodes_alloc, 144 | ); 145 | 146 | let pdp1 = tree_partial_dependence(&tree, 0, 0, 1.0, 1.0, &f64::NAN); 147 | let pdp2 = tree_partial_dependence(&tree, 0, 0, 2.0, 1.0, &f64::NAN); 148 | let pdp3 = tree_partial_dependence(&tree, 0, 0, 3.0, 1.0, &f64::NAN); 149 | println!("{}, {}, {}", pdp1, pdp2, pdp3); 150 | } 151 | } 152 |
-------------------------------------------------------------------------------- /src/sampler.rs: -------------------------------------------------------------------------------- 1 | use rand::rngs::StdRng; 2 | use rand::Rng; 3 | use serde::{Deserialize, Serialize}; 4 | 5 | #[derive(Serialize, Deserialize)] 6 | pub enum SampleMethod { 7 | None, 8 | Random, 9 | } 10 | 11 | // A sampler can be used to subset the data prior to fitting a new tree. 12 | pub trait Sampler { 13 | /// Sample the data, returning a tuple, where the first item is the samples 14 | /// chosen for training, and the second are the samples excluded. 15 | fn sample(&mut self, rng: &mut StdRng, index: &[usize]) -> (Vec<usize>, Vec<usize>); 16 | } 17 | 18 | pub struct RandomSampler { 19 | subsample: f32, 20 | } 21 | 22 | impl RandomSampler { 23 | #[allow(dead_code)] 24 | pub fn new(subsample: f32) -> Self { 25 | RandomSampler { subsample } 26 | } 27 | } 28 | 29 | impl Sampler for RandomSampler { 30 | fn sample(&mut self, rng: &mut StdRng, index: &[usize]) -> (Vec<usize>, Vec<usize>) { 31 | let subsample = self.subsample; 32 | let mut chosen = Vec::new(); 33 | let mut excluded = Vec::new(); 34 | for i in index { 35 | if rng.random_range(0.0..1.0) < subsample { 36 | chosen.push(*i); 37 | } else { 38 | excluded.push(*i) 39 | } 40 | } 41 | (chosen, excluded) 42 | } 43 | } 44 |
-------------------------------------------------------------------------------- /src/shapley.rs: -------------------------------------------------------------------------------- 1 | use crate::{node::Node, tree::tree::Tree}; 2 | 3 | #[derive(Debug, Clone, Copy)] 4 | struct PathElement { 5 | feature_index: usize, 6 | zero_fraction: f32, 7 | one_fraction: f32, 8 | pweight: f32, 9 | } 10 | 11 | impl Default for PathElement { 12 | fn default() -> Self { 13 | Self { 14 | feature_index: 0, 15 | zero_fraction: 0., 16 | one_fraction: 0., 17 | pweight: 0., 18 | } 19 | } 20 | } 21 | 22 | #[derive(Debug, Clone, Default)] 23 | struct PathList { 24 | paths: Vec<PathElement>, 25 | } 26 | 27 | impl PathList { 28 | fn get_element(&mut self, i: usize) -> &PathElement { 29 | if i == self.paths.len() { 30 | self.paths.push(PathElement::default()); 31 | &self.paths[i] 32 | } else { 33 | // This will panic for us, if we are out of bounds.
34 | &self.paths[i] 35 | } 36 | } 37 | fn get_element_mut(&mut self, i: usize) -> &mut PathElement { 38 | if i == self.paths.len() { 39 | self.paths.push(PathElement::default()); 40 | &mut self.paths[i] 41 | } else { 42 | // This will panic for us, if we are out of bounds. 43 | &mut self.paths[i] 44 | } 45 | } 46 | // fn with_capacity(capacity: usize) -> PathList { 47 | // PathList { 48 | // paths: Vec::with_capacity(capacity), 49 | // } 50 | // } 51 | // fn with_empty(l: usize) -> PathList { 52 | // PathList { 53 | // paths: vec![PathElement::default(); l], 54 | // } 55 | // } 56 | } 57 | 58 | fn extend_path( 59 | unique_path: &mut PathList, 60 | unique_depth: usize, 61 | zero_fraction: f32, 62 | one_fraction: f32, 63 | feature_index: usize, 64 | ) { 65 | unique_path.get_element_mut(unique_depth).feature_index = feature_index; 66 | unique_path.get_element_mut(unique_depth).zero_fraction = zero_fraction; 67 | unique_path.get_element_mut(unique_depth).one_fraction = one_fraction; 68 | unique_path.get_element_mut(unique_depth).pweight = if unique_depth == 0 { 1.0 } else { 0.0 }; 69 | for i in (0..unique_depth).rev() { 70 | unique_path.get_element_mut(i + 1).pweight += 71 | (one_fraction * unique_path.get_element(i).pweight * (i + 1) as f32) / (unique_depth + 1) as f32; 72 | unique_path.get_element_mut(i).pweight = 73 | (zero_fraction * unique_path.get_element(i).pweight * (unique_depth - i) as f32) 74 | / (unique_depth + 1) as f32; 75 | } 76 | } 77 | 78 | fn unwind_path(unique_path: &mut PathList, unique_depth: usize, path_index: usize) { 79 | let one_fraction = unique_path.get_element(path_index).one_fraction; 80 | let zero_fraction = unique_path.get_element(path_index).zero_fraction; 81 | let mut next_one_portion = unique_path.get_element(unique_depth).pweight; 82 | for i in (0..unique_depth).rev() { 83 | if one_fraction != 0. 
{ 84 | let tmp = unique_path.get_element(i).pweight; 85 | unique_path.get_element_mut(i).pweight = 86 | (next_one_portion * (unique_depth + 1) as f32) / ((i + 1) as f32 * one_fraction); 87 | next_one_portion = tmp 88 | - (unique_path.get_element(i).pweight * zero_fraction * (unique_depth - i) as f32) 89 | / (unique_depth + 1) as f32; 90 | } else { 91 | unique_path.get_element_mut(i).pweight = (unique_path.get_element(i).pweight * (unique_depth + 1) as f32) 92 | / (zero_fraction * (unique_depth - i) as f32); 93 | } 94 | } 95 | for i in path_index..unique_depth { 96 | unique_path.get_element_mut(i).feature_index = unique_path.get_element(i + 1).feature_index; 97 | unique_path.get_element_mut(i).zero_fraction = unique_path.get_element(i + 1).zero_fraction; 98 | unique_path.get_element_mut(i).one_fraction = unique_path.get_element(i + 1).one_fraction; 99 | } 100 | } 101 | 102 | fn unwound_path_sum(unique_path: &mut PathList, unique_depth: usize, path_index: usize) -> f32 { 103 | let one_fraction = unique_path.get_element(path_index).one_fraction; 104 | let zero_fraction = unique_path.get_element(path_index).zero_fraction; 105 | let mut next_one_portion = unique_path.get_element(unique_depth).pweight; 106 | let mut total = 0.0; 107 | for i in (0..unique_depth).rev() { 108 | if one_fraction != 0.0 { 109 | let tmp = (next_one_portion * (unique_depth + 1) as f32) / ((i + 1) as f32 * one_fraction); 110 | total += tmp; 111 | next_one_portion = unique_path.get_element(i).pweight 112 | - tmp * zero_fraction * ((unique_depth - i) as f32 / (unique_depth + 1) as f32); 113 | } else if zero_fraction != 0.0 { 114 | total += (unique_path.get_element(i).pweight / zero_fraction) 115 | / ((unique_depth - i) as f32 / (unique_depth + 1) as f32); 116 | } else if unique_path.get_element(i).pweight != 0.0 { 117 | panic!("Unique path {} must have zero weight", i); 118 | } 119 | } 120 | total 121 | } 122 | 123 | fn get_hot_cold_children(next_node_idx: usize, node: &Node) -> Vec { 124 | if node.has_missing_branch() { 125 | // we know there will be 3 children if there is a missing branch. 
126 | if next_node_idx == node.right_child { 127 | vec![node.right_child, node.left_child, node.missing_node] 128 | } else if next_node_idx == node.left_child { 129 | vec![node.left_child, node.right_child, node.missing_node] 130 | } else { 131 | vec![node.missing_node, node.left_child, node.right_child] 132 | } 133 | } else if next_node_idx == node.right_child { 134 | vec![node.right_child, node.left_child] 135 | } else { 136 | vec![node.left_child, node.right_child] 137 | } 138 | } 139 | 140 | #[allow(clippy::too_many_arguments)] 141 | fn tree_shap( 142 | tree: &Tree, 143 | row: &[f64], 144 | contribs: &mut [f64], 145 | node_index: usize, 146 | mut unique_depth: usize, 147 | mut unique_path: PathList, 148 | parent_zero_fraction: f32, 149 | parent_one_fraction: f32, 150 | parent_feature_index: usize, 151 | missing: &f64, 152 | ) { 153 | let node = &tree.nodes[&node_index]; 154 | extend_path( 155 | &mut unique_path, 156 | unique_depth, 157 | parent_zero_fraction, 158 | parent_one_fraction, 159 | parent_feature_index, 160 | ); 161 | if node.is_leaf { 162 | for i in 1..(unique_depth + 1) { 163 | let w = unwound_path_sum(&mut unique_path, unique_depth, i); 164 | let el = unique_path.get_element(i); 165 | contribs[el.feature_index] += f64::from(w * (el.one_fraction - el.zero_fraction) * node.weight_value); 166 | } 167 | } else { 168 | let next_node_idx = node.get_child_idx(&row[node.split_feature], missing); 169 | let hot_cold_children = get_hot_cold_children(next_node_idx, node); 170 | let mut incoming_zero_fraction = 1.0; 171 | let mut incoming_one_fraction = 1.0; 172 | 173 | let mut path_index = 0; 174 | while path_index <= unique_depth { 175 | if unique_path.get_element(path_index).feature_index == node.split_feature { 176 | break; 177 | } 178 | path_index += 1; 179 | } 180 | 181 | if path_index != (unique_depth + 1) { 182 | incoming_zero_fraction = unique_path.get_element(path_index).zero_fraction; 183 | incoming_one_fraction = unique_path.get_element(path_index).one_fraction; 184 | unwind_path(&mut unique_path, unique_depth, path_index); 185 | unique_depth -= 1; 186 | } 187 | 188 | for (i, n_idx) in hot_cold_children.into_iter().enumerate() { 189 | let zero_fraction = (tree.nodes[&n_idx].hessian_sum / node.hessian_sum) * incoming_zero_fraction; 190 | let onf = if i == 0 { incoming_one_fraction } else { 0. }; 191 | tree_shap( 192 | tree, 193 | row, 194 | contribs, 195 | n_idx, 196 | unique_depth + 1, 197 | unique_path.clone(), 198 | zero_fraction, 199 | onf, 200 | node.split_feature, 201 | missing, 202 | ) 203 | } 204 | } 205 | } 206 | 207 | pub fn predict_contributions_row_shapley(tree: &Tree, row: &[f64], contribs: &mut [f64], missing: &f64) { 208 | contribs[contribs.len() - 1] += tree.get_average_leaf_weights(0); 209 | tree_shap( 210 | tree, 211 | row, 212 | contribs, 213 | 0, 214 | 0, 215 | PathList::default(), 216 | 1., 217 | 1., 218 | row.len() + 100, 219 | missing, 220 | ) 221 | } 222 | -------------------------------------------------------------------------------- /src/tree/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod predict; 2 | pub mod tree; 3 | --------------------------------------------------------------------------------
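A note on how src/partial_dependence.rs above combines branches it cannot resolve: when a node splits on a feature other than the one being varied, the recursion descends into every child and weights each child's result by its share of the parent's hessian cover, so the partial dependence is a cover-weighted average of the reachable leaves. The same hessian_sum covers drive the zero_fraction terms in src/shapley.rs. Below is a self-contained sketch of that recursion on a hard-coded two-leaf tree; the MiniNode type and pd helper are illustrative stand-ins, not the crate's Node or tree_partial_dependence, and the missing branch is omitted for brevity.

// Hypothetical stand-ins for Tree/Node, just to illustrate the cover-weighted recursion.
struct MiniNode {
    is_leaf: bool,
    weight: f64,          // leaf value (weight_value)
    cover: f64,           // hessian_sum
    split_feature: usize,
    split_value: f64,
    left: usize,
    right: usize,
}

fn pd(nodes: &[MiniNode], idx: usize, feature: usize, value: f64, prop: f64) -> f64 {
    let n = &nodes[idx];
    if n.is_leaf {
        prop * n.weight
    } else if n.split_feature == feature {
        // The queried feature decides the path deterministically.
        let child = if value < n.split_value { n.left } else { n.right };
        pd(nodes, child, feature, value, prop)
    } else {
        // Unrelated split: average the children, weighted by their share of the cover.
        let (lc, rc) = (nodes[n.left].cover, nodes[n.right].cover);
        let total = lc + rc;
        pd(nodes, n.left, feature, value, prop * lc / total)
            + pd(nodes, n.right, feature, value, prop * rc / total)
    }
}

fn main() {
    // Depth-1 tree splitting on feature 0 at 0.5; leaf values -1.0 and 2.0 with covers 4 and 6.
    let nodes = vec![
        MiniNode { is_leaf: false, weight: 0.0, cover: 10.0, split_feature: 0, split_value: 0.5, left: 1, right: 2 },
        MiniNode { is_leaf: true, weight: -1.0, cover: 4.0, split_feature: 0, split_value: 0.0, left: 0, right: 0 },
        MiniNode { is_leaf: true, weight: 2.0, cover: 6.0, split_feature: 0, split_value: 0.0, left: 0, right: 0 },
    ];
    // Varying the split feature follows the split: value 0.0 goes left.
    assert_eq!(pd(&nodes, 0, 0, 0.0, 1.0), -1.0);
    // Varying any other feature gives the cover-weighted leaf average: 0.4 * -1.0 + 0.6 * 2.0 = 0.8.
    assert!((pd(&nodes, 0, 1, 0.0, 1.0) - 0.8).abs() < 1e-12);
}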