├── treeple
│ ├── _lib
│ │ ├── __init__.py
│ │ └── meson.build
│ ├── tests
│ │ ├── __init__.py
│ │ ├── meson.build
│ │ ├── test_neighbors.py
│ │ ├── test_extensions.py
│ │ └── test_unsupervised_forest.py
│ ├── stats
│ │ ├── tests
│ │ │ ├── __init__.py
│ │ │ ├── meson.build
│ │ │ ├── test_permuteforest.py
│ │ │ ├── test_baseline.py
│ │ │ └── test_utils.py
│ │ ├── meson.build
│ │ └── __init__.py
│ ├── tree
│ │ ├── honesty
│ │ │ ├── __init__.py
│ │ │ ├── meson.build
│ │ │ └── _honest_prune.pxd
│ │ ├── manifold
│ │ │ ├── __init__.py
│ │ │ ├── meson.build
│ │ │ └── _morf_splitter.pxd
│ │ ├── tests
│ │ │ ├── __init__.py
│ │ │ ├── meson.build
│ │ │ └── test_honest_prune.py
│ │ ├── unsupervised
│ │ │ ├── __init__.py
│ │ │ ├── meson.build
│ │ │ ├── _unsup_oblique_tree.pxd
│ │ │ ├── _unsup_splitter.pxd
│ │ │ ├── _unsup_criterion.pxd
│ │ │ ├── _unsup_tree.pxd
│ │ │ └── _unsup_oblique_splitter.pxd
│ │ ├── _sklearn_splitter.pxd
│ │ ├── _marginal.pxd
│ │ ├── kernels.py
│ │ ├── _utils.pxd
│ │ ├── __init__.py
│ │ ├── meson.build
│ │ ├── _oblique_tree.pxd
│ │ └── _neighbors.py
│ ├── datasets
│ │ ├── tests
│ │ │ ├── __init__.py
│ │ │ └── meson.build
│ │ ├── meson.build
│ │ └── __init__.py
│ ├── experimental
│ │ ├── distributions.py
│ │ ├── tests
│ │ │ ├── __init__.py
│ │ │ ├── meson.build
│ │ │ ├── test_simulate.py
│ │ │ ├── test_mutual_info.py
│ │ │ └── test_sdf.py
│ │ ├── meson.build
│ │ └── __init__.py
│ ├── conftest.py
│ ├── ensemble
│ │ ├── meson.build
│ │ └── __init__.py
│ ├── _build_utils
│ │ └── gcc_build_bitness.py
│ ├── __init__.py
│ └── meson.build
├── .codespellignore
├── benchmarks
│ ├── __init__.py
│ ├── utils.py
│ ├── config.json
│ └── ensemble_supervised.py
├── doc
│ ├── sphinxext
│ │ ├── MANIFEST.in
│ │ ├── doi_role.py
│ │ ├── allow_nan_estimators.py
│ │ └── github_link.py
│ ├── _templates
│ │ ├── autosummary
│ │ │ ├── function.rst
│ │ │ └── class.rst
│ │ └── layout.html
│ ├── use.rst
│ ├── user_guide.rst
│ ├── whats_new
│ │ ├── changelog_legend.inc
│ │ ├── v0.10.rst
│ │ ├── v0.9.rst
│ │ ├── v0.5.rst
│ │ ├── _contributors.rst
│ │ ├── v0.3.rst
│ │ ├── v0.8.rst
│ │ ├── v0.4.rst
│ │ ├── v0.2.rst
│ │ ├── v0.1.rst
│ │ ├── v0.6.rst
│ │ └── v0.7.rst
│ ├── whats_new.rst
│ ├── make.bat
│ ├── _static
│ │ ├── style.css
│ │ └── versions.json
│ ├── install.rst
│ ├── index.rst
│ └── modules
│   ├── unsupervised_tree.rst
│   └── ensemble.rst
├── examples
│ ├── README.txt
│ ├── outlier_detection
│ │ └── README.txt
│ ├── splitters
│ │ └── README.txt
│ ├── multiview
│ │ └── README.txt
│ ├── calibration
│ │ ├── README.txt
│ │ └── plot_honest_tree.py
│ ├── quantile_predictions
│ │ ├── README.txt
│ │ ├── plot_quantile_vs_standard_oblique_forest.py
│ │ ├── plot_quantile_toy_example_with_RF.py
│ │ └── plot_quantile_interpolation_with_RF.py
│ ├── sklearn_vs_treeple
│ │ ├── README.txt
│ │ └── plot_iris_dtc.py
│ ├── sparse_oblique_trees
│ │ ├── README.txt
│ │ ├── plot_oblique_axis_aligned_forests_sparse_parity.py
│ │ └── plot_oblique_random_forest.py
│ └── treeple
│   ├── README.txt
│   ├── treeple_tutorial_1_1d_HD.py
│   ├── treeple_tutorial_1_1b_MI.py
│   └── treeple_tutorial_1_1a_SA98.py
├── .github
│ ├── ISSUE_TEMPLATE
│ │ ├── blank.md
│ │ ├── feature_request.md
│ │ └── bug_report.md
│ ├── dependabot.yml
│ ├── label-globs.yml
│ ├── workflows
│ │ ├── pull_request_labeler.yml
│ │ ├── cffconvert.yml
│ │ ├── circle_artifacts.yml
│ │ ├── style.yml
│ │ ├── release.yml
│ │ └── pr_checks.yml
│ ├── FUNDING.yml
│ └── PULL_REQUEST_TEMPLATE.md
├── test_requirements.txt
├── benchmarks_nonasv
│ ├── README.md
│ └── bench_plot_urf.py
├── .gitmodules
├── style_requirements.txt
├── .yamllint.yml
├── spin
├── .flake8
├── .gitignore
├── CITATION.cff
├── .pre-commit-config.yaml
├── meson.build
└── Makefile
/treeple/_lib/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/treeple/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/treeple/stats/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/treeple/tree/honesty/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/treeple/tree/manifold/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/treeple/tree/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/treeple/datasets/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/treeple/experimental/distributions.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/treeple/experimental/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/treeple/tree/unsupervised/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.codespellignore:
--------------------------------------------------------------------------------
1 | raison
2 | nd
3 | parth
4 | ot
5 | fpr
6 | master
--------------------------------------------------------------------------------
/benchmarks/__init__.py:
--------------------------------------------------------------------------------
1 | """Benchmark suite for treeple using ASV"""
2 |
--------------------------------------------------------------------------------
/doc/sphinxext/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include tests *.py
2 | include *.txt
3 |
--------------------------------------------------------------------------------
/examples/README.txt:
--------------------------------------------------------------------------------
1 | Examples
2 | ========
3 |
4 | Examples demonstrating how to use treeple algorithms.
5 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/blank.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Blank issue
3 | about: Create an issue without a template.
4 |
5 | ---
6 |
--------------------------------------------------------------------------------
/test_requirements.txt:
--------------------------------------------------------------------------------
1 | joblib
2 | pandas
3 | pytest
4 | pytest-cov
5 | memory_profiler
6 | flaky
7 | tqdm
8 | bottleneck
9 |
--------------------------------------------------------------------------------
/benchmarks_nonasv/README.md:
--------------------------------------------------------------------------------
1 | A set of scripts that can be run to analyze runtime and performance of treeple
2 | estimators.
3 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 |   - package-ecosystem: "github-actions"
4 |     directory: "/"
5 |     schedule:
6 |       interval: "weekly"
7 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "treeple/_lib/sklearn"]
2 | path = treeple/_lib/sklearn_fork
3 | url = https://github.com/neurodata/scikit-learn
4 | branch = submodulev3
5 |
--------------------------------------------------------------------------------
/examples/outlier_detection/README.txt:
--------------------------------------------------------------------------------
1 | .. _outlier_examples:
2 |
3 | Outlier-detection
4 | -----------------
5 |
6 | Examples concerning how to do outlier detection with decision trees.
7 |
--------------------------------------------------------------------------------
/examples/splitters/README.txt:
--------------------------------------------------------------------------------
1 | .. _splitter_examples:
2 |
3 | Decision-tree splitters
4 | -----------------------
5 |
6 | Examples demonstrating different node-splitting strategies for decision trees.
7 |
--------------------------------------------------------------------------------
/style_requirements.txt:
--------------------------------------------------------------------------------
1 | mypy
2 | black
3 | isort
4 | flake8
5 | bandit
6 | pydocstyle
7 | codespell
8 | toml
9 | cython-lint
10 | pre-commit
11 | yamllint
12 | toml-sort
13 | ruff
14 | rstcheck
15 |
--------------------------------------------------------------------------------
/examples/multiview/README.txt:
--------------------------------------------------------------------------------
1 | .. _multiview_examples:
2 |
3 | Multi-view learning with Decision-trees
4 | ---------------------------------------
5 |
6 | Examples demonstrating multi-view learning using random forest variants.
7 |
--------------------------------------------------------------------------------
/examples/calibration/README.txt:
--------------------------------------------------------------------------------
1 | .. _calibration_examples:
2 |
3 | Calibrated decision trees via honesty
4 | -------------------------------------
5 |
6 | Examples demonstrating the usage of honest decision trees to obtain calibrated predictions.
7 |
--------------------------------------------------------------------------------
/examples/quantile_predictions/README.txt:
--------------------------------------------------------------------------------
1 | .. _quantile_examples:
2 |
3 | Quantile Predictions with Random Forest
4 | ---------------------------------------
5 |
6 | Examples demonstrating how to generate quantile predictions using Random Forest variants.
--------------------------------------------------------------------------------
/examples/sklearn_vs_treeple/README.txt:
--------------------------------------------------------------------------------
1 | .. _sklearn_examples:
2 |
3 | Comparing sklearn and treeple decision trees
4 | --------------------------------------------
5 |
6 | Examples demonstrating the difference between sklearn and treeple decision trees.
7 |
--------------------------------------------------------------------------------
/examples/sparse_oblique_trees/README.txt:
--------------------------------------------------------------------------------
1 | .. _sporf_examples:
2 |
3 | Sparse oblique projections with oblique decision-trees
4 | ------------------------------------------------------
5 |
6 | Examples demonstrating learning using oblique random forests.
7 |
--------------------------------------------------------------------------------
/treeple/datasets/tests/meson.build:
--------------------------------------------------------------------------------
1 | python_sources = [
2 | '__init__.py',
3 | 'test_hyppo.py',
4 | 'test_multiview.py',
5 | ]
6 |
7 | py.install_sources(
8 | python_sources,
9 | pure: false,
10 | subdir: 'treeple/datasets/tests'
11 | )
12 |
--------------------------------------------------------------------------------
/treeple/datasets/meson.build:
--------------------------------------------------------------------------------
1 | python_sources = [
2 | '__init__.py',
3 | 'multiview.py',
4 | 'hyppo.py',
5 | ]
6 |
7 | py.install_sources(
8 | python_sources,
9 | pure: false,
10 | subdir: 'treeple/datasets'
11 | )
12 |
13 | subdir('tests')
14 |
--------------------------------------------------------------------------------
/doc/_templates/autosummary/function.rst:
--------------------------------------------------------------------------------
1 | {{ fullname | escape | underline}}
2 |
3 | .. currentmodule:: {{ module }}
4 |
5 | .. autofunction:: {{ objname }}
6 |
7 | .. _sphx_glr_backreferences_{{ fullname }}:
8 |
9 | .. minigallery:: {{ fullname }}
10 |     :add-heading:
11 |
--------------------------------------------------------------------------------
/treeple/stats/meson.build:
--------------------------------------------------------------------------------
1 | python_sources = [
2 | '__init__.py',
3 | 'forest.py',
4 | 'utils.py',
5 | 'permuteforest.py',
6 | 'baseline.py',
7 | ]
8 |
9 | py.install_sources(
10 | python_sources,
11 | pure: false,
12 | subdir: 'treeple/stats'
13 | )
14 |
15 | subdir('tests')
16 |
--------------------------------------------------------------------------------
/.yamllint.yml:
--------------------------------------------------------------------------------
1 | extends: default
2 |
3 | ignore: |
4 |   treeple/_lib/
5 |   .asv/
6 | 
7 | rules:
8 |   line-length: disable
9 |   document-start: disable
10 |   truthy: disable
11 |   comments: disable
12 |   braces:
13 |     forbid: false
14 |     min-spaces-inside: 0
15 |     max-spaces-inside: 1
16 |
--------------------------------------------------------------------------------
/treeple/experimental/meson.build:
--------------------------------------------------------------------------------
1 | python_sources = [
2 | '__init__.py',
3 | 'mutual_info.py',
4 | 'simulate.py',
5 | 'sdf.py',
6 | 'monte_carlo.py',
7 | ]
8 |
9 | py.install_sources(
10 | python_sources,
11 | pure: false,
12 | subdir: 'treeple/experimental'
13 | )
14 |
15 | subdir('tests')
16 |
--------------------------------------------------------------------------------
/treeple/experimental/tests/meson.build:
--------------------------------------------------------------------------------
1 | python_sources = [
2 | '__init__.py',
3 | 'test_mutual_info.py',
4 | 'test_simulate.py',
5 | 'test_sdf.py',
6 | 'test_monte_carlo.py',
7 | ]
8 |
9 | py.install_sources(
10 | python_sources,
11 | pure: false,
12 | subdir: 'treeple/experimental/tests'
13 | )
14 |
--------------------------------------------------------------------------------
/doc/use.rst:
--------------------------------------------------------------------------------
1 | :orphan:
2 |
3 | Examples using treeple
4 | ==========================
5 |
6 | To use treeple effectively, look through the examples here
7 | to learn everything you need!
8 | 
9 | .. rstcheck: ignore-next-code-block
10 | .. include:: auto_examples/index.rst
11 |     :start-after: :orphan:
12 |
--------------------------------------------------------------------------------
/treeple/stats/tests/meson.build:
--------------------------------------------------------------------------------
1 | python_sources = [
2 | '__init__.py',
3 | 'test_forest.py',
4 | 'test_baseline.py',
5 | 'test_coleman.py',
6 | 'test_utils.py',
7 | 'test_permuteforest.py',
8 | ]
9 |
10 | py.install_sources(
11 | python_sources,
12 | pure: false,
13 | subdir: 'treeple/stats/tests'
14 | )
15 |
--------------------------------------------------------------------------------
/treeple/conftest.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | # With the following global module marker,
4 | # monitoring is disabled by default:
5 | pytestmark = [pytest.mark.monitor_skip_test]
6 |
7 |
8 | def pytest_configure(config):
9 | """Set up pytest markers."""
10 | config.addinivalue_line("markers", "slowtest: mark test as slow")
11 |
--------------------------------------------------------------------------------
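The ``slowtest`` marker registered above can then be applied to individual tests and deselected from the command line; a minimal sketch (the test name here is hypothetical):

import pytest


@pytest.mark.slowtest
def test_large_forest_fit():  # hypothetical test name
    assert True

# Deselect the slow tests on the command line with:
#   pytest -m "not slowtest"

--------------------------------------------------------------------------------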
/examples/treeple/README.txt:
--------------------------------------------------------------------------------
1 | .. _treeple:
2 |
3 | Treeple for Hypothesis Testing
4 | ------------------------------
5 |
6 | Examples concerning how to use treeple as hypothesis test tools.
7 | Tutorials include estimating true statistics with true posterior functions,
8 | using forests to calculate statistic estimates, and calculating p-values.
9 |
--------------------------------------------------------------------------------
/treeple/ensemble/meson.build:
--------------------------------------------------------------------------------
1 | python_sources = [
2 | '__init__.py',
3 | '_supervised_forest.py',
4 | '_unsupervised_forest.py',
5 | '_honest_forest.py',
6 | '_eiforest.py',
7 | '_multiview.py',
8 | '_extensions.py',
9 | ]
10 |
11 | py.install_sources(
12 | python_sources,
13 | pure: false,
14 | subdir: 'treeple/ensemble'
15 | )
16 |
--------------------------------------------------------------------------------
/.github/label-globs.yml:
--------------------------------------------------------------------------------
1 | Cython:
2 | - treeple/**/*.pyx.*
3 | - treeple/**/*.pxd.*
4 | - treeple/**/*.pxi.*
5 |
6 | C/C++:
7 | - treeple/**/*.c
8 | - treeple/**/*.c.in
9 | - treeple/**/*.c.old
10 | - treeple/**/*.h
11 | - treeple/**/*.h.in
12 | - treeple/**/*.cpp
13 | - treeple/**/*.cc
14 | - treeple/**/*.cxx
15 | - treeple/**/*.hpp
16 |
--------------------------------------------------------------------------------
/doc/user_guide.rst:
--------------------------------------------------------------------------------
1 | .. Places parent toc into the sidebar
2 |
3 | :parenttoc: True
4 |
5 | .. title:: User guide: contents
6 |
7 | .. _user_guide:
8 |
9 | ==========
10 | User Guide
11 | ==========
12 |
13 | .. toctree::
14 |     :numbered:
15 |     :maxdepth: 3
16 | 
17 |     modules/supervised_tree
18 |     modules/unsupervised_tree
19 |     modules/ensemble
20 |
--------------------------------------------------------------------------------
/treeple/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | from .hyppo import (
2 | approximate_clf_mutual_information,
3 | approximate_clf_mutual_information_with_monte_carlo,
4 | make_marron_wand_classification,
5 | make_quadratic_classification,
6 | make_trunk_classification,
7 | make_trunk_mixture_classification,
8 | )
9 | from .multiview import make_gaussian_mixture, make_joint_factor_model
10 |
--------------------------------------------------------------------------------
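A minimal sketch of one of these generators; ``make_trunk_classification`` is assumed here to follow the usual scikit-learn ``make_*`` convention of returning ``(X, y)``, and the keyword names should be checked against its docstring:

from treeple.datasets import make_trunk_classification

# Assumed keywords: n_samples / n_dim; verify against the docstring.
X, y = make_trunk_classification(n_samples=100, n_dim=10)
print(X.shape, y.shape)  # expected: (100, 10) (100,)

--------------------------------------------------------------------------------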
/treeple/experimental/__init__.py:
--------------------------------------------------------------------------------
1 | from . import mutual_info, sdf, simulate
2 | from .monte_carlo import conditional_resample
3 | from .mutual_info import (
4 | cmi_from_entropy,
5 | cmi_gaussian,
6 | entropy_gaussian,
7 | entropy_weibull,
8 | mi_from_entropy,
9 | mi_gamma,
10 | mi_gaussian,
11 | mutual_info_ksg,
12 | )
13 | from .sdf import StreamDecisionForest
14 |
--------------------------------------------------------------------------------
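The re-exported ``simulate`` module provides synthetic data generators; this sketch mirrors the usage in ``treeple/experimental/tests/test_simulate.py`` further below:

from treeple.experimental.simulate import simulate_helix

# Latent parameter P plus the 3-D helix coordinates X, Y, Z
P, X, Y, Z = simulate_helix(n_samples=1000)
assert len(P) == len(X) == len(Y) == len(Z) == 1000

--------------------------------------------------------------------------------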
/treeple/tree/_sklearn_splitter.pxd:
--------------------------------------------------------------------------------
1 | from .._lib.sklearn.utils._typedefs cimport float32_t, int32_t, intp_t
2 |
3 | # This defines c-importable functions for other cython files
4 |
5 | # TODO: remove these files when sklearn merges refactor defining these in pxd files
6 | # https://github.com/scikit-learn/scikit-learn/pull/25606
7 | cdef void sort(float32_t* Xf, intp_t* samples, intp_t n) noexcept nogil
8 |
--------------------------------------------------------------------------------
/treeple/tests/meson.build:
--------------------------------------------------------------------------------
1 | python_sources = [
2 | '__init__.py',
3 | 'test_supervised_forest.py',
4 | 'test_unsupervised_forest.py',
5 | 'test_neighbors.py',
6 | 'test_honest_forest.py',
7 | 'test_eiforest.py',
8 | 'test_multiview_forest.py',
9 | 'test_extensions.py',
10 | ]
11 |
12 | py.install_sources(
13 | python_sources,
14 | pure: false,
15 | subdir: 'treeple/tests'
16 | )
17 |
--------------------------------------------------------------------------------
/treeple/tree/tests/meson.build:
--------------------------------------------------------------------------------
1 | python_sources = [
2 | '__init__.py',
3 | 'test_tree.py',
4 | 'test_utils.py',
5 | 'test_honest_tree.py',
6 | 'test_honest_prune.py',
7 | 'test_marginal.py',
8 | 'test_all_trees.py',
9 | 'test_unsupervised_tree.py',
10 | 'test_multiview.py',
11 | ]
12 |
13 | py.install_sources(
14 | python_sources,
15 | pure: false,
16 | subdir: 'treeple/tree/tests'
17 | )
--------------------------------------------------------------------------------
/treeple/stats/__init__.py:
--------------------------------------------------------------------------------
1 | from .baseline import build_cv_forest, build_permutation_forest
2 | from .forest import build_coleman_forest, build_oob_forest
3 | from .permuteforest import PermutationHonestForestClassifier
4 |
5 | __all__ = [
6 | "build_cv_forest",
7 | "build_oob_forest",
8 | "build_coleman_forest",
9 | "build_permutation_forest",
10 | "PermutationHonestForestClassifier",
11 | ]
12 |
--------------------------------------------------------------------------------
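A hedged sketch of the out-of-bag workflow these helpers support. The exact signature and return values of ``build_oob_forest`` are assumptions here and should be verified against its docstring:

import numpy as np

from treeple.ensemble import HonestForestClassifier
from treeple.stats import build_oob_forest

rng = np.random.default_rng(0)
X = rng.standard_normal((200, 4))
y = (X[:, 0] > 0).astype(int)

est = HonestForestClassifier(n_estimators=50, bootstrap=True, random_state=0)
# Assumed to fit `est` and return per-tree out-of-bag posteriors.
est, oob_posteriors = build_oob_forest(est, X, y)

--------------------------------------------------------------------------------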
/treeple/tree/_marginal.pxd:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | cimport numpy as cnp
4 |
5 | from .._lib.sklearn.tree._tree cimport BaseTree, Node
6 | from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t, uint8_t, uint32_t
7 |
8 |
9 | cpdef apply_marginal_tree(
10 | BaseTree tree,
11 | object X,
12 | const intp_t[:] marginal_indices,
13 | intp_t traversal_method,
14 | uint8_t use_sample_weight,
15 | object random_state
16 | )
17 |
--------------------------------------------------------------------------------
/treeple/tree/kernels.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def gaussian_kernel(shape, sigma=1.0, mu=0.0):
5 |     """N-dimensional gaussian kernel for the given shape.
6 | 
7 |     See: https://gist.github.com/liob/e784775e882b83749cb3bbcef480576e
8 |     """
9 |     m = np.meshgrid(*[np.linspace(-1, 1, s) for s in shape])
10 |     d = np.sqrt(np.sum([x * x for x in m], axis=0))
11 |     g = np.exp(-((d - mu) ** 2 / (2.0 * sigma**2)))
12 |     return g / np.sum(g)
13 |
--------------------------------------------------------------------------------
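A quick usage sketch: by construction, the kernel returned above is normalized so its entries sum to one.

import numpy as np

from treeple.tree.kernels import gaussian_kernel

kernel = gaussian_kernel((5, 5), sigma=0.5)
assert kernel.shape == (5, 5)
assert np.isclose(kernel.sum(), 1.0)  # normalized by g / np.sum(g)

--------------------------------------------------------------------------------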
/spin:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Example stub for running `python -m spin`
4 | #
5 | # Copy this into your project root.
6 |
7 | import os
8 | import runpy
9 | import sys
10 |
11 | sys.path.remove(os.path.abspath(os.path.dirname(sys.argv[0])))
12 | try:
13 |     runpy.run_module("spin", run_name="__main__")
14 | except ImportError:
15 |     print("Cannot import spin; please install it using")
16 |     print()
17 |     print("  pip install spin")
18 |     print()
19 |     sys.exit(1)
20 |
--------------------------------------------------------------------------------
/doc/_templates/autosummary/class.rst:
--------------------------------------------------------------------------------
1 | ..
2 | The empty line below should not be removed. It is added such that the `rst_prolog`
3 | is added before the :mod: directive. Otherwise, the rendering will show as a
4 | paragraph instead of a header.
5 |
6 | :mod:`{{module}}`.{{objname}}
7 | {{ underline }}==============
8 |
9 | .. currentmodule:: {{ module }}
10 |
11 | .. autoclass:: {{ objname }}
12 |
13 | .. _sphx_glr_backreferences_{{ fullname }}:
14 |
15 | .. raw:: html
16 |
17 |
18 |
--------------------------------------------------------------------------------
/.github/workflows/pull_request_labeler.yml:
--------------------------------------------------------------------------------
1 | name: "Pull Request Labeler"
2 | on:
3 |   pull_request_target:
4 |     types: [opened]
5 | 
6 | permissions:
7 |   contents: write  # to add labels
8 | 
9 | jobs:
10 |   label_pull_request:
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       - uses: thomasjpfan/labeler@v2.5.1
14 |         continue-on-error: true
15 |         if: github.repository == 'neurodata/treeple'
16 |         with:
17 |           repo-token: "${{ secrets.GITHUB_TOKEN }}"
18 |           configuration-path: ".github/label-globs.yml"
19 |
--------------------------------------------------------------------------------
/treeple/ensemble/__init__.py:
--------------------------------------------------------------------------------
1 | from ._eiforest import ExtendedIsolationForest
2 | from ._honest_forest import HonestForestClassifier
3 | from ._multiview import MultiViewRandomForestClassifier
4 | from ._supervised_forest import (
5 | ExtraObliqueRandomForestClassifier,
6 | ExtraObliqueRandomForestRegressor,
7 | ObliqueRandomForestClassifier,
8 | ObliqueRandomForestRegressor,
9 | PatchObliqueRandomForestClassifier,
10 | PatchObliqueRandomForestRegressor,
11 | )
12 | from ._unsupervised_forest import UnsupervisedObliqueRandomForest, UnsupervisedRandomForest
13 |
--------------------------------------------------------------------------------
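These ensembles follow the scikit-learn estimator API, so a minimal sketch looks like the following (the hyperparameter values are illustrative assumptions):

import numpy as np

from treeple.ensemble import ObliqueRandomForestClassifier

rng = np.random.default_rng(0)
X = rng.standard_normal((200, 5))
y = (X[:, 0] + X[:, 1] > 0).astype(int)

# Standard scikit-learn fit/predict cycle
clf = ObliqueRandomForestClassifier(n_estimators=100, random_state=0).fit(X, y)
print(clf.predict(X[:5]))

--------------------------------------------------------------------------------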
/.github/workflows/cffconvert.yml:
--------------------------------------------------------------------------------
1 | name: cffconvert
2 |
3 | on:
4 | push:
5 | paths:
6 | - CITATION.cff
7 | pull_request:
8 | paths:
9 | - CITATION.cff
10 |
11 | jobs:
12 | validate:
13 | name: "validate"
14 | runs-on: ubuntu-latest
15 | steps:
16 | - name: Check out a copy of the repository
17 | uses: actions/checkout@v4
18 |
19 | - name: Check whether the citation metadata from CITATION.cff is valid
20 | uses: citation-file-format/cffconvert-github-action@2.0.0
21 | with:
22 | args: "--validate"
23 |
--------------------------------------------------------------------------------
/doc/whats_new/changelog_legend.inc:
--------------------------------------------------------------------------------
1 | Legend for changelogs
2 | ---------------------
3 |
4 | - |MajorFeature|: something big that you couldn't do before.
5 | - |Feature|: something that you couldn't do before.
6 | - |Efficiency|: an existing feature now may not require as much computation or
7 |   memory.
8 | - |Enhancement|: a miscellaneous minor improvement.
9 | - |Fix|: something that previously didn't work as documented -- or according
10 |   to reasonable expectations -- should now work.
11 | - |API|: you will need to change your code to have the same effect in the
12 |   future; or a feature will be removed in the future.
13 |
--------------------------------------------------------------------------------
/treeple/_build_utils/gcc_build_bitness.py:
--------------------------------------------------------------------------------
1 | #!python
2 | """ Detect bitness (32 or 64) of Mingw-w64 gcc build target on Windows.
3 | """
4 |
5 | import re
6 | from subprocess import run
7 |
8 |
9 | def main():
10 |     res = run(["gcc", "-v"], check=True, text=True, capture_output=True)
11 |     target = re.search(r"^Target: (.*)$", res.stderr, flags=re.M).groups()[0]
12 |     if target.startswith("i686"):
13 |         print("32")
14 |     elif target.startswith("x86_64"):
15 |         print("64")
16 |     else:
17 |         raise RuntimeError("Could not detect Mingw-w64 bitness")
18 | 
19 | 
20 | if __name__ == "__main__":
21 |     main()
22 |
--------------------------------------------------------------------------------
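For reference, the regex in ``main()`` pulls the build-target triple out of the stderr of ``gcc -v``; a self-contained sketch against an assumed sample of that output:

import re

sample_stderr = "Target: x86_64-w64-mingw32"  # assumed `gcc -v` stderr line
target = re.search(r"^Target: (.*)$", sample_stderr, flags=re.M).groups()[0]
assert target.startswith("x86_64")  # main() would print "64"

--------------------------------------------------------------------------------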
/treeple/tree/honesty/meson.build:
--------------------------------------------------------------------------------
1 | tree_extension_metadata = {
2 | '_honest_prune':
3 | {'sources': ['_honest_prune.pyx'],
4 | 'override_options': ['cython_language=cpp', 'optimization=3']},
5 | }
6 |
7 | foreach ext_name, ext_dict : tree_extension_metadata
8 | py.extension_module(
9 | ext_name,
10 | ext_dict.get('sources'),
11 | dependencies: [np_dep],
12 | override_options : ext_dict.get('override_options', []),
13 | c_args: c_args,
14 | cython_args: cython_c_args,
15 | subdir: 'treeple/tree/honesty',
16 | install: true,
17 | )
18 | endforeach
19 |
20 |
21 | py.install_sources(
22 | subdir: 'treeple/tree/honesty' # Folder relative to site-packages to install to
23 | )
24 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: 'Feature request'
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/treeple/tree/manifold/meson.build:
--------------------------------------------------------------------------------
1 | tree_extension_metadata = {
2 | '_morf_splitter':
3 | {'sources': ['_morf_splitter.pyx'],
4 | 'override_options': ['cython_language=cpp', 'optimization=3']},
5 | }
6 |
7 | foreach ext_name, ext_dict : tree_extension_metadata
8 | py.extension_module(
9 | ext_name,
10 | ext_dict.get('sources'),
11 | dependencies: [np_dep],
12 | override_options : ext_dict.get('override_options', []),
13 | c_args: c_args,
14 | cython_args: cython_c_args,
15 | subdir: 'treeple/tree/manifold',
16 | install: true,
17 | )
18 | endforeach
19 |
20 |
21 | py.install_sources(
22 | subdir: 'treeple/tree/manifold' # Folder relative to site-packages to install to
23 | )
24 |
--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 100
3 |
4 | ignore =
5 |     # these rules don't play well with black
6 |     # whitespace before ':'
7 |     E203
8 |     # line break before binary operator
9 |     W503
10 |     E241,E305,W504,W605,E731
11 |     E402
12 | 
13 | exclude =
14 |     .git
15 |     .github
16 |     .venv
17 |     .mypy_cache
18 |     .pytest_cache
19 |     .circleci
20 |     paper
21 |     doc/_build
22 |     doc/generated
23 |     doc/auto_examples
24 |     validation
25 |     build
26 |     build-install
27 |     dist
28 |     treeple/_lib/
29 |     .asv
30 |     env
31 | 
32 | per-file-ignores =
33 |     # __init__.py files are allowed to have unused imports
34 |     */__init__.py:F401
35 |     */**/__init__.py:F401
36 |
--------------------------------------------------------------------------------
/doc/whats_new.rst:
--------------------------------------------------------------------------------
1 | :orphan:
2 |
3 | .. _whats_new:
4 |
5 | .. include:: whats_new/_contributors.rst
6 |
7 | Release History
8 | ===============
9 |
10 | Release notes for all treeple releases are linked in this page.
11 |
12 | **Tip:** Subscribe to treeple releases on libraries.io
13 | to be notified when new versions are released.
14 | 
15 | .. toctree::
16 |     :maxdepth: 1
17 | 
18 |     Version 0.1 <whats_new/v0.1>
19 |     Version 0.2 <whats_new/v0.2>
20 |     Version 0.3 <whats_new/v0.3>
21 |     Version 0.4 <whats_new/v0.4>
22 |     Version 0.5 <whats_new/v0.5>
23 |     Version 0.6 <whats_new/v0.6>
24 |     Version 0.7 <whats_new/v0.7>
25 |     Version 0.8 <whats_new/v0.8>
26 |     Version 0.9 <whats_new/v0.9>
27 |     Version 0.10 (Unreleased) <whats_new/v0.10>
28 |
29 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Distribution / packaging
2 | .Python
3 | dist/
4 | *.egg*
5 | build
6 | build-install/
7 | coverage
8 | *.xml
9 | .venv
10 | .pymon
11 | .coverage.*
12 |
13 | commit.txt
14 | treeple/_lib/sklearn/
15 |
16 | *.png
17 | _data
18 |
19 | # Sphinx documentation
20 | doc/_build/
21 | doc/generated/
22 | doc/auto_examples/
23 | doc/auto_tutorials/
24 | doc/modules/generated/
25 | doc/sphinxext/cachedir
26 | pip-log.txt
27 | .coverage
28 | tags
29 | doc/coverages
30 | doc/samples
31 | cover
32 | examples/*.jpg
33 | examples/**/*.jpg
34 |
35 | env/
36 | html/
37 | results/
38 | scikit-learn/
39 | benchmarks/cache/
40 |
41 | # Pycharm
42 | .idea/
43 |
44 | *.pyc
45 |
46 | *.so
47 | *.cpp
48 | *.c
49 |
50 | .cache
51 | .pytest_cache
52 | .ipynb_checkpoints
53 | .DS_Store
54 | .vscode/
55 |
56 | __pycache__
57 |
58 | # Profiling
59 | profiling/
60 | *.prof
--------------------------------------------------------------------------------
/doc/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | set SPHINXPROJ=treeple
13 |
14 | if "%1" == "" goto help
15 |
16 | %SPHINXBUILD% >NUL 2>NUL
17 | if errorlevel 9009 (
18 | echo.
19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
20 | echo.installed, then set the SPHINXBUILD environment variable to point
21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
22 | echo.may add the Sphinx directory to PATH.
23 | echo.
24 | echo.If you don't have Sphinx installed, grab it from
25 | echo.http://sphinx-doc.org/
26 | exit /b 1
27 | )
28 |
29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
30 | goto end
31 |
32 | :help
33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
34 |
35 | :end
36 | popd
37 |
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: [adam2392, PSSF23, sampan501, SUKI-O] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
4 | patreon: # Replace with a single Patreon username
5 | open_collective: # Replace with a single Open Collective username
6 | ko_fi: # Replace with a single Ko-fi username
7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
12 | polar: # Replace with a single Polar username
13 | buy_me_a_coffee: adam2392 # Replace with a single Buy Me a Coffee username
14 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
15 |
--------------------------------------------------------------------------------
/doc/_templates/layout.html:
--------------------------------------------------------------------------------
1 | {%- extends "pydata_sphinx_theme/layout.html" %}
2 |
3 | {% block fonts %}
4 |
5 |
6 |
7 |
8 |
12 | {% endblock %}
13 |
14 | {% block extrahead %}
15 |
16 |
17 |
18 | {{ super() }}
19 | {% endblock %}
20 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 |
5 |
6 | #### Reference Issues/PRs
7 |
13 |
14 |
15 | #### What does this implement/fix? Explain your changes.
16 |
17 |
18 | #### Any other comments?
19 |
20 |
21 |
31 |
--------------------------------------------------------------------------------
/doc/_static/style.css:
--------------------------------------------------------------------------------
1 | a[class^="sphx-glr-backref-module-scikit_tree"] {
2 | /* make all treeple backrefs bold */
3 | font-weight: 800;
4 | }
5 |
6 | /* Disable hyphenation in API reference table for Webkit-based browsers
7 | to work around alignment bug */
8 | #api-documentation table p {
9 | -webkit-hyphens: none;
10 | }
11 |
12 | /* Hide version number from top-left location in the navbar */
13 | .navbar-version {
14 | display: none;
15 | }
16 |
17 | html {
18 | font-size: 16px;
19 | }
20 |
21 | h1 {
22 | font-size: 1.6rem;
23 | }
24 |
25 | h2 {
26 | font-size: 1.3rem;
27 | }
28 |
29 | h3 {
30 | font-size: 1rem;
31 | font-weight: bold;
32 | }
33 |
34 | h4 {
35 | font-size: 1rem;
36 | }
37 |
38 | .footer {
39 | margin-top: 3em;
40 | padding-top: 1em;
41 | }
42 |
43 | /* Links in the Note boxes */
44 | .note a {
45 | color: blue;
46 | text-decoration: underline;
47 | }
48 |
49 | .note a:hover {
50 | color: blue;
51 | font-weight: bold;
52 | text-decoration: underline;
53 | }
54 |
55 | /* Links in "Note" boxes */
56 | .alert-info a code span {
57 | color: blue;
58 | }
59 |
--------------------------------------------------------------------------------
/doc/whats_new/v0.10.rst:
--------------------------------------------------------------------------------
1 | :orphan:
2 |
3 | .. include:: _contributors.rst
4 | .. currentmodule:: treeple
5 |
6 | .. _current:
7 |
8 | Version 0.10
9 | ============
10 |
11 | **In Development**
12 |
13 | Changelog
14 | ---------
15 |
16 | - |Feature| Calculations involving nans in ``treeple.stats.utils`` now use the
17 |   ``bottleneck`` library for faster computation. By `Ryan Hausen`_ (:pr:`#306`)
18 | - |Feature| Added a sparse implementation of `treeple.stats.forest.build_coleman_forest`
19 |   that uses the `scipy.sparse` module. By `Ryan Hausen`_ (:pr:`#317`)
20 | - |Feature| :class:`treeple.tree.HonestTreeClassifier` now has a ``honest_method`` parameter
21 |   that enables the user to turn on pruning of the tree, such that there are no
22 |   empty leaf predictions. This brings the model closer to the GRF implementation in R.
23 |   By `Adam Li`_ (:pr:`#286`)
24 |
25 |
26 | Code and Documentation Contributors
27 | -----------------------------------
28 |
29 | Thanks to everyone who has contributed to the maintenance and improvement of
30 | the project since version inception, including:
31 |
32 | * `Adam Li`_
33 | * `Ryan Hausen`_
34 |
--------------------------------------------------------------------------------
/.github/workflows/circle_artifacts.yml:
--------------------------------------------------------------------------------
1 | name: CircleCI artifacts redirector
2 | on: [status]
3 |
4 | # Restrict the permissions granted to the use of secrets.GITHUB_TOKEN in this
5 | # github actions workflow:
6 | # https://docs.github.com/en/actions/security-guides/automatic-token-authentication
7 | permissions: read-all
8 |
9 | jobs:
10 |   circleci_artifacts_redirector_job:
11 |     runs-on: ubuntu-20.04
12 |     if: "github.repository == 'neurodata/treeple' && github.event.context == 'ci/circleci: build_docs'"
13 |     permissions:
14 |       statuses: write
15 |     name: Run CircleCI artifacts redirector
16 |     steps:
17 |       - name: GitHub Action step
18 |         id: step1  # referenced below as steps.step1.outputs.url
19 |         uses: larsoner/circleci-artifacts-redirector-action@master
20 |         with:
21 |           repo-token: ${{ secrets.GITHUB_TOKEN }}
22 |           api-token: ${{ secrets.CIRCLECI_TOKEN }}
23 |           artifact-path: 0/dev/index.html
24 |           circleci-jobs: build_docs
25 |           job-title: Check the rendered docs here!
26 | 
27 |       - name: Check the URL
28 |         if: github.event.status != 'pending'
29 |         run: |
30 |           curl --fail ${{ steps.step1.outputs.url }} | grep $GITHUB_SHA
30 |
--------------------------------------------------------------------------------
/treeple/tree/_utils.pxd:
--------------------------------------------------------------------------------
1 | from libcpp.vector cimport vector
2 |
3 | import numpy as np
4 |
5 | cimport numpy as cnp
6 |
7 | cnp.import_array()
8 |
9 | from .._lib.sklearn.tree._splitter cimport SplitRecord
10 | from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int32_t, intp_t, uint32_t
11 |
12 | ctypedef fused vector_or_memview:
13 |     vector[intp_t]
14 |     intp_t[::1]
15 |     intp_t[:]
16 |
17 |
18 | cdef void fisher_yates_shuffle(
19 | vector_or_memview indices_to_sample,
20 | intp_t grid_size,
21 | uint32_t* random_state,
22 | ) noexcept nogil
23 |
24 |
25 | cdef int rand_weighted_binary(
26 | float64_t p0,
27 | uint32_t* random_state
28 | ) noexcept nogil
29 |
30 | cpdef unravel_index(
31 | intp_t index,
32 | cnp.ndarray[intp_t, ndim=1] shape
33 | )
34 |
35 | cpdef ravel_multi_index(
36 | intp_t[:] coords,
37 | const intp_t[:] shape
38 | )
39 |
40 | cdef void unravel_index_cython(
41 | intp_t index,
42 | const intp_t[:] shape,
43 | vector_or_memview coords
44 | ) noexcept nogil
45 |
46 | cdef intp_t ravel_multi_index_cython(
47 | vector_or_memview coords,
48 | const intp_t[:] shape
49 | ) noexcept nogil
50 |
--------------------------------------------------------------------------------
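``unravel_index`` and ``ravel_multi_index`` are declared ``cpdef``, so they are callable from Python once the extension module is built. A hedged sketch, assuming they mirror their NumPy namesakes:

import numpy as np

from treeple.tree._utils import ravel_multi_index, unravel_index

shape = np.array([3, 4], dtype=np.intp)
coords = unravel_index(7, shape)  # expected to match np.unravel_index(7, (3, 4)) -> (1, 3)
index = ravel_multi_index(np.asarray(coords, dtype=np.intp), shape)  # expected: 7

--------------------------------------------------------------------------------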
/treeple/experimental/tests/test_simulate.py:
--------------------------------------------------------------------------------
1 | from treeple.experimental.simulate import (
2 | simulate_helix,
3 | simulate_multivariate_gaussian,
4 | simulate_sphere,
5 | )
6 |
7 |
8 | # Test simulate_helix function
9 | def test_simulate_helix():
10 |     P, X, Y, Z = simulate_helix(n_samples=1000)
11 |     assert len(P) == 1000
12 |     assert len(X) == 1000
13 |     assert len(Y) == 1000
14 |     assert len(Z) == 1000
15 | 
16 |     # Add more specific tests if necessary
17 | 
18 | 
19 | # Test simulate_sphere function
20 | def test_simulate_sphere():
21 |     latitude, longitude, Y1, Y2, Y3 = simulate_sphere(n_samples=1000)
22 |     assert len(latitude) == 1000
23 |     assert len(longitude) == 1000
24 |     assert len(Y1) == 1000
25 |     assert len(Y2) == 1000
26 |     assert len(Y3) == 1000
27 | 
28 |     # Add more specific tests if necessary
29 | 
30 | 
31 | # Test simulate_multivariate_gaussian function
32 | def test_simulate_multivariate_gaussian():
33 |     data, mean, cov = simulate_multivariate_gaussian(d=2, n_samples=1000)
34 |     assert data.shape == (1000, 2)
35 |     assert mean.shape == (2,)
36 |     assert cov.shape == (2, 2)
37 | 
38 |     # Add more specific tests if necessary
39 |
--------------------------------------------------------------------------------
/doc/whats_new/v0.9.rst:
--------------------------------------------------------------------------------
1 | :orphan:
2 |
3 | .. include:: _contributors.rst
4 | .. currentmodule:: treeple
5 |
6 | .. _v0_9:
7 |
8 | Version 0.9
9 | ===========
10 |
11 | This release includes a rename of the package from ``scikit-tree`` to ``treeple``.
12 | Users can update their previous usage as follows:
13 | ``import sktree`` becomes ``import treeple``
14 | ``from sktree import tree`` becomes ``from treeple import tree``
15 | ``from sktree import ...`` becomes ``from treeple import ...``
17 | Note that the previous version of the package will still be available under the name ``scikit-tree`` on PyPI.
18 |
19 | Changelog
20 | ---------
21 |
22 | - |API| Rename the package to ``treeple``. By `SUKI-O`_ (:pr:`#292`)
23 | - |Fix| Fixed a bug in the ``predict_proba`` function of :class:`treeple.HonestForestClassifier`, where posteriors
24 |   estimated on an empty leaf with the ``ignore`` prior would result in ``np.nan``
25 |   values for all trees on that sample.
26 |   By `Haoyin Xu`_ (:pr:`#291`)
27 |
28 | Code and Documentation Contributors
29 | -----------------------------------
30 |
31 | Thanks to everyone who has contributed to the maintenance and improvement of
32 | the project since version inception, including:
33 |
34 | * `Adam Li`_
35 | * `SUKI-O`_
36 | * `Haoyin Xu`_
37 |
--------------------------------------------------------------------------------
/treeple/tree/unsupervised/meson.build:
--------------------------------------------------------------------------------
1 | tree_extension_metadata = {
2 | '_unsup_criterion':
3 | {'sources': ['_unsup_criterion.pyx'],
4 | 'override_options': ['cython_language=cpp', 'optimization=3']},
5 | '_unsup_splitter':
6 | {'sources': ['_unsup_splitter.pyx'],
7 | 'override_options': ['cython_language=cpp', 'optimization=3']},
8 | '_unsup_tree':
9 | {'sources': ['_unsup_tree.pyx'],
10 | 'override_options': ['cython_language=cpp', 'optimization=3']},
11 | '_unsup_oblique_splitter':
12 | {'sources': ['_unsup_oblique_splitter.pyx'],
13 | 'override_options': ['cython_language=cpp', 'optimization=3']},
14 | '_unsup_oblique_tree':
15 | {'sources': ['_unsup_oblique_tree.pyx'],
16 | 'override_options': ['cython_language=cpp', 'optimization=3']},
17 | }
18 |
19 | foreach ext_name, ext_dict : tree_extension_metadata
20 | py.extension_module(
21 | ext_name,
22 | ext_dict.get('sources'),
23 | dependencies: [np_dep],
24 | override_options : ext_dict.get('override_options', []),
25 | c_args: c_args,
26 | cython_args: cython_c_args,
27 | subdir: 'treeple/tree/unsupervised',
28 | install: true,
29 | )
30 | endforeach
31 |
32 |
33 | py.install_sources(
34 | subdir: 'treeple/tree/unsupervised' # Folder relative to site-packages to install to
35 | )
36 |
--------------------------------------------------------------------------------
/treeple/tree/__init__.py:
--------------------------------------------------------------------------------
1 | from .._lib.sklearn.tree import (
2 | DecisionTreeClassifier,
3 | DecisionTreeRegressor,
4 | ExtraTreeClassifier,
5 | ExtraTreeRegressor,
6 | )
7 | from ._classes import (
8 | ExtraObliqueDecisionTreeClassifier,
9 | ExtraObliqueDecisionTreeRegressor,
10 | ObliqueDecisionTreeClassifier,
11 | ObliqueDecisionTreeRegressor,
12 | PatchObliqueDecisionTreeClassifier,
13 | PatchObliqueDecisionTreeRegressor,
14 | UnsupervisedDecisionTree,
15 | UnsupervisedObliqueDecisionTree,
16 | )
17 | from ._honest_tree import HonestTreeClassifier
18 | from ._multiview import MultiViewDecisionTreeClassifier
19 | from ._neighbors import compute_forest_similarity_matrix
20 |
21 | __all__ = [
22 | "ExtraObliqueDecisionTreeClassifier",
23 | "ExtraObliqueDecisionTreeRegressor",
24 | "compute_forest_similarity_matrix",
25 | "UnsupervisedDecisionTree",
26 | "UnsupervisedObliqueDecisionTree",
27 | "ObliqueDecisionTreeClassifier",
28 | "ObliqueDecisionTreeRegressor",
29 | "PatchObliqueDecisionTreeClassifier",
30 | "PatchObliqueDecisionTreeRegressor",
31 | "HonestTreeClassifier",
32 | "DecisionTreeClassifier",
33 | "DecisionTreeRegressor",
34 | "ExtraTreeClassifier",
35 | "ExtraTreeRegressor",
36 | "MultiViewDecisionTreeClassifier",
37 | ]
38 |
--------------------------------------------------------------------------------
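Note that the vanilla scikit-learn trees are re-exported alongside the treeple variants, so the two can be swapped behind a single import; a small sketch:

from treeple.tree import DecisionTreeClassifier, ObliqueDecisionTreeClassifier

# Both expose the same scikit-learn estimator API and can be swapped freely.
for Tree in (DecisionTreeClassifier, ObliqueDecisionTreeClassifier):
    clf = Tree(random_state=0).fit([[0.0], [1.0], [2.0], [3.0]], [0, 0, 1, 1])
    print(Tree.__name__, clf.predict([[0.2]]))

--------------------------------------------------------------------------------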
/.github/workflows/style.yml:
--------------------------------------------------------------------------------
1 | name: "Style checks"
2 |
3 | concurrency:
4 |   group: ${{ github.workflow }}-${{ github.event.number }}-${{ github.event.type }}
5 |   cancel-in-progress: true
6 | 
7 | on:
8 |   pull_request:
9 |     paths:
10 |       - "**.py"
11 |       - "**.pxd"
12 |       - "**.pyx"
13 |   push:
14 |     branches: [main]
15 |     paths:
16 |       - "**.py"
17 |     tags:
18 |       - "v*.*.*"
19 |   workflow_dispatch:
20 | 
21 | permissions:
22 |   contents: read  # to fetch code (actions/checkout)
23 | 
24 | jobs:
25 |   style:
26 |     name: Formatting, lint, style, and type-checks
27 |     timeout-minutes: 10
28 |     runs-on: ubuntu-latest
29 |     steps:
30 |       - name: Checkout repository
31 |         uses: actions/checkout@v4
32 |       - name: Setup Python 3.11
33 |         uses: actions/setup-python@v5
34 |         with:
35 |           python-version: "3.11"
36 |           architecture: "x64"
37 | 
38 |       - name: Install packages for Ubuntu
39 |         run: |
40 |           sudo apt-get update
41 |           sudo apt-get install -y libopenblas-dev libatlas-base-dev liblapack-dev
42 | 
43 |       - name: Install dependencies
44 |         run: |
45 |           pip install --upgrade pip
46 |           pip install -r style_requirements.txt
47 | 
48 |       # check formatting of the code style
49 |       - name: Check code formatting
50 |         run: make pre-commit
51 |
--------------------------------------------------------------------------------
/doc/_static/versions.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "name": "0.10",
4 | "version": "dev",
5 | "url": "https://docs.neurodata.io/treeple/dev/"
6 | },
7 | {
8 | "name": "0.9",
9 | "version": "0.9",
10 | "url": "https://docs.neurodata.io/treeple/v0.9/"
11 | },
12 | {
13 | "name": "0.8",
14 | "version": "0.8",
15 | "url": "https://docs.neurodata.io/treeple/v0.8/"
16 | },
17 | {
18 | "name": "0.7",
19 | "version": "0.7",
20 | "url": "https://docs.neurodata.io/treeple/v0.7/"
21 | },
22 | {
23 | "name": "0.6",
24 | "version": "0.6",
25 | "url": "https://docs.neurodata.io/treeple/v0.6/"
26 | },
27 | {
28 | "name": "0.5",
29 | "version": "0.5",
30 | "url": "https://docs.neurodata.io/treeple/v0.5/"
31 | },
32 | {
33 | "name": "0.4",
34 | "version": "0.4",
35 | "url": "https://docs.neurodata.io/treeple/v0.4/"
36 | },
37 | {
38 | "name": "0.3",
39 | "version": "0.3",
40 | "url": "https://docs.neurodata.io/treeple/v0.3/"
41 | },
42 | {
43 | "name": "0.2",
44 | "version": "0.2",
45 | "url": "https://docs.neurodata.io/treeple/v0.2/"
46 | },
47 | {
48 | "name": "0.1",
49 | "version": "0.1",
50 | "url": "https://docs.neurodata.io/treeple/v0.1/"
51 | }
52 | ]
53 |
--------------------------------------------------------------------------------
/benchmarks/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.metrics import balanced_accuracy_score, r2_score
3 |
4 |
5 | def neg_mean_inertia(X, labels, centers):
6 |     return -(np.asarray(X - centers[labels]) ** 2).sum(axis=1).mean()
7 | 
8 | 
9 | def make_gen_classif_scorers(caller):
10 |     caller.train_scorer = balanced_accuracy_score
11 |     caller.test_scorer = balanced_accuracy_score
12 | 
13 | 
14 | def make_gen_reg_scorers(caller):
15 |     caller.test_scorer = r2_score
16 |     caller.train_scorer = r2_score
17 | 
18 | 
19 | def neg_mean_data_error(X, U, V):
20 |     return -np.sqrt(((X - U.dot(V)) ** 2).mean())
21 | 
22 | 
23 | def make_dict_learning_scorers(caller):
24 |     caller.train_scorer = lambda _, __: (
25 |         neg_mean_data_error(
26 |             caller.X, caller.estimator.transform(caller.X), caller.estimator.components_
27 |         )
28 |     )
29 |     caller.test_scorer = lambda _, __: (
30 |         neg_mean_data_error(
31 |             caller.X_val,
32 |             caller.estimator.transform(caller.X_val),
33 |             caller.estimator.components_,
34 |         )
35 |     )
36 | 
37 | 
38 | def explained_variance_ratio(Xt, X):
39 |     return np.var(Xt, axis=0).sum() / np.var(X, axis=0).sum()
40 | 
41 | 
42 | def make_pca_scorers(caller):
43 |     caller.train_scorer = lambda _, __: caller.estimator.explained_variance_ratio_.sum()
44 |     caller.test_scorer = lambda _, __: (
45 |         explained_variance_ratio(caller.estimator.transform(caller.X_val), caller.X_val)
46 |     )
47 |
--------------------------------------------------------------------------------
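A small sketch of ``explained_variance_ratio`` on a PCA-style projection (assumes NumPy and scikit-learn, run from within ``benchmarks/``):

import numpy as np
from sklearn.decomposition import PCA

from utils import explained_variance_ratio  # this module, when run from benchmarks/

X = np.random.default_rng(0).normal(size=(200, 4))
Xt = PCA(n_components=2).fit_transform(X)
print(explained_variance_ratio(Xt, X))  # fraction of total variance retained

--------------------------------------------------------------------------------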
/treeple/tree/meson.build:
--------------------------------------------------------------------------------
1 | tree_extension_metadata = {
2 | '_sklearn_splitter':
3 | {'sources': ['_sklearn_splitter.pyx'],
4 | 'override_options': ['cython_language=cpp', 'optimization=3']},
5 | '_oblique_splitter':
6 | {'sources': ['_oblique_splitter.pyx'],
7 | 'override_options': ['cython_language=cpp', 'optimization=3']},
8 | '_oblique_tree':
9 | {'sources': ['_oblique_tree.pyx'],
10 | 'override_options': ['cython_language=cpp', 'optimization=3']},
11 | '_utils':
12 | {'sources': ['_utils.pyx'],
13 | 'override_options': ['cython_language=cpp', 'optimization=3']},
14 | '_marginal':
15 | {'sources': ['_marginal.pyx'],
16 | 'override_options': ['cython_language=cpp', 'optimization=3']},
17 | }
18 |
19 | foreach ext_name, ext_dict : tree_extension_metadata
20 | py.extension_module(
21 | ext_name,
22 | ext_dict.get('sources'),
23 | dependencies: [np_dep],
24 | override_options : ext_dict.get('override_options', []),
25 | c_args: c_args,
26 | cython_args: cython_c_args,
27 | subdir: 'treeple/tree',
28 | install: true,
29 | )
30 | endforeach
31 |
32 | python_sources = [
33 | '__init__.py',
34 | '_classes.py',
35 | '_multiview.py',
36 | '_neighbors.py',
37 | '_honest_tree.py',
38 | '_marginalize.py',
39 | ]
40 |
41 | py.install_sources(
42 | python_sources,
43 | subdir: 'treeple/tree' # Folder relative to site-packages to install to
44 | )
45 |
46 | subdir('tests')
47 | subdir('unsupervised')
48 | subdir('manifold')
49 | subdir('honesty')
--------------------------------------------------------------------------------
/treeple/tree/unsupervised/_unsup_oblique_tree.pxd:
--------------------------------------------------------------------------------
1 | # distutils: language = c++
2 |
3 | # Authors: Adam Li
4 | #
5 | # License: BSD 3 clause
6 |
7 | # See _unsup_oblique_tree.pyx for details.
8 |
9 | import numpy as np
10 |
11 | cimport numpy as cnp
12 | from libcpp.vector cimport vector
13 |
14 | from ..._lib.sklearn.tree._splitter cimport SplitRecord
15 | from ..._lib.sklearn.tree._tree cimport Node
16 | from ..._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t
17 | from .._oblique_splitter cimport ObliqueSplitRecord
18 | from ._unsup_tree cimport UnsupervisedTree
19 |
20 |
21 | cdef class UnsupervisedObliqueTree(UnsupervisedTree):
22 |     cdef vector[vector[float32_t]] proj_vec_weights  # (capacity, n_features) array of projection weights
23 |     cdef vector[vector[intp_t]] proj_vec_indices     # (capacity, n_features) array of projection indices
24 | 
25 |     # overridden methods
26 |     cdef int _resize_c(
27 |         self,
28 |         intp_t capacity=*
29 |     ) except -1 nogil
30 |     cdef int _set_split_node(
31 |         self,
32 |         SplitRecord* split_node,
33 |         Node* node,
34 |         intp_t node_id,
35 |     ) except -1 nogil
36 |     cdef float32_t _compute_feature(
37 |         self,
38 |         const float32_t[:, :] X_ndarray,
39 |         intp_t sample_index,
40 |         Node* node
41 |     ) noexcept nogil
42 |     cdef void _compute_feature_importances(
43 |         self,
44 |         float64_t[:] importances,
45 |         Node* node
46 |     ) noexcept nogil
47 | 
48 |     cpdef cnp.ndarray get_projection_matrix(self)
49 |
--------------------------------------------------------------------------------
/treeple/tree/_oblique_tree.pxd:
--------------------------------------------------------------------------------
1 | # distutils: language = c++
2 |
3 | # Authors: Adam Li
4 | # Chester Huynh
5 | # Parth Vora
6 | #
7 | # License: BSD 3 clause
8 |
9 | # See _oblique_tree.pyx for details.
10 |
11 | import numpy as np
12 |
13 | cimport numpy as cnp
14 | from libcpp.vector cimport vector
15 |
16 | from .._lib.sklearn.tree._splitter cimport SplitRecord
17 | from .._lib.sklearn.tree._tree cimport Node, Tree, TreeBuilder
18 | from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t
19 | from ._oblique_splitter cimport ObliqueSplitRecord
20 |
21 |
22 | cdef class ObliqueTree(Tree):
23 |     cdef vector[vector[float32_t]] proj_vec_weights  # (capacity, n_features) array of projection weights
24 |     cdef vector[vector[intp_t]] proj_vec_indices     # (capacity, n_features) array of projection indices
25 | 
26 |     # overridden methods
27 |     cdef int _resize_c(
28 |         self,
29 |         intp_t capacity=*
30 |     ) except -1 nogil
31 |     cdef int _set_split_node(
32 |         self,
33 |         SplitRecord* split_node,
34 |         Node* node,
35 |         intp_t node_id
36 |     ) except -1 nogil
37 |     cdef float32_t _compute_feature(
38 |         self,
39 |         const float32_t[:, :] X_ndarray,
40 |         intp_t sample_index,
41 |         Node* node
42 |     ) noexcept nogil
43 |     cdef void _compute_feature_importances(
44 |         self,
45 |         float64_t[:] importances,
46 |         Node* node
47 |     ) noexcept nogil
48 | 
49 |     cpdef cnp.ndarray get_projection_matrix(self)
50 |
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | # YAML 1.2
2 | ---
3 | # Metadata for citation of this software according to the CFF format (https://citation-file-format.github.io/)
4 | cff-version: 1.2.0
5 | title: "treeple: Modern decision-trees compatible with scikit-learn in Python."
6 | abstract: "treeple is a scikit-learn compatible API for building state-of-the-art decision trees. These include unsupervised trees, oblique trees, uncertainty trees, quantile trees and causal trees."
7 | authors:
8 |   - given-names: Adam
9 |     family-names: Li
10 |     affiliation: "Department of Computer Science, Columbia University, New York, NY, USA"
11 |     orcid: "https://orcid.org/0000-0001-8421-365X"
12 |   - given-names: Sambit
13 |     family-names: Panda
14 |     affiliation: "Department of Biomedical Engineering, Johns Hopkins University, Baltimore, MD, USA"
15 |     orcid: "https://orcid.org/0000-0001-8455-4243"
16 |   - given-names: Haoyin
17 |     family-names: Xu
18 |     affiliation: "Department of Biomedical Engineering, Johns Hopkins University, Baltimore, MD, USA"
19 |     orcid: "https://orcid.org/0000-0001-8235-4950"
20 |   - given-names: Itsuki
21 |     family-names: Ogihara
22 |     affiliation: "Department of Biomedical Engineering, Johns Hopkins University, Baltimore, MD, USA"
23 | type: software
24 | repository-code: "https://github.com/neurodata/treeple"
25 | license: 'PolyForm-Noncommercial-1.0.0'
26 | keywords:
27 | - random forest
28 | - oblique trees
29 | - honest forests
30 | - statistical learning
31 | - machine learning
32 | message: >-
33 | Please cite this software using the metadata from
34 | 'preferred-citation' in the CITATION.cff file.
35 |
--------------------------------------------------------------------------------
/doc/whats_new/v0.5.rst:
--------------------------------------------------------------------------------
1 | :orphan:
2 |
3 | .. include:: _contributors.rst
4 | .. currentmodule:: treeple
5 |
6 | .. _v0_5:
7 |
8 | Version 0.5
9 | ===========
10 |
11 | This release includes a number of enhancements and bug fixes, mainly
12 | to the :class:`treeple.tree.MultiViewDecisionTreeClassifier`. Most notably,
13 | the ``max_features`` argument now supports an array of values, which
14 | applies a different ``max_features`` argument per feature view.
15 |
16 | Changelog
17 | ---------
18 |
19 | - |Enhancement| :class:`treeple.tree.MultiViewDecisionTreeClassifier` now
20 |   rounds up the number of features to split on to the nearest integer when
21 |   applying ``max_features`` to each feature view, by `Adam Li`_ (:pr:`183`).
22 | - |Feature| :class:`treeple.tree.MultiViewDecisionTreeClassifier` now
23 |   supports an array passed in for ``max_features``, which applies a different
24 |   ``max_features`` value per view, by `Adam Li`_ (:pr:`183`).
25 | - |Fix| :class:`treeple.tree.MultiViewDecisionTreeClassifier` now correctly
26 |   handles the case where one feature view is exhausted while
27 |   another is not, for ``apply_max_features_per_feature_set = False``,
28 |   by `Adam Li`_ (:pr:`183`).
29 | - |Fix| ``treeple.stats.FeatureImportanceForestClassifier`` now correctly passes
30 |   metric kwargs to the null distribution function, by `Adam Li`_ (:pr:`183`).
31 |
32 | Code and Documentation Contributors
33 | -----------------------------------
34 |
35 | Thanks to everyone who has contributed to the maintenance and improvement of
36 | the project since version inception, including:
37 |
38 | * `Adam Li`_
39 |
40 |
--------------------------------------------------------------------------------
/doc/whats_new/_contributors.rst:
--------------------------------------------------------------------------------
1 |
2 | ..
3 | This file maps contributor names to their URLs. It should mostly be used
4 | for core contributors, and occasionally for contributors who do not want
5 | their github page to be their URL target. Historically it was used to
6 | hyperlink all contributors' names, and ``:user:`` should now be preferred.
7 | It also defines other ReST substitutions.
8 |
9 | .. role:: raw-html(raw)
10 | :format: html
11 |
12 | .. role:: raw-latex(raw)
13 | :format: latex
14 |
15 | .. |MajorFeature| replace:: :raw-html:`Major Feature` :raw-latex:`{\small\sc [Major Feature]}`
16 | .. |Feature| replace:: :raw-html:`Feature` :raw-latex:`{\small\sc [Feature]}`
17 | .. |Efficiency| replace:: :raw-html:`Efficiency` :raw-latex:`{\small\sc [Efficiency]}`
18 | .. |Enhancement| replace:: :raw-html:`Enhancement` :raw-latex:`{\small\sc [Enhancement]}`
19 | .. |Fix| replace:: :raw-html:`Fix` :raw-latex:`{\small\sc [Fix]}`
20 | .. |API| replace:: :raw-html:`API Change` :raw-latex:`{\small\sc [API Change]}`
21 |
22 |
23 | .. _Adam Li: https://adam2392.github.io
24 | .. _Jong Shin: https://github.com/jshinm
25 | .. _Sambit Panda: https://sampan.me
26 | .. _SUKI-O: https://github.com/SUKI-O
27 | .. _Ronan Perry: https://rflperry.github.io/
28 | .. _Haoyin Xu: https://github.com/PSSF23
29 | .. _Yuxin Bai: https://github.com/YuxinB
30 | .. _Ryan Hausen: https://ryanhausen.github.io
31 |
--------------------------------------------------------------------------------
/benchmarks/config.json:
--------------------------------------------------------------------------------
1 | {
2 |     // "regular": Benchmarks are run on small to medium datasets. Each benchmark
3 | // is run multiple times and averaged.
4 | // "fast": Benchmarks are run on small to medium datasets. Each benchmark
5 | // is run only once. May provide unstable benchmarks.
6 | // "large_scale": Benchmarks are run on large datasets. Each benchmark is
7 | // run multiple times and averaged. This profile is meant to
8 |     //                benchmark scalability and will take hours on a single core.
9 | // Can be overridden by environment variable SKLBENCH_PROFILE.
10 | "profile": "regular",
11 |
12 | // List of values of n_jobs to use for estimators which accept this
13 | // parameter (-1 means all cores). An empty list means all values from 1 to
14 | // the maximum number of available cores.
15 | // Can be overridden by environment variable SKLBENCH_NJOBS.
16 | "n_jobs_vals": [1],
17 |
18 | // If true, fitted estimators are saved in ./cache/estimators/
19 | // Can be overridden by environment variable SKLBENCH_SAVE_ESTIMATORS.
20 | "save_estimators": false,
21 |
22 | // Commit hash to compare estimator predictions with.
23 | // If null, predictions are not compared.
24 | // Can be overridden by environment variable SKLBENCH_BASE_COMMIT.
25 | "base_commit": null,
26 |
27 | // If false, the predict (resp. transform) method of the estimators won't
28 | // be benchmarked.
29 | // Can be overridden by environment variables SKLBENCH_PREDICT and
30 | // SKLBENCH_TRANSFORM.
31 | "bench_predict": true,
32 | "bench_transform": true
33 | }
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | name: "Release to PyPI"
2 |
3 | concurrency:
4 | group: ${{ github.workflow }}-${{ github.event.number }}-${{ github.event.type }}
5 | cancel-in-progress: true
6 |
7 | on:
8 | release:
9 | types: [published]
10 | workflow_run:
11 | workflows: [Build_Wheels]
12 | branches: [main]
13 | types: [completed] # This ensures it triggers only after the workflow completes
14 | workflow_dispatch:
15 |
16 | permissions:
17 | contents: read
18 |
19 | jobs:
20 | pypi:
21 | runs-on: ubuntu-latest
22 | if: github.event_name == 'release'
23 | permissions:
24 | id-token: write
25 | steps:
26 | - name: Get run ID of "Build_Wheels" workflow
27 | id: get-run-id
28 | run: |
29 | OTHER_REPO="${{ github.repository }}"
30 | WF_NAME="Build_Wheels"
31 | RUN_ID=`gh run --repo ${OTHER_REPO} list --workflow ${WF_NAME} --json databaseId --jq .[0].databaseId`
32 | echo "Detected latest run id of ${RUN_ID} for workflow ${WF_NAME}"
33 | echo "run-id=${RUN_ID}" >> "$GITHUB_OUTPUT"
34 | env:
35 | GH_TOKEN: ${{ github.token }}
36 |
37 | - name: Download artifact from "Build_Wheels" workflow
38 | uses: actions/download-artifact@v4
39 | with:
40 | name: dist # Match name used in build_wheels.yml upload artifact step
41 | path: dist
42 | github-token: ${{ github.token }}
43 | repository: ${{ github.repository }}
44 | run-id: ${{ steps.get-run-id.outputs.run-id }}
45 |
46 | - name: Show downloaded files
47 | run: ls -la
48 |
49 | - name: Publish to PyPI
50 | uses: pypa/gh-action-pypi-publish@release/v1
51 |
--------------------------------------------------------------------------------
/doc/whats_new/v0.3.rst:
--------------------------------------------------------------------------------
1 | :orphan:
2 |
3 | .. include:: _contributors.rst
4 | .. currentmodule:: treeple
5 |
6 | .. _v0_3:
7 |
8 | Version 0.3
9 | ===========
10 |
11 | This release includes a number of bug fixes and enhancements related to hypothesis testing with decision trees.
12 | Moreover, we have added an experimental multi-view decision tree / random forest, which considers multiple views
13 | of the data when building trees. The documentation page has also undergone an organizational overhaul
14 | making it easier for users to find examples related to specific use cases.
15 |
16 | Changelog
17 | ---------
18 | - |Fix| Fixes a bug in the consistency of train/test samples when ``random_state`` is not set in ``FeatureImportanceForestClassifier`` and ``FeatureImportanceForestRegressor``, by `Adam Li`_ (:pr:`135`)
19 | - |Fix| Fixes a bug where covariate indices were not shuffled by default when running the ``FeatureImportanceForestClassifier`` and ``FeatureImportanceForestRegressor`` test methods, by `Sambit Panda`_ (:pr:`140`)
20 | - |Enhancement| Add multi-view splitter for axis-aligned decision trees, by `Adam Li`_ (:pr:`129`)
21 | - |Enhancement| Add stratified sampling option to ``FeatureImportance*`` via the ``stratify`` keyword argument, by `Yuxin Bai`_ (:pr:`143`)
22 | - |Fix| Fixed usage of ``feature_importances_`` property in ``HonestForestClassifier``, by `Adam Li`_ (:pr:`156`)
23 | - |Fix| Fixed ``HonestForestClassifier`` to allow decision-trees from sklearn, albeit with a limited API, by `Adam Li`_ (:pr:`158`)
24 |
25 | Code and Documentation Contributors
26 | -----------------------------------
27 |
28 | Thanks to everyone who has contributed to the maintenance and improvement of
29 | the project since version inception, including:
30 |
31 | * `Adam Li`_
32 | * `Sambit Panda`_
33 | * `Yuxin Bai`_
34 |
--------------------------------------------------------------------------------
/treeple/experimental/tests/test_mutual_info.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def nonlinear_gaussian_with_additive_noise():
5 |     """Nonlinear function of a multivariate Gaussian with additive Gaussian noise.
6 |
7 | See: https://github.com/BiuBiuBiLL/NPEET_LNC/issues/4
8 | """
9 | # first simulate multivariate Gaussian without noise
10 |
11 | # then add the noise
12 |
13 | # compute MI by computing the H(Y|X) and H(X)
14 | # H(Y|X) = np.log(noise_std)
15 | # H(X) = kNN K-L estimate with large # of samples
16 | pass
17 |
18 |
19 | def main():
20 | d1 = [1, 1, 0]
21 | d2 = [1, 0, 1]
22 | d3 = [0, 1, 1]
23 | mat = [d1, d2, d3]
24 | tmat = np.transpose(mat)
25 | diag = [[3, 0, 0], [0, 1, 0], [0, 0, 1]]
26 | # mean = np.array([0, 0, 0])
27 | cov = np.dot(tmat, np.dot(diag, mat))
28 | print("covariance matrix")
29 | print(cov)
30 | print(tmat)
31 |
32 |
33 | def test_mi():
34 | d1 = [1, 1, 0]
35 | d2 = [1, 0, 1]
36 | d3 = [0, 1, 1]
37 | mat = [d1, d2, d3]
38 | tmat = np.transpose(mat)
39 | diag = [[3, 0, 0], [0, 1, 0], [0, 0, 1]]
40 | # mean = np.array([0, 0, 0])
41 | cov = np.dot(tmat, np.dot(diag, mat))
42 | print("covariance matrix")
43 | print(cov)
44 | trueent = -0.5 * (3 + np.log(8.0 * np.pi * np.pi * np.pi * np.linalg.det(cov)))
45 | trueent += -0.5 * (1 + np.log(2.0 * np.pi * cov[2][2])) # z sub
46 | trueent += 0.5 * (
47 | 2
48 | + np.log(
49 | 4.0 * np.pi * np.pi * np.linalg.det([[cov[0][0], cov[0][2]], [cov[2][0], cov[2][2]]])
50 | )
51 | ) # xz sub
52 | trueent += 0.5 * (
53 | 2
54 | + np.log(
55 | 4.0 * np.pi * np.pi * np.linalg.det([[cov[1][1], cov[1][2]], [cov[2][1], cov[2][2]]])
56 | )
57 | ) # yz sub
58 |     print("true CMI(x:y|z)", trueent / np.log(2))
59 |
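
The closed form assembled in ``test_mi`` above is the Gaussian identity for
conditional mutual information, with each differential entropy computed in nats:

    I(X;Y \mid Z) = H(X,Z) + H(Y,Z) - H(Z) - H(X,Y,Z),
    \qquad
    H = \tfrac{1}{2}\log\bigl((2\pi e)^{d}\det\Sigma\bigr)
      = \tfrac{1}{2}\bigl(d + \log\bigl((2\pi)^{d}\det\Sigma\bigr)\bigr)

for a d-dimensional Gaussian block with covariance \Sigma; the four ``trueent``
terms are exactly these entropies with d = 3, 1, 2, 2, and the final division by
``np.log(2)`` converts nats to bits.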
--------------------------------------------------------------------------------
/doc/whats_new/v0.8.rst:
--------------------------------------------------------------------------------
1 | :orphan:
2 |
3 | .. include:: _contributors.rst
4 | .. currentmodule:: treeple
5 |
6 | .. _v0_8:
7 |
8 | Version 0.8
9 | ===========
10 |
11 | This release fixes a major bug with (CO)MIGHT, where low sample sizes produced biased tree
12 | posteriors; this is fixed by stratifying the sampling of the dataset to ensure that each class
13 | is represented in the bootstrap sample. Additionally, the release includes a number of bug fixes
14 | and improvements to the codebase.
15 |
16 | Changelog
17 | ---------
18 |
19 | - |Fix| Previously, missing values in the ``X`` input array of treeple estimators
20 |   did not raise an error; estimators ran silently, assuming the missing values
21 |   were encoded as infinity. This is now fixed, and the estimators raise a
22 |   ``ValueError`` if missing values are encountered in the ``X`` input array.
23 |   By `Adam Li`_ (:pr:`264`)
24 | - |Feature| Simulations in ``treeple.datasets.hyppo`` now throw a warning instead
25 |   of an error when the number of samples is less than the number of dimensions.
26 |   By `Sambit Panda`_ (:pr:`279`)
27 | - |API| :class:`treeple.HonestForestClassifier` now has ``bootstrap=True`` as the default
28 |   argument. By `Adam Li`_ (:pr:`274`)
29 | - |API| Removed all instances of ``FeatureImportanceForestClassifier`` and outdated
30 |   MIGHT code. By `Adam Li`_ (:pr:`274`)
31 | - |Fix| Fixed a bug in ``treeple.HonestForestClassifier`` where posteriors
32 |   estimated on oob samples were biased when there was a low number of samples
33 |   due to imbalance in the classes when ``bootstrap=True``.
34 |   By `Adam Li`_ (:pr:`283`)
35 |
36 | Code and Documentation Contributors
37 | -----------------------------------
38 |
39 | Thanks to everyone who has contributed to the maintenance and improvement of
40 | the project since version inception, including:
41 |
42 | * `Adam Li`_
43 | * `Sambit Panda`_
44 |
--------------------------------------------------------------------------------
/doc/sphinxext/doi_role.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | doilinks
4 | ~~~~~~~~
5 | Extension to add links to DOIs. With this extension you can use e.g.
6 | :doi:`10.1016/S0022-2836(05)80360-2` in your documents. This will
7 | create a link to a DOI resolver
8 | (``https://doi.org/10.1016/S0022-2836(05)80360-2``).
9 | The link caption will be the raw DOI.
10 | You can also give an explicit caption, e.g.
11 | :doi:`Basic local alignment search tool <10.1016/S0022-2836(05)80360-2>`.
12 |
13 | :copyright: Copyright 2015 Jon Lund Steffensen. Based on extlinks by
14 | the Sphinx team.
15 | :license: BSD.
16 | """
17 |
18 | from docutils import nodes, utils
19 | from sphinx.util.nodes import split_explicit_title
20 |
21 |
22 | def reference_role(typ, rawtext, text, lineno, inliner, options={}, content=[]):
23 | text = utils.unescape(text)
24 | has_explicit_title, title, part = split_explicit_title(text)
25 | if typ in ["arXiv", "arxiv"]:
26 | full_url = "https://arxiv.org/abs/" + part
27 | if not has_explicit_title:
28 | title = "arXiv:" + part
29 | pnode = nodes.reference(title, title, internal=False, refuri=full_url)
30 | return [pnode], []
31 | if typ in ["doi", "DOI"]:
32 | full_url = "https://doi.org/" + part
33 | if not has_explicit_title:
34 | title = "DOI:" + part
35 | pnode = nodes.reference(title, title, internal=False, refuri=full_url)
36 | return [pnode], []
37 |
38 |
39 | def setup_link_role(app):
40 | app.add_role("arxiv", reference_role, override=True)
41 | app.add_role("arXiv", reference_role, override=True)
42 | app.add_role("doi", reference_role, override=True)
43 | app.add_role("DOI", reference_role, override=True)
44 |
45 |
46 | def setup(app):
47 | app.connect("builder-inited", setup_link_role)
48 | return {"version": "0.1", "parallel_read_safe": True}
49 |
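
A sketch of how this extension is wired into the docs build, assuming (per this
repo's layout) that ``doc/sphinxext`` is put on ``sys.path`` in ``conf.py``:

    # conf.py (sketch)
    import os
    import sys

    sys.path.insert(0, os.path.abspath("sphinxext"))  # make doi_role importable
    extensions = ["doi_role"]  # registers the :doi:/:DOI: and :arxiv:/:arXiv: roles

after which, e.g., :doi:`10.1145/3394486.3403094` renders as a link to the DOI
resolver (this exact usage appears in doc/index.rst).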
--------------------------------------------------------------------------------
/doc/whats_new/v0.4.rst:
--------------------------------------------------------------------------------
1 | :orphan:
2 |
3 | .. include:: _contributors.rst
4 | .. currentmodule:: treeple
5 |
6 | .. _v0_4:
7 |
8 | Version 0.4
9 | ===========
10 |
11 | This version patches some issues with the ``FeatureImportance*`` classes and also adds a feature to the
12 | ``MultiViewDecisionTreeClassifier`` class that allows one to scale the number of split candidates sampled per feature-set
13 | equally.
14 |
15 | Changelog
16 | ---------
17 |
18 | - |API| ``FeatureImportanceForest*`` now has a hyperparameter, ``permute_per_forest_fraction``, to control the number of permutations done per forest, by `Adam Li`_ (:pr:`145`)
19 | - |Enhancement| Add dataset generators for regression, classification, and hypothesis testing, by `Adam Li`_ (:pr:`169`)
20 | - |Fix| Fixes a bug where ``FeatureImportanceForest*`` was unable to be run when calling ``statistic`` with ``covariate_index`` defined for MI, AUC metrics, by `Adam Li`_ (:pr:`164`)
21 | - |Enhancement| Add :func:`treeple.experimental.conditional_resample`, which allows conditional resampling of rows based on nearest-neighbors defined via a feature set, by `Adam Li`_ (:pr:`170`)
22 | - |Enhancement| Multi-view trees are now able to scale the sampling of split candidates at the same rate per feature-set, which means 'sqrt' would sample split candidates equal to the square root of each feature-set size, by `Adam Li`_ (:pr:`152`)
23 | - |Fix| Fixes a bug in :class:`treeple.tree.MultiViewDecisionTreeClassifier` where the ``max_features`` argument applied over
24 |   more than two views with ``apply_max_features_per_set`` set to ``True`` resulted in an incorrect and oversampled
25 |   number of ``max_features`` in the views after the first two, by `Adam Li`_ (:pr:`172`)
26 |
27 | Code and Documentation Contributors
28 | -----------------------------------
29 |
30 | Thanks to everyone who has contributed to the maintenance and improvement of
31 | the project since version inception, including:
32 |
33 | * `Adam Li`_
34 |
35 |
--------------------------------------------------------------------------------
/doc/whats_new/v0.2.rst:
--------------------------------------------------------------------------------
1 | :orphan:
2 |
3 | .. include:: _contributors.rst
4 | .. currentmodule:: treeple
5 |
6 | .. _v0_2:
7 |
8 | Version 0.2
9 | ===========
10 |
11 | This is a major release with many new features and improvements.
12 | For instance, we have added a new implementation of the extended isolation forest and
13 | enabled all decision trees to take advantage of ``partial_fit``, giving trees streaming
14 | capabilities. Moreover, we have added an analogous implementation of extra-trees for oblique trees.
15 | Finally, this release includes a highly experimental feature for multivariate high-dimensional
16 | hypothesis testing using permutation forests and a feature importance testing forest.
17 |
18 | Changelog
19 | ---------
20 | - |Efficiency| Upgraded build process to rely on Cython 3.0+, by `Adam Li`_ (:pr:`109`)
21 | - |Feature| Allow decision trees to take advantage of ``partial_fit`` and ``monotonic_cst`` when available, by `Adam Li`_ (:pr:`109`)
22 | - |Feature| Implementation of ExtraObliqueDecisionTreeClassifier, ExtraObliqueDecisionTreeRegressor by `SUKI-O`_ (:pr:`75`)
23 | - |Efficiency| Around 1.5-2x speed improvement for unsupervised forests, by `Adam Li`_ (:pr:`114`)
24 | - |API| Allow ``sqrt`` and ``log2`` keywords to be used for ``min_samples_split`` parameter in unsupervised forests, by `Adam Li`_ (:pr:`114`)
25 | - |Feature| Implement extended isolation forest, by `Adam Li`_ (:pr:`101`)
26 | - |Feature| Implementation of StreamDecisionForest, by `Haoyin Xu`_ and `Adam Li`_ (:pr:`116`)
27 | - |Feature| Implementation of Permutation forests and a feature importance testing forest, by `Haoyin Xu`_, `Adam Li`_, `Sambit Panda`_ (:pr:`125`)
28 |
29 | Code and Documentation Contributors
30 | -----------------------------------
31 |
32 | Thanks to everyone who has contributed to the maintenance and improvement of
33 | the project since version inception, including:
34 |
35 | * `Adam Li`_
36 | * `SUKI-O`_
37 | * `Haoyin Xu`_
38 | * `Sambit Panda`_
39 |
--------------------------------------------------------------------------------
/treeple/tree/unsupervised/_unsup_splitter.pxd:
--------------------------------------------------------------------------------
1 | from ..._lib.sklearn.tree._splitter cimport BaseSplitter, SplitRecord
2 | from ..._lib.sklearn.tree._tree cimport ParentInfo
3 | from ..._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t, uint32_t
4 | from ._unsup_criterion cimport UnsupervisedCriterion
5 |
6 |
7 | cdef class UnsupervisedSplitter(BaseSplitter):
8 | """
9 |     Notable changes wrt scikit-learn:
10 |     1. `weighted_n_node_samples` is used as a stopping criterion and simply keeps
11 |     count of the "number of samples (weighted)". All samples have a default weight
12 |     of '1'.
13 |     2. The `X` array is stored instead of the `y` array, as the criteria are computed
14 |     over the X array.
15 |     3. The feature_values memoryview is a feature vector with memory shared between
16 |     the splitter and the criterion object. This enables the splitter to assign values
17 |     to it within the `node_split` function, and the `criterion` can then automatically
18 |     compute relevant statistics on the shared memoryview.
19 | """
20 |
21 | # XXX: requires BaseSplitter to not define "criterion"
22 | cdef public UnsupervisedCriterion criterion # criterion computer
23 | cdef const float32_t[:, :] X # feature matrix
24 | cdef intp_t n_total_samples # store the total number of samples
25 |
26 | # Initialization method for unsupervised splitters
27 | cdef int init(
28 | self,
29 | const float32_t[:, :] X,
30 | const float64_t[:] sample_weight
31 | ) except -1
32 |
33 | # Overridden Methods from base class
34 | cdef int node_reset(
35 | self,
36 | intp_t start,
37 | intp_t end,
38 | float64_t* weighted_n_node_samples
39 | ) except -1 nogil
40 | cdef int node_split(
41 | self,
42 | ParentInfo* parent,
43 | SplitRecord* split,
44 | ) except -1 nogil
45 | cdef void node_value(
46 | self,
47 | float64_t* dest
48 | ) noexcept nogil
49 | cdef float64_t node_impurity(
50 | self
51 | ) noexcept nogil
52 |
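
A toy Python sketch of the shared-buffer design described in point 3, with plain
classes standing in for the Cython splitter and criterion:

    import numpy as np

    class Criterion:
        def __init__(self, feature_values):
            self.feature_values = feature_values  # shared buffer, not a copy

        def node_impurity(self, start, end):
            return self.feature_values[start:end].var()

    class Splitter:
        def __init__(self, X):
            self.X = X
            self.feature_values = np.empty(X.shape[0], dtype=np.float32)
            self.criterion = Criterion(self.feature_values)

        def node_split(self, feature):
            # writing into the shared buffer is immediately visible to the criterion
            self.feature_values[:] = self.X[:, feature]
            return self.criterion.node_impurity(0, len(self.feature_values))

    splitter = Splitter(np.random.rand(10, 3).astype(np.float32))
    print(splitter.node_split(feature=1))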
--------------------------------------------------------------------------------
/doc/sphinxext/allow_nan_estimators.py:
--------------------------------------------------------------------------------
1 | from contextlib import suppress
2 |
3 | from docutils import nodes
4 | from docutils.parsers.rst import Directive
5 | from sklearn.utils import all_estimators
6 | from sklearn.utils._test_common.instance_generator import _construct_instances
7 | from sklearn.utils._testing import SkipTest
8 |
9 |
10 | class AllowNanEstimators(Directive):
11 | @staticmethod
12 | def make_paragraph_for_estimator_type(estimator_type):
13 | intro = nodes.list_item()
14 | intro += nodes.strong(text="Estimators that allow NaN values for type ")
15 | intro += nodes.literal(text=f"{estimator_type}")
16 | intro += nodes.strong(text=":\n")
17 | exists = False
18 | lst = nodes.bullet_list()
19 | for name, est_class in all_estimators(type_filter=estimator_type):
20 | with suppress(SkipTest):
21 |                 est = next(_construct_instances(est_class))
22 |
23 | if est._get_tags().get("allow_nan"):
24 | module_name = ".".join(est_class.__module__.split(".")[:2])
25 | class_title = f"{est_class.__name__}"
26 | class_url = f"./generated/{module_name}.{class_title}.html"
27 | item = nodes.list_item()
28 | para = nodes.paragraph()
29 | para += nodes.reference(
30 | class_title, text=class_title, internal=False, refuri=class_url
31 | )
32 | exists = True
33 | item += para
34 | lst += item
35 | intro += lst
36 | return [intro] if exists else None
37 |
38 | def run(self):
39 | lst = nodes.bullet_list()
40 | for i in ["cluster", "regressor", "classifier", "transformer"]:
41 | item = self.make_paragraph_for_estimator_type(i)
42 | if item is not None:
43 | lst += item
44 | return [lst]
45 |
46 |
47 | def setup(app):
48 | app.add_directive("allow_nan_estimators", AllowNanEstimators)
49 |
50 | return {
51 | "version": "0.1",
52 | "parallel_read_safe": True,
53 | "parallel_write_safe": True,
54 | }
55 |
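
Usage sketch, assuming (as with the other extensions in this directory) that
``doc/sphinxext`` is on ``sys.path`` during the docs build:

    # conf.py (sketch)
    extensions = ["allow_nan_estimators"]

after which any .rst page can emit the bullet lists by writing the directive
registered in ``setup``:

    .. allow_nan_estimators::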
--------------------------------------------------------------------------------
/doc/whats_new/v0.1.rst:
--------------------------------------------------------------------------------
1 | :orphan:
2 |
3 | .. include:: _contributors.rst
4 | .. currentmodule:: treeple
5 |
6 | .. _v0_1:
7 |
8 | Version 0.1
9 | ===========
10 |
11 | Changelog
12 | ---------
13 | - |Feature| Implementation of the two-means Unsupervised Random Forest, by `Adam Li`_ (:pr:`9`)
14 | - |Feature| Implementation of oblique Unsupervised Random Forest, by `Adam Li`_ (:pr:`11`)
15 | - |Feature| Implementation of manifold oblique Random Forest, by `Adam Li`_ (:pr:`21`)
16 | - |Feature| Implementation of fastBIC criterion for unsupervised tree models, by `Adam Li`_ and `Jong Shin`_ (:pr:`45`)
17 | - |Fix| Fix a bug in Patch oblique random forest that samples outside the data boundaries and adds a user guide, by `Adam Li`_ (:pr:`61`)
18 | - |Feature| MORF trees now can sample n-dimensional patches inside an n-dimensional structure sample and make any arbitrary axis discontinuous, by `Adam Li`_ (:pr:`63`)
19 | - |Feature| All tree types can compute similarity and dissimilarity matrices, by `Sambit Panda`_ and `Adam Li`_ (:pr:`64`)
20 | - |Feature| MORF trees now can normalize by feature weight per sample per feature column, by `Adam Li`_ (:pr:`67`)
21 | - |Feature| A general-kernel MORF is now implemented where users can pass in a kernel library, by `Adam Li`_ (:pr:`70`)
22 | - |Feature| Implementation of ObliqueDecisionTreeRegressor, PatchObliqueDecisionTreeRegressor, ObliqueRandomForestRegressor, PatchObliqueRandomForestRegressor, by `SUKI-O`_ (:pr:`72`)
23 | - |Feature| Implementation of HonestTreeClassifier, HonestForestClassifier, by `Sambit Panda`_, `Adam Li`_, `Ronan Perry`_ and `Haoyin Xu`_ (:pr:`57`)
24 | - |Feature| Implementation of (conditional) mutual information estimation via unsupervised tree models and added NearestNeighborsMetaEstimator by `Adam Li`_ (:pr:`83`)
25 | - |Feature| Add multi-output support to HonestTreeClassifier, HonestForestClassifier, by `Ronan Perry`_, `Haoyin Xu`_ and `Adam Li`_ (:pr:`86`)
26 |
27 | Code and Documentation Contributors
28 | -----------------------------------
29 |
30 | Thanks to everyone who has contributed to the maintenance and improvement of
31 | the project since version inception, including:
32 |
33 | * `Adam Li`_
34 | * `Sambit Panda`_
35 | * `Ronan Perry`_
36 | * `Haoyin Xu`_
37 |
--------------------------------------------------------------------------------
/treeple/tree/honesty/_honest_prune.pxd:
--------------------------------------------------------------------------------
1 | from ..._lib.sklearn.tree._criterion cimport Criterion
2 | from ..._lib.sklearn.tree._partitioner cimport shift_missing_values_to_left_if_required
3 | from ..._lib.sklearn.tree._splitter cimport SplitRecord, Splitter
4 | from ..._lib.sklearn.tree._tree cimport Node, ParentInfo, Tree
5 | from ..._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int8_t, intp_t, uint8_t, uint32_t
6 |
7 |
8 | # for each node, keep track of the node index and the parent index
9 | # within the tree's node array
10 | cdef struct PruningRecord:
11 | intp_t node_idx
12 | intp_t start
13 | intp_t end
14 | float64_t lower_bound
15 | float64_t upper_bound
16 |
17 |
18 | # TODO: this may break the notion of feature importances, as we don't set the node's impurity
19 | # at the child nodes.
20 | cdef class HonestPruner(Splitter):
21 | cdef Tree tree # The tree to be pruned
22 | cdef intp_t capacity # The maximum number of nodes in the pruned tree
23 | cdef intp_t pos # The current position to split left/right children
24 | cdef intp_t n_missing # The number of missing values in the feature currently considered
25 | cdef uint8_t missing_go_to_left
26 |
27 |     # TODO: only supports dense X for now; sparse input is not supported.
28 | cdef const float32_t[:, :] X
29 |
30 | cdef int init(
31 | self,
32 | object X,
33 | const float64_t[:, ::1] y,
34 | const float64_t[:] sample_weight,
35 | const uint8_t[::1] missing_values_in_feature_mask,
36 | ) except -1
37 |
38 | # This function is not used, and should be disabled for pruners
39 | cdef int node_split(
40 | self,
41 | ParentInfo* parent_record,
42 | SplitRecord* split,
43 | ) except -1 nogil
44 |
45 | cdef bint check_node_partition_conditions(
46 | self,
47 | SplitRecord* current_split,
48 | float64_t lower_bound,
49 | float64_t upper_bound
50 | ) noexcept nogil
51 |
52 | cdef inline intp_t n_left_samples(
53 | self
54 | ) noexcept nogil
55 | cdef inline intp_t n_right_samples(
56 | self
57 | ) noexcept nogil
58 |
59 | cdef int partition_samples(
60 | self,
61 | intp_t node_idx,
62 | ) noexcept nogil
63 |
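
Schematically, the pruner walks the fitted tree with an explicit stack of records
like ``PruningRecord``, re-partitioning the honest samples at each internal node.
A toy Python sketch under that reading (the ``children_left``/``children_right``
arrays mirror sklearn's array-based tree layout; ``partition`` stands in for
``partition_samples``):

    def walk_tree(children_left, children_right, partition, n_samples):
        stack = [(0, 0, n_samples)]  # (node_idx, start, end); root covers all samples
        while stack:
            node, start, end = stack.pop()
            if children_left[node] == -1:  # leaf: nothing to partition
                continue
            pos = partition(node, start, end)  # split honest samples left/right
            stack.append((children_left[node], start, pos))
            stack.append((children_right[node], pos, end))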
--------------------------------------------------------------------------------
/benchmarks/ensemble_supervised.py:
--------------------------------------------------------------------------------
1 | from treeple.ensemble import ObliqueRandomForestClassifier
2 |
3 | from .common import Benchmark, Estimator, Predictor
4 | from .datasets import (
5 | _20newsgroups_highdim_dataset,
6 | _20newsgroups_lowdim_dataset,
7 | _synth_classification_dataset,
8 | )
9 | from .utils import make_gen_classif_scorers
10 |
11 |
12 | class ObliqueRandomForestClassifierBenchmark(Predictor, Estimator, Benchmark):
13 | """
14 |     Benchmarks for ObliqueRandomForestClassifier.
15 | """
16 |
17 | param_names = ["representation", "n_jobs"]
18 | params = (["dense", "sparse"], Benchmark.n_jobs_vals)
19 |
20 | def setup_cache(self):
21 | super().setup_cache()
22 |
23 | def make_data(self, params):
24 | representation, n_jobs = params
25 |
26 | if representation == "sparse":
27 | data = _20newsgroups_highdim_dataset()
28 | else:
29 | data = _20newsgroups_lowdim_dataset()
30 |
31 | return data
32 |
33 | def make_estimator(self, params):
34 | representation, n_jobs = params
35 |
36 | n_estimators = 500 if Benchmark.data_size == "large" else 100
37 |
38 | estimator = ObliqueRandomForestClassifier(
39 | n_estimators=n_estimators,
40 | min_samples_split=10,
41 | max_features="log2",
42 | n_jobs=n_jobs,
43 | random_state=0,
44 | )
45 |
46 | return estimator
47 |
48 | def make_scorers(self):
49 | make_gen_classif_scorers(self)
50 |
51 |
52 | class ObliqueRandomForestClassifierBenchmarkSynth(Predictor, Estimator, Benchmark):
53 | """
54 | Benchmarks for Oblique RF Classifier using synthetic classification data.
55 | """
56 |
57 | param_names = []
58 | params = ()
59 |
60 | def setup_cache(self):
61 | super().setup_cache()
62 |
63 | def make_data(self, params):
64 | data = _synth_classification_dataset(n_samples=10000, n_features=100, n_classes=5)
65 |
66 | return data
67 |
68 | def make_estimator(self, params):
69 | estimator = ObliqueRandomForestClassifier(max_leaf_nodes=15, random_state=0)
70 |
71 | return estimator
72 |
73 | def make_scorers(self):
74 | make_gen_classif_scorers(self)
75 |
--------------------------------------------------------------------------------
/treeple/tree/_neighbors.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def compute_forest_similarity_matrix(forest, X):
5 | """Compute the similarity matrix of samples in X using a trained forest.
6 |
7 | As an intermediate calculation, the forest applies the dataset and gets
8 | the leaves for each sample. Then, the similarity matrix is computed by
9 | counting the number of times each pair of samples ends up in the same leaf.
10 |
11 | Parameters
12 | ----------
13 | forest : BaseForest or BaseDecisionTree
14 | The fitted forest.
15 | X : array-like of shape (n_samples, n_features)
16 | The input data.
17 |
18 | Returns
19 | -------
20 | aff_matrix : array-like of shape (n_samples, n_samples)
21 |         The estimated affinity (similarity) matrix.
22 | """
23 | if hasattr(forest, "estimator_"):
24 | # apply to the leaves
25 | X_leaves = forest.apply(X)
26 |
27 | n_est = forest.n_estimators
28 | else:
29 | # apply to the leaves for a single tree
30 | X_leaves = forest.apply(X)[:, np.newaxis]
31 | n_est = 1
32 |
33 | aff_matrix = sum(np.equal.outer(X_leaves[:, i], X_leaves[:, i]) for i in range(n_est))
34 | # normalize by the number of trees
35 | aff_matrix = np.divide(aff_matrix, n_est)
36 | return aff_matrix
37 |
38 |
39 | def _compute_distance_matrix(aff_matrix):
40 | """Private function to compute distance matrix after `compute_similarity_matrix`."""
41 | dists = 1.0 - aff_matrix
42 | return dists
43 |
44 |
45 | # ported from https://github.com/neurodata/hyppo/blob/main/hyppo/independence/_utils.py
46 | class SimMatrixMixin:
47 | """Mixin class to calculate similarity and dissimilarity matrices.
48 |
49 |     This augments tree/forest models with scikit-learn's nearest-neighbors API.
50 | """
51 |
52 | def compute_similarity_matrix(self, X):
53 | """
54 | Compute the similarity matrix of samples in X.
55 |
56 | Parameters
57 | ----------
58 | X : array-like of shape (n_samples, n_features)
59 | The input data.
60 |
61 | Returns
62 | -------
63 | sim_matrix : array-like of shape (n_samples, n_samples)
64 | The similarity matrix among the samples.
65 | """
66 | return compute_forest_similarity_matrix(self, X)
67 |
--------------------------------------------------------------------------------
/treeple/stats/tests/test_permuteforest.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pytest
3 | from numpy.testing import assert_array_equal
4 | from sklearn import datasets
5 |
6 | from treeple.stats import PermutationHonestForestClassifier
7 |
8 | # load the iris dataset (n_samples, 4)
9 | # and randomly permute it
10 | iris = datasets.load_iris()
11 | seed = 12345
12 | rng = np.random.default_rng(seed)
13 |
14 | # remove third class
15 | iris_X = iris.data[iris.target != 2]
16 | iris_y = iris.target[iris.target != 2]
17 |
18 | p = rng.permutation(iris_X.shape[0])
19 | iris_X = iris_X[p]
20 | iris_y = iris_y[p]
21 |
22 |
23 | def test_permutationforest_errors():
24 | """Test permutation forest errors when training."""
25 | n_samples = 10
26 | est = PermutationHonestForestClassifier(n_estimators=10, random_state=0)
27 |
28 | # covariate index must be an iterable
29 | with pytest.raises(RuntimeError, match="covariate_index must be an iterable"):
30 | est.fit(iris_X[:n_samples], iris_y[:n_samples], covariate_index=0)
31 |
32 | # covariate index must be an iterable of ints
33 | with pytest.raises(RuntimeError, match="Not all covariate_index"):
34 | est.fit(iris_X[:n_samples], iris_y[:n_samples], covariate_index=[0, 1.0])
35 |
36 |     # the covariate index must not have more entries than there are features
37 | with pytest.raises(ValueError, match="The length of the covariate index"):
38 | est.fit(
39 | iris_X[:n_samples],
40 | iris_y[:n_samples],
41 | covariate_index=np.arange(iris_X.shape[1] + 1, dtype=np.intp),
42 | )
43 |
44 |
45 | @pytest.mark.parametrize("permute_per_tree", [True, False])
46 | def test_inbag_samples_different_across_forest(permute_per_tree):
47 | """Test that inbag samples are different across trees."""
48 | n_estimators = 10
49 | est = PermutationHonestForestClassifier(
50 | n_estimators=n_estimators, random_state=0, permute_per_tree=permute_per_tree
51 | )
52 |
53 | X = iris_X
54 | y = iris_y
55 | est.fit(X, y)
56 |
57 | # covariate index when None is all the features
58 | assert_array_equal(est.covariate_index_, np.arange(X.shape[1], dtype=np.intp))
59 |
60 | # inbag samples should be different across trees when permute_per_tree=True
61 | permutation_samples_ = est.permutation_indices_
62 | permutation_samples_ground = permutation_samples_[0]
63 | assert not all(
64 | np.array_equal(permutation_samples_ground, permutation_samples_[idx])
65 | for idx in range(1, n_estimators)
66 | )
67 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/psf/black
3 | rev: 24.8.0
4 | hooks:
5 | - id: black
6 | args: [--quiet]
7 |
8 | - repo: https://github.com/pycqa/isort
9 | rev: 5.13.2
10 | hooks:
11 | - id: isort
12 | name: isort (python)
13 | - id: isort
14 | name: isort (cython)
15 | types: [cython]
16 |
17 | - repo: https://github.com/MarcoGorelli/cython-lint
18 | rev: v0.16.2
19 | hooks:
20 | - id: cython-lint
21 | - id: double-quote-cython-strings
22 |
23 | # Ruff treeple
24 | - repo: https://github.com/astral-sh/ruff-pre-commit
25 | rev: v0.6.9
26 | hooks:
27 | - id: ruff
28 | name: ruff treeple
29 | args: ["--fix"]
30 | files: ^treeple/
31 |
32 | # Ruff tutorials and examples
33 | - repo: https://github.com/astral-sh/ruff-pre-commit
34 | rev: v0.6.9
35 | hooks:
36 | - id: ruff
37 | name: ruff tutorials and examples
38 | # D103: missing docstring in public function
39 | # D400: docstring first line must end with period
40 | args: ["--ignore=D103,D400", "--fix"]
41 | files: ^tutorials/|^examples/
42 |
43 | # Codespell
44 | - repo: https://github.com/codespell-project/codespell
45 | rev: v2.3.0
46 | hooks:
47 | - id: codespell
48 | additional_dependencies:
49 | - tomli
50 | files: ^treeple/|^doc/|^examples/|^tutorials/
51 | types_or: [python, bib, rst, inc]
52 |
53 | # yamllint
54 | - repo: https://github.com/adrienverge/yamllint.git
55 | rev: v1.35.1
56 | hooks:
57 | - id: yamllint
58 | args: [--strict, -c, .yamllint.yml]
59 |
60 | # toml-sort
61 | - repo: https://github.com/pappasam/toml-sort
62 | rev: v0.23.1
63 | hooks:
64 | - id: toml-sort
65 | files: ^pyproject\.toml$
66 | args: ['-i']
67 |
68 | # mypy
69 | - repo: https://github.com/pre-commit/mirrors-mypy
70 | rev: v1.11.2
71 | hooks:
72 | - id: mypy
73 |         # Exclude non-package directories (benchmarks, examples, spin) from type checking
74 | exclude: ^(benchmarks_nonasv|examples|benchmarks|.spin)/.*$
75 | additional_dependencies: ["numpy==1.26.2"]
76 |
77 | # rstcheck
78 | - repo: https://github.com/rstcheck/rstcheck.git
79 | rev: v6.2.4
80 | hooks:
81 | - id: rstcheck
82 | additional_dependencies:
83 | - tomli
84 | files: ^(?!doc/use\.rst$).*\.(rst|inc)$
85 |
86 | ci:
87 | autofix_prs: true
88 |
--------------------------------------------------------------------------------
/doc/whats_new/v0.6.rst:
--------------------------------------------------------------------------------
1 | :orphan:
2 |
3 | .. include:: _contributors.rst
4 | .. currentmodule:: treeple
5 |
6 | .. _v0_6:
7 |
8 | Version 0.6
9 | ===========
10 |
11 | This release includes enhancements mainly to the ``MultiViewDecisionTreeClassifier``
12 | and ``HonestForestClassifier``, and a new generative model for ``make_trunk_classification``.
13 |
14 | Changelog
15 | ---------
16 |
17 | - |Enhancement| :class:`treeple.tree.MultiViewDecisionTreeClassifier` now
18 |   rounds up the number of features to split on to the nearest integer when
19 |   applying ``max_features`` to each feature view, by `Adam Li`_ (:pr:`183`).
20 | - |Feature| :class:`treeple.tree.MultiViewDecisionTreeClassifier` now
21 |   supports an array passed in for ``max_features``, which applies a different
22 |   ``max_features`` value per view, by `Adam Li`_ (:pr:`183`).
23 | - |Fix| :class:`treeple.tree.MultiViewDecisionTreeClassifier` now correctly
24 |   handles the case where one feature view is exhausted while
25 |   another is not, for ``apply_max_features_per_feature_set = False``,
26 |   by `Adam Li`_ (:pr:`183`).
27 | - |Fix| ``treeple.stats.FeatureImportanceForestClassifier`` now correctly passes
28 |   metric kwargs to the null distribution function, by `Adam Li`_ (:pr:`183`).
29 | - |Enhancement| :func:`treeple.datasets.make_trunk_classification` now
30 |   has a generative model based on Trunk and banded covariance, and adds :func:`treeple.datasets.approximate_clf_mutual_information` and
31 |   :func:`treeple.datasets.approximate_clf_mutual_information_with_monte_carlo` to
32 |   approximate mutual information either numerically or via Monte Carlo, by `Adam Li`_ and `Haoyin Xu`_ (:pr:`199`).
33 | - |Enhancement| :class:`treeple.HonestForestClassifier` now has a fitted
34 |   property ``oob_samples_``, which reproduces the sample indices per tree that are out
35 |   of bag, by `Adam Li`_ (:pr:`200`).
36 | - |Enhancement| :class:`treeple.HonestForestClassifier` now allows one to bootstrap-sample more
37 |   than the number of samples, controlled by the ``max_samples`` keyword argument, by `Adam Li`_ (:pr:`206`).
38 | - |Feature| :class:`treeple.HonestForestClassifier` now allows one to specify
39 |   the number of sub-samples to use for the honest trees without having
40 |   to bootstrap-sample. This is specified by the ``max_samples`` parameter.
41 |   By `Adam Li`_ (:pr:`210`)
42 |
43 | Code and Documentation Contributors
44 | -----------------------------------
45 |
46 | Thanks to everyone who has contributed to the maintenance and improvement of
47 | the project since version inception, including:
48 |
49 | * `Adam Li`_
50 | * `Haoyin Xu`_
51 |
--------------------------------------------------------------------------------
/meson.build:
--------------------------------------------------------------------------------
1 | project(
2 | 'treeple',
3 | 'c', 'cpp', 'cython',
4 | # Note that the git commit hash cannot be added dynamically here
5 | # That only happens when importing from a git repository.
6 | # See `treeple/__init__.py`
7 | version: '0.10.3',
8 | license: 'PolyForm Noncommercial 1.0.0',
9 | meson_version: '>= 1.1.0',
10 | default_options: [
11 | 'c_std=c11',
12 | 'cpp_std=c++14',
13 | ],
14 | )
15 |
16 | cc = meson.get_compiler('c')
17 | cpp = meson.get_compiler('cpp')
18 |
19 | # Check compiler is recent enough (see "Toolchain Roadmap" for details)
20 | if cc.get_id() == 'gcc'
21 | if not cc.version().version_compare('>=8.0')
22 | error('treeple requires GCC >= 8.0')
23 | endif
24 | elif cc.get_id() == 'msvc'
25 | if not cc.version().version_compare('>=19.20')
26 | error('treeple requires at least vc142 (default with Visual Studio 2019) ' + \
27 | 'when building with MSVC')
28 | endif
29 | endif
30 |
31 | # Suppress warning for deprecated Numpy API.
32 | # Replace with numpy_nodepr_api after Cython 3.0 is out
33 | # '-Wno-maybe-uninitialized'
34 | # numpy_nodepr_api = '-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION'
35 |
36 | # (Suppress warning messages emitted by #warning directives).
37 | _global_c_args = cc.get_supported_arguments(
38 | '-Wno-unused-but-set-variable',
39 | '-Wno-unused-function',
40 | '-Wno-conversion',
41 | '-Wno-misleading-indentation',
42 | '-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION',
43 | )
44 | add_project_arguments(_global_c_args, language : 'c')
45 |
46 | # We need -lm for all C code (assuming it uses math functions, which is safe to
47 | # assume for treeple). For C++ it isn't needed, because libstdc++/libc++ is
48 | # guaranteed to depend on it. For Fortran code, Meson already adds `-lm`.
49 | m_dep = cc.find_library('m', required : false)
50 | if m_dep.found()
51 | add_project_link_arguments('-lm', language : 'c')
52 | endif
53 |
54 | cython = find_program(
55 | 'cython',
56 | required: true
57 | )
58 | if not cython.found()
59 | error('MESON_BUILD_FAILED: Cython3 not found. Please install it.')
60 | endif
61 |
62 | # r = run_command('git', 'submodule', 'update', '--init', check: false)
63 | r = run_command('mv', 'treeple/_lib/sklearn_fork/sklearn', 'treeple/_lib/sklearn', check: false)
64 |
65 | # Setup Python:
66 | # https://mesonbuild.com/Python-module.html
67 | py = import('python').find_installation(pure: false)
68 |
69 | # print some debugging output
70 | message(py.full_path())
71 | message(py.get_install_dir())
72 | if py.language_version().version_compare('<3.9')
73 | error('At least Python 3.9 is required.')
74 | endif
75 |
76 | subdir('treeple')
77 |
--------------------------------------------------------------------------------
/treeple/tree/unsupervised/_unsup_criterion.pxd:
--------------------------------------------------------------------------------
1 | # cython: boundscheck=False
2 | # cython: wraparound=False
3 | # cython: language_level=3
4 |
5 | from ..._lib.sklearn.tree._criterion cimport BaseCriterion
6 | from ..._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int32_t, intp_t
7 |
8 | # Note: This class is an exact copy of scikit-learn's Criterion
9 | # class, with the exception of the type of the internal structure.
10 | # In scikit-learn, they store a buffer for the y-labels, whereas here
11 | # we store a buffer for the X dataset.
12 | #
13 | # In our criterions, we do not store the 'y-labels' because there are none
14 | # in unsupervised learning. We instead store a memview of the dataset 'X'.
15 |
16 |
17 | cdef class UnsupervisedCriterion(BaseCriterion):
18 | """Abstract unsupervised criterion.
19 |
20 | Notable Changes
21 | ---------------
22 | 1. weighted_n_* : This parameter keeps track of the total "weight" of the samples
23 | in the node, left and right
24 | """
25 |
26 | # The criterion computes the impurity of a node and the reduction of
27 | # impurity of a split on that node. It also computes the output statistics.
28 |
29 | # Internal structures
30 | cdef const float32_t[:] feature_values # 1D memview for the feature vector to compute criterion on
31 |
32 | # Keep running total of Xf[samples[start:end]] and the corresponding sum in
33 | # the left and right node. For example, this can then efficiently compute the
34 | # mean of the node, and left/right child by subtracting relevant Xf elements
35 | # and then dividing by the total number of samples in the node and left/right child.
36 |     cdef float64_t sum_total    # The weighted sum of the feature values in the node.
37 |     cdef float64_t sum_left     # Same as above, but for the left side of the split
38 |     cdef float64_t sum_right    # Same as above, but for the right side of the split
39 |
40 |     cdef float64_t sumsq_total  # The weighted sum of squared feature values in the node.
41 |     cdef float64_t sumsq_left   # Same as above, but for the left side of the split
42 |     cdef float64_t sumsq_right  # Same as above, but for the right side of the split
43 |
44 | # Methods
45 | # -------
46 |     # The 'init' method is copied here with almost the exact same signature
47 | # as that of supervised learning criterion in scikit-learn to ensure that
48 | # Unsupervised criterion can be used with splitter and tree methods.
49 | cdef intp_t init(
50 | self,
51 | const float32_t[:] feature_values,
52 | const float64_t[:] sample_weight,
53 | float64_t weighted_n_samples,
54 | const intp_t[:] samples,
55 | ) except -1 nogil
56 |
57 | cdef void init_feature_vec(
58 | self
59 | ) noexcept nogil
60 |
61 | cdef void set_sample_pointers(
62 | self,
63 | intp_t start,
64 | intp_t end
65 | ) noexcept nogil
66 |
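
The running sums above are what make node statistics O(1) to update per candidate
split; a minimal sketch with unit sample weights:

    import numpy as np

    def node_mean_var(xf, start, end):
        # sum/sumsq play the role of sum_total and sumsq_total
        w = end - start                      # weighted_n_node_samples (unit weights)
        s = xf[start:end].sum()              # sum_total
        sq = np.square(xf[start:end]).sum()  # sumsq_total
        mean = s / w
        return mean, sq / w - mean ** 2      # Var[x] = E[x^2] - E[x]^2

    # moving a split position one sample to the right just transfers that sample's
    # x and x**2 contributions from the right-child sums to the left-child sums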
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: 'bug'
6 | assignees: ''
7 |
8 | ---
9 |
10 |
14 |
15 | ## Checklist
16 |
17 |
18 |
19 | - [ ] I have verified that the issue exists against the `main` branch.
20 | - [ ] I have read the relevant section in the [contribution guide](https://github.com/py-why/pywhy-graphs/blob/main/CONTRIBUTING.md#bug-reports-and-feature-requests) on reporting bugs.
21 | - [ ] I have checked the [issues list](https://github.com/py-why/pywhy-graphs/issues) for similar or identical bug reports.
22 | - [ ] I have checked the [pull requests list](https://github.com/py-why/pywhy-graphs/pulls) for existing proposed fixes.
23 | - [ ] I have checked the [CHANGELOG](https://github.com/py-why/pywhy-graphs/blob/main/CHANGELOG.md) and the [commit log](https://github.com/py-why/pywhy-graphs/commits/main) to find out if the bug was already fixed in the main branch.
24 | - [ ] I have included in the "Description" section below a traceback from any exceptions related to this bug.
25 | - [ ] I have included in the "Related issues or possible duplicates" section beloew all related issues and possible duplicate issues (If there are none, check this box anyway).
26 | - [ ] I have included in the "Environment" section below the name of the operating system and Python version that I was using when I discovered this bug.
27 | - [ ] I have included in the "Environment" section below the output of `pip freeze`.
28 | - [ ] I have included in the "Steps to reproduce" section below a minimally reproducible example.
29 |
30 |
31 | ## Description
32 |
33 |
34 |
35 |
36 | Python traceback:
37 |
38 |
39 |
40 | ```
41 | ```
42 |
43 |
44 |
45 |
46 |
47 | ## Related issues or possible duplicates
48 |
49 | - None
50 |
51 |
52 | ## Environment
53 |
54 |
55 | OS:
56 |
57 |
58 | Python version:
59 |
60 |
61 | Output of pip freeze:
62 |
63 |
64 |
65 | ```
66 | ```
67 |
68 |
69 |
70 |
71 |
72 | ## Steps to reproduce
73 |
74 |
75 |
76 | Example source:
77 |
78 |
79 |
80 | ```
81 | ```
82 |
83 |
84 |
85 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # simple makefile to simplify repetitive build env management tasks under posix
2 |
3 | # caution: testing won't work on windows, see README
4 |
5 | PYTHON ?= python
6 | PYTESTS ?= pytest
7 | CTAGS ?= ctags
8 | CODESPELL_SKIPS ?= "*.fif,*.eve,*.gz,*.tgz,*.zip,*.mat,*.stc,*.label,*.w,*.bz2,*.annot,*.sulc,*.log,*.local-copy,*.orig_avg,*.inflated_avg,*.gii,*.pyc,*.doctree,*.pickle,*.inv,*.png,*.edf,*.touch,*.thickness,*.nofix,*.volume,*.defect_borders,*.mgh,lh.*,rh.*,COR-*,FreeSurferColorLUT.txt,*.examples,.xdebug_mris_calc,bad.segments,BadChannels,*.hist,empty_file,*.orig,*.js,*.map,*.ipynb,searchindex.dat,plot_*.rst,*.rst.txt,*.html,gdf_encodes.txt,treeple/_lib/*,doc/auto_examples/*"
9 | CODESPELL_DIRS ?= treeple/ doc/ examples/ benchmarks/
10 | all: clean inplace test test-doc
11 |
12 | clean-pyc:
13 | find . -name "*.pyc" | xargs rm -f
14 |
15 | clean-build:
16 | rm -rf build
17 | rm -rf dist
18 |
19 | clean-cache:
20 | find . -name "__pycache__" | xargs rm -rf
21 |
22 | clean: clean-build clean-pyc clean-cache
23 |
24 | pytest: test
25 |
26 | test: in
27 | rm -f .coverage
28 | $(PYTESTS) treeple
29 |
30 | test-doc: sample_data testing_data
31 | $(PYTESTS) --doctest-modules --doctest-ignore-import-errors --doctest-glob='*.rst' ./doc/
32 |
33 | flake:
34 | @if command -v flake8 > /dev/null; then \
35 | echo "Running flake8"; \
36 | flake8 --count treeple examples; \
37 | else \
38 | echo "flake8 not found, please install it!"; \
39 | exit 1; \
40 | fi;
41 | @echo "flake8 passed"
42 |
43 | black:
44 | @if command -v black > /dev/null; then \
45 | echo "Running black"; \
46 | black treeple examples; \
47 | else \
48 | echo "black not found, please install it!"; \
49 | exit 1; \
50 | fi;
51 | @echo "black passed"
52 |
53 | isort:
54 | @if command -v isort > /dev/null; then \
55 | echo "Running isort"; \
56 | isort treeple examples doc; \
57 | else \
58 | echo "isort not found, please install it!"; \
59 | exit 1; \
60 | fi;
61 | @echo "isort passed"
62 |
63 | codespell: # running manually
64 | @codespell -w -i 3 -q 3 -S $(CODESPELL_SKIPS) --ignore-words=.codespellignore $(CODESPELL_DIRS)
65 |
66 | codespell-error: # running on travis
67 | @codespell -i 0 -q 7 -S $(CODESPELL_SKIPS) --ignore-words=.codespellignore $(CODESPELL_DIRS)
68 |
69 | pydocstyle:
70 | @echo "Running pydocstyle"
71 | 	@pydocstyle treeple
72 |
73 | docstyle: pydocstyle
74 |
75 | build-doc:
76 | @echo "Building documentation"
77 | make -C doc/ clean
78 | make -C doc/ html
79 | cd doc/ && make view
80 |
81 | build-doc-noplot:
82 | @echo "Building documentation"
83 | make -C doc/ clean
84 | make -C doc/ html-noplot
85 | cd doc/ && make view
86 |
87 | run-checks:
88 | isort --check .
89 | black --check treeple examples
90 | flake8 .
91 | mypy ./treeple
92 | @$(MAKE) pydocstyle
93 | @$(MAKE) codespell-error
94 | ruff .
95 | toml-sort ./pyproject.toml --check
96 | yamllint . -c .yamllint.yml --strict
97 |
98 | pre-commit:
99 | @pre-commit run -a
--------------------------------------------------------------------------------
/doc/sphinxext/github_link.py:
--------------------------------------------------------------------------------
1 | import inspect
2 | import os
3 | import subprocess
4 | import sys
5 | from functools import partial
6 | from operator import attrgetter
7 |
8 | REVISION_CMD = "git rev-parse --short HEAD"
9 |
10 |
11 | def _get_git_revision():
12 | try:
13 | revision = subprocess.check_output(REVISION_CMD.split()).strip()
14 | except (subprocess.CalledProcessError, OSError):
15 | print("Failed to execute git to get revision")
16 | return None
17 | return revision.decode("utf-8")
18 |
19 |
20 | def _linkcode_resolve(domain, info, package, url_fmt, revision):
21 | """Determine a link to online source for a class/method/function
22 |
23 | This is called by sphinx.ext.linkcode
24 |
25 | An example with a long-untouched module that everyone has
26 | >>> _linkcode_resolve('py', {'module': 'tty',
27 | ... 'fullname': 'setraw'},
28 | ... package='tty',
29 | ... url_fmt='http://hg.python.org/cpython/file/'
30 | ... '{revision}/Lib/{package}/{path}#L{lineno}',
31 | ... revision='xxxx')
32 | 'http://hg.python.org/cpython/file/xxxx/Lib/tty/tty.py#L18'
33 | """
34 |
35 | if revision is None:
36 | return
37 | if domain not in ("py", "pyx"):
38 | return
39 | if not info.get("module") or not info.get("fullname"):
40 | return
41 |
42 | class_name = info["fullname"].split(".")[0]
43 | module = __import__(info["module"], fromlist=[class_name])
44 | obj = attrgetter(info["fullname"])(module)
45 |
46 | # Unwrap the object to get the correct source
47 | # file in case that is wrapped by a decorator
48 | obj = inspect.unwrap(obj)
49 |
50 | try:
51 | fn = inspect.getsourcefile(obj)
52 | except Exception:
53 | fn = None
54 | if not fn:
55 | try:
56 | fn = inspect.getsourcefile(sys.modules[obj.__module__])
57 | except Exception:
58 | fn = None
59 | if not fn:
60 | return
61 |
62 | fn = os.path.relpath(fn, start=os.path.dirname(__import__(package).__file__))
63 | try:
64 | lineno = inspect.getsourcelines(obj)[1]
65 | except Exception:
66 | lineno = ""
67 | return url_fmt.format(revision=revision, package=package, path=fn, lineno=lineno)
68 |
69 |
70 | def make_linkcode_resolve(package, url_fmt):
71 | """Returns a linkcode_resolve function for the given URL format
72 |
73 | revision is a git commit reference (hash or name)
74 |
75 | package is the name of the root module of the package
76 |
77 | url_fmt is along the lines of ('https://github.com/USER/PROJECT/'
78 | 'blob/{revision}/{package}/'
79 | '{path}#L{lineno}')
80 | """
81 | revision = _get_git_revision()
82 | return partial(_linkcode_resolve, revision=revision, package=package, url_fmt=url_fmt)
83 |
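
A sketch of the intended ``conf.py`` wiring for ``sphinx.ext.linkcode`` (the URL
format follows the docstring above; the repository URL is this project's):

    # conf.py (sketch)
    from github_link import make_linkcode_resolve  # doc/sphinxext must be on sys.path

    extensions = ["sphinx.ext.linkcode"]
    linkcode_resolve = make_linkcode_resolve(
        "treeple",
        "https://github.com/neurodata/treeple/blob/{revision}/{package}/{path}#L{lineno}",
    )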
--------------------------------------------------------------------------------
/doc/install.rst:
--------------------------------------------------------------------------------
1 | :orphan:
2 |
3 | Installation
4 | ============
5 |
6 | Dependencies
7 | ------------
8 |
9 | * ``numpy`` (>=1.23)
10 | * ``scipy`` (>=1.5.0)
11 | * ``scikit-learn`` (>=1.3)
12 | * ``joblib`` (>=1.0.0)
13 | * ``matplotlib`` (optional)
14 |
15 | **treeple** supports Python >= 3.9.
16 |
17 | Installing with ``pip``
18 | -----------------------
19 |
20 | **treeple** is available on `PyPI <https://pypi.org/project/treeple/>`_. Just run
21 |
22 | .. code-block:: bash
23 |
24 | pip install treeple
25 |
26 | Installing from source with Meson
27 | ---------------------------------
28 |
29 | To install **treeple** from source, first clone the `repository <https://github.com/neurodata/treeple>`_:
30 |
31 | .. code-block:: bash
32 |
33 | git clone https://github.com/neurodata/treeple.git
34 | cd treeple
35 |
36 | # ideally, you should always start within a virtual environment
37 | conda create -n sklearn-dev python=3.9
38 | conda activate sklearn-dev
39 |
40 | Then install the build packages:
41 |
42 | .. code-block:: bash
43 |
44 | pip install -r build_requirements.txt
45 | pip install spin
46 |
47 | # use spin CLI to run Meson build locally
48 | ./spin build -j 2
49 |
50 | # you can now run tests
51 | ./spin test
52 |
53 | Via pip, you can also install in editable mode (pending Meson-Python support):
54 |
55 | .. code-block:: bash
56 |
57 |     pip install -e .
58 |
59 |     # if editing Cython files
60 |     pip install --verbose --no-build-isolation --editable .
61 |
62 | Alternatively, install the latest development snapshot directly from GitHub:
63 |
64 | .. code-block:: bash
65 |
66 |     pip install --user -U https://api.github.com/repos/neurodata/treeple/zipball/master
67 |
68 | Conda (Recommended)
69 | -------------------
70 | First, create a virtual environment using Conda:
71 |
72 | .. code-block:: bash
73 |
74 |     conda create -n sklearn-dev python=3.9
75 |
76 |     # activate the virtual environment and install necessary packages to build from source
77 |     conda activate sklearn-dev
78 |     conda install -c conda-forge numpy scipy cython joblib threadpoolctl pytest compilers llvm-openmp
79 |
80 | Next, install ``treeple`` from source:
81 |
82 | .. code-block:: bash
83 |
84 |     pip install .[build]
85 |
86 |     # if editing Cython files
87 |     pip install --verbose --no-build-isolation --editable .
88 |
89 | To install the package from GitHub, clone the repository and then ``cd`` into the directory:
90 |
91 | .. code-block:: bash
92 |
93 |     ./spin build
94 |
95 |     # if you would like an editable install of treeple for dev purposes
96 |     pip install --verbose --no-build-isolation --editable .
97 |
98 | You can also install the ``main`` branch directly:
99 |
100 | .. code-block:: bash
101 |
102 |     pip install https://api.github.com/repos/neurodata/treeple/zipball/main
103 |
104 | Note that currently, we need to build the development version of scikit-learn with oblique trees from this `PR `_:
105 |
106 | .. code-block:: bash
107 |
108 |     pip install https://api.github.com/repos/neurodata/scikit-learn/zipball/obliquepr
109 |
110 | Checkout this PR code and build from source, following scikit-learn's build-from-source instructions.
111 |
--------------------------------------------------------------------------------
/doc/index.rst:
--------------------------------------------------------------------------------
1 | **treeple**
2 | ===================
3 | treeple is a package for modern tree-based algorithms for supervised and unsupervised
4 | learning problems. It extends the robust API of `scikit-learn `_
5 | for tree algorithms that achieve strong performance in benchmark tasks.
6 |
7 | Our package implements unsupervised forests (Geodesic Forests
8 | [Madhyastha2020]_), oblique random forests (SPORF [Tomita2020]_), manifold
9 | oblique random forests (MORF [Li2023]_), honest forests [Perry2021]_, extended isolation forests [Hariri2019]_, and more.
10 |
11 | For all forests, we also support incremental building of the forests, using the
12 | ``partial_fit`` API from scikit-learn [Xu2022]_, and quantile regression by storing
13 | the training samples in the leaves of the trees [Meinshausen2006]_ (Warning: high memory usage
14 | will occur in this setting since predicting quantiles stores the training data within the
15 | leaves of the tree).
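
Below is a minimal sketch of incremental fitting, assuming the forest follows
scikit-learn's ``partial_fit`` convention (``classes`` passed on the first call):

.. code-block:: python

    import numpy as np

    from treeple import RandomForestClassifier

    rng = np.random.default_rng(0)
    X = rng.standard_normal((200, 5))
    y = rng.integers(0, 2, size=200)

    est = RandomForestClassifier(n_estimators=10, random_state=0)
    # stream the data in two batches; ``partial_fit`` is assumed to follow
    # the scikit-learn convention described above
    est.partial_fit(X[:100], y[:100], classes=[0, 1])
    est.partial_fit(X[100:], y[100:])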
16 |
17 | We encourage you to use the package for your research and also build on top
18 | with relevant Pull Requests. See our examples for walk-throughs of how to use the package.
19 | Also, see our `contributing guide `_.
20 |
21 | We are licensed under the PolyForm Noncommercial License (see `License `_).
22 |
23 | .. topic:: References
24 |
25 | .. [Hariri2019] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
26 | "Extended isolation forest." IEEE transactions on knowledge and data
27 | engineering 33.4 (2019): 1479-1489.
28 |
29 |    .. [Meinshausen2006] Meinshausen, Nicolai, and Greg Ridgeway. "Quantile regression forests."
30 |       Journal of Machine Learning Research 7.6 (2006).
31 |
32 | .. [Madhyastha2020] Madhyastha, Meghana, et al. :doi:`"Geodesic Forests"
33 | <10.1145/3394486.3403094>`, KDD 2020, 513-523, 2020.
34 |
35 | .. [Tomita2020] Tomita, Tyler M., et al. "Sparse Projection Oblique
36 | Randomer Forests", The Journal of Machine Learning Research, 21(104),
37 | 1-39, 2020.
38 |
39 | .. [Li2023] Li, Adam, et al. :doi:`"Manifold Oblique Random Forests: Towards
40 | Closing the Gap on Convolutional Deep Networks" <10.1137/21M1449117>`,
41 | SIAM Journal on Mathematics of Data Science, 5(1), 77-96, 2023.
42 |
43 | .. [Perry2021] Perry, Ronan, et al. :arxiv:`"Random Forests for Adaptive
44 | Nearest Neighbor Estimation of Information-Theoretic Quantities"
45 | <1907.00325>`, arXiv preprint arXiv:1907.00325, 2021.
46 |
47 | .. [Xu2022] Xu, Haoyin, et al. :arxiv:`"Simplest Streaming Trees"
48 | <2110.08483>`, arXiv preprint arXiv:2110.08483, 2022.
49 |
50 | Contents
51 | --------
52 |
53 | .. toctree::
54 | :maxdepth: 2
55 | :caption: Getting started:
56 |
57 | api
58 | User Guide
59 | whats_new
60 | install
61 | use
62 |
63 | Indices and tables
64 | ------------------
65 |
66 | * :ref:`genindex`
67 | * :ref:`modindex`
68 |
--------------------------------------------------------------------------------
/doc/modules/unsupervised_tree.rst:
--------------------------------------------------------------------------------
1 | .. _unsupervised_tree:
2 |
3 | ===========================
4 | Unsupervised Decision Trees
5 | ===========================
6 |
7 | .. currentmodule:: sklearn.tree
8 |
9 | In unsupervised learning, the goal is to identify patterns
10 | or structure in data without using labeled examples. Clustering is a common
11 | unsupervised learning technique that groups similar examples together
12 | based on their features. Unsupervised tree models are an adaptive way of generating
13 | clusters of samples. For information on supervised tree models, see :ref:`supervised_tree`.
14 |
15 | In this guide, we overview the :ref:`unsup_criterion` used for splitting unsupervised trees,
16 | and methods for evaluating the quality of the tree model in :ref:`unsup_evaluation`.
17 |
18 | .. _unsup_criterion:
19 |
20 | Unsupervised Criterion
21 | ----------------------
22 |
23 | Unsupervised tree models use a variety of criteria to split nodes.
24 |
25 | Two-Means
26 | ~~~~~~~~~
27 |
28 | The two-means split finds the cutpoint that minimizes the one-dimensional
29 | 2-means objective, i.e. the cutoff point at which the total within-cluster
30 | variance of cluster 1 and cluster 2 is minimal:
31 |
32 | .. math::
33 | \min_s \sum_{i=1}^s (x_i - \hat{\mu}_1)^2 + \sum_{i=s+1}^N (x_i - \hat{\mu}_2)^2
34 |
35 | where :math:`x` is the sorted vector of feature values, :math:`N` is the number of
36 | samples, :math:`s` is the candidate split index, and :math:`\hat{\mu}_1, \hat{\mu}_2` are the estimated means of clusters 1 and 2.
37 |
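The following is a brute-force NumPy sketch of this objective, for illustration
only (the actual splitter evaluates candidate cutpoints incrementally in Cython):

.. code-block:: python

    import numpy as np

    def two_means_split(x):
        """Return the cutpoint index minimizing the 1-D 2-means objective."""
        x = np.sort(np.asarray(x, dtype=float))
        best_s, best_obj = None, np.inf
        for s in range(1, len(x)):
            left, right = x[:s], x[s:]
            obj = ((left - left.mean()) ** 2).sum() + ((right - right.mean()) ** 2).sum()
            if obj < best_obj:
                best_s, best_obj = s, obj
        return best_s, best_obj
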
38 | Fast-BIC
39 | ~~~~~~~~
40 |
41 | The Bayesian Information Criterion (BIC) is a popular model selection
42 | criterion that is based on the log likelihood of the model given data.
43 | Fast-BIC :footcite:`Meghana2019_geodesicrf` is a method that combines the speed of the
44 | :class:`sklearn.cluster.KMeans` clustering method with the model flexibility
45 | of Mclust-BIC. It sorts data for each feature and tries all possible splits to
46 | assign data points to one of two Gaussian distributions based on their position
47 | relative to the split.
48 | The parameters for each cluster are estimated using maximum likelihood
49 | estimation (MLE). The method performs hard clustering rather than soft
50 | clustering (as in a GMM), resulting in a simpler calculation of the likelihood.
51 |
52 | .. math::
53 |
54 | \hat{L} = \sum_{n=1}^s[\log\hat{\pi}_1+\log{\mathcal{N}(x_n;\hat{\mu}_1,\hat{\sigma}_1^2)}]
55 | + \sum_{n=s+1}^N[\log\hat{\pi}_2+\log{\mathcal{N}(x_n;\hat{\mu}_2,\hat{\sigma}_2^2)}]
56 |
57 | where the prior, mean, and variance are defined as follows, respectively:
58 |
59 | .. math::
60 |
61 | \hat{\pi} = \frac{s}{N},\quad\quad
62 | \hat{\mu} = \frac{1}{s}\sum_{n\le s}{x_n},\quad\quad
63 |    \hat{\sigma}^2 = \frac{1}{s}\sum_{n\le s}{||x_n-\hat{\mu}||^2}
64 |
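A small NumPy/SciPy sketch of this hard-assignment log-likelihood, again for
illustration only (a variance floor is added to avoid degenerate clusters):

.. code-block:: python

    import numpy as np
    from scipy.stats import norm

    def fast_bic_loglik(x, s):
        """Log-likelihood of splitting sorted 1-D data ``x`` at index ``s``."""
        x = np.sort(np.asarray(x, dtype=float))
        n = len(x)
        loglik = 0.0
        for cluster in (x[:s], x[s:]):
            pi = len(cluster) / n               # cluster prior
            mu = cluster.mean()                 # MLE mean
            sigma = np.sqrt(max(cluster.var(), 1e-12))  # MLE std with floor
            loglik += np.sum(np.log(pi) + norm.logpdf(cluster, mu, sigma))
        return loglik
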
65 | .. _unsup_evaluation:
66 |
67 | Evaluating Unsupervised Trees
68 | -----------------------------
69 |
70 | In clustering settings, there may be no natural notion of "true" class labels,
71 | so the efficacy of the clustering scheme is often measured with metrics such as
72 | :func:`sklearn.metrics.adjusted_rand_score`, which compares the induced clustering against reference labels when they are available.
73 |
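As a sketch, one way to obtain cluster labels from an unsupervised forest is to
cluster its similarity matrix and, when reference labels exist, score the result:

.. code-block:: python

    from sklearn.cluster import AgglomerativeClustering
    from sklearn.datasets import make_blobs
    from sklearn.metrics import adjusted_rand_score

    from treeple import UnsupervisedRandomForest

    X, y = make_blobs(n_samples=200, centers=2, random_state=0)
    est = UnsupervisedRandomForest(random_state=0).fit(X)
    sim = est.compute_similarity_matrix(X)

    # cluster on the induced dissimilarity (1 - similarity)
    labels = AgglomerativeClustering(
        n_clusters=2, metric="precomputed", linkage="average"
    ).fit_predict(1.0 - sim)
    print(adjusted_rand_score(y, labels))
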
74 | .. topic:: References
75 |
76 | .. footbibliography::
77 |
--------------------------------------------------------------------------------
/doc/whats_new/v0.7.rst:
--------------------------------------------------------------------------------
1 | :orphan:
2 |
3 | .. include:: _contributors.rst
4 | .. currentmodule:: treeple
5 |
6 | .. _v0_7:
7 |
8 | Version 0.7
9 | ===========
10 |
11 | This release adds the ability to separate in-bag and out-of-bag samples for
12 | any forest model. We also introduce a new class for fitting honest forests while
13 | permuting the covariate index, and a new set of simulations based on Marron and Wand 1992.
14 |
15 | In addition, various patches were made to support hypothesis testing of
16 | feature sets with scikit-tree.
17 |
18 | Changelog
19 | ---------
20 |
21 | - |Feature| Introduce a new light-weight class for fitting honest forests while
22 | permuting the covariate index :class:`treeple.stats.PermutationHonestForestClassifier`,
23 | by `Adam Li`_ (:pr:`#211`)
24 | - |Feature| Introduce a new class method ``predict_proba_per_tree`` for all
25 |   Forest classifiers, which will predict the probability per tree and return
26 |   an output of shape ``(n_estimators, n_samples, n_classes)``,
27 | by `Adam Li`_ (:pr:`#211`)
28 | - |Feature| Introduce a new class fitted attribute ``oob_samples_`` for all
29 |   Forest models, which keeps track of the out-of-bag samples used for each tree,
30 |   by `Adam Li`_ (:pr:`#211`)
31 | - |Feature| Introduce a new set of simulations based on Marron and Wand 1992.
32 | by `Sambit Panda`_ (:pr:`#203`)
33 | - |Feature| :func:`treeple.stats.build_coleman_forest` and :func:`treeple.stats.build_permutation_forest`
34 | are added to compute p-values given an estimator and permutation-estimator, `Adam Li`_ (:pr:`#222`)
35 | - |API| :func:`treeple.datasets.make_trunk_classification` for generating trunk mixture and Marron-Wand
36 | simulations are separated out into :func:`treeple.datasets.make_marron_wand_classification` and
37 | :func:`treeple.datasets.make_trunk_mixture_classification`, `Adam Li`_ (:pr:`#227`)
38 | - |API| :class:`treeple.HonestForestClassifier` and :class:`treeple.tree.HonestTreeClassifier`
39 | now overwrite all parameters set by the underlying ``tree_estimator`` and allow you to directly
40 | pass any extra parameters that ``tree_estimator`` has compared to the original
41 | :class:`~sklearn.tree.DecisionTreeClassifier`, `Adam Li`_ (:pr:`#228`)
42 | - |Fix| Trunk simulators now correctly generate random values with a fixed seed,
43 | by `Sambit Panda`_ (:pr:`#236`)
46 | - |Efficiency| All scikit-tree estimators are now at least 2X faster than they were
47 |   in previous versions. This was achieved by adding compiler directives to turn on
48 |   the '-O3' optimization level when compiling the C++ code generated from Cython. In
49 |   addition, we explicitly turned off bounds-checking and related runtime checks in
50 |   the Cython code, which would otherwise degrade runtime performance. By `Adam Li`_ (:pr:`#242`)
51 |
52 | Code and Documentation Contributors
53 | -----------------------------------
54 |
55 | Thanks to everyone who has contributed to the maintenance and improvement of
56 | the project since version inception, including:
57 |
58 | * `Adam Li`_
59 | * `Sambit Panda`_
60 |
--------------------------------------------------------------------------------
/treeple/tree/unsupervised/_unsup_tree.pxd:
--------------------------------------------------------------------------------
1 | # Authors: Adam Li
2 | # Jong Shin
3 | #
4 |
5 | # License: BSD 3 clause
6 |
7 | # See _unsup_tree.pyx for details.
8 |
9 | import numpy as np
10 |
11 | cimport numpy as cnp
12 |
13 | from ..._lib.sklearn.tree._splitter cimport SplitRecord
14 | from ..._lib.sklearn.tree._tree cimport BaseTree, Node, ParentInfo
15 | from ..._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t
16 | from ._unsup_splitter cimport UnsupervisedSplitter
17 |
18 |
19 | # TODO: copy changes from https://github.com/scikit-learn/scikit-learn/pull/25540/files
20 | cdef class UnsupervisedTree(BaseTree):
21 | # The Tree object is a binary tree structure constructed by the
22 | # TreeBuilder. The tree structure is used for predictions and
23 | # feature importances.
24 | #
25 | # Inner structures: values are stored separately from node structure,
26 | # since size is determined at runtime.
27 | # cdef float64_t* value # (capacity) array of values
28 | # cdef intp_t value_stride # = 1
29 |
30 | # Input/Output layout
31 | cdef public intp_t n_features # Number of features in X
32 |
33 | # Methods
34 | cdef cnp.ndarray _get_value_ndarray(self)
35 | cdef cnp.ndarray _get_node_ndarray(self)
36 |
37 | # Overridden Methods
38 | cdef int _set_split_node(
39 | self,
40 | SplitRecord* split_node,
41 | Node* node,
42 | intp_t node_id
43 | ) except -1 nogil
44 | cdef float32_t _compute_feature(
45 | self,
46 | const float32_t[:, :] X_ndarray,
47 | intp_t sample_index,
48 | Node *node
49 | ) noexcept nogil
50 | cdef void _compute_feature_importances(
51 | self,
52 | cnp.float64_t[:] importances,
53 | Node* node
54 | ) noexcept nogil
55 |
56 | # =============================================================================
57 | # Tree builder
58 | # =============================================================================
59 |
60 | cdef class UnsupervisedTreeBuilder:
61 | # The TreeBuilder recursively builds a Tree object from training samples,
62 | # using a Splitter object for splitting internal nodes and assigning
63 | # values to leaves.
64 | #
65 | # This class controls the various stopping criteria and the node splitting
66 | # evaluation order, e.g. depth-first or best-first.
67 |
68 | cdef UnsupervisedSplitter splitter # Splitting algorithm
69 |
70 | cdef intp_t min_samples_split # Minimum number of samples in an internal node
71 | cdef intp_t min_samples_leaf # Minimum number of samples in a leaf
72 | cdef float64_t min_weight_leaf # Minimum weight in a leaf
73 | cdef intp_t max_depth # Maximal tree depth
74 | cdef float64_t min_impurity_decrease # Impurity threshold for early stopping
75 |
76 | cpdef build(
77 | self,
78 | UnsupervisedTree tree,
79 | object X,
80 | const float64_t[:] sample_weight=*
81 | )
82 | cdef _check_input(
83 | self,
84 | object X,
85 | const float64_t[:] sample_weight
86 | )
87 |
--------------------------------------------------------------------------------
/treeple/tree/tests/test_honest_prune.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from treeple.tree import HonestTreeClassifier
4 |
5 |
6 | def test_honest_tree_pruning():
7 | """Test honest tree with pruning to ensure no empty leaves."""
8 | rng = np.random.default_rng(1234)
9 |
10 | n_samples = 1000
11 | X = rng.standard_normal(size=(n_samples, 100))
12 | X[n_samples // 2 :] *= -1
13 | y = [0] * (n_samples // 2) + [1] * (n_samples // 2)
14 |
15 | clf = HonestTreeClassifier(honest_method="prune", max_features="sqrt", random_state=0)
16 | clf = clf.fit(X, y)
17 |
18 | nonprune_clf = HonestTreeClassifier(
19 | honest_method="apply", max_features="sqrt", random_state=0, honest_prior="ignore"
20 | )
21 | nonprune_clf = nonprune_clf.fit(X, y)
22 |
23 | assert (
24 | nonprune_clf.tree_.max_depth >= clf.tree_.max_depth
25 |     ), f"{nonprune_clf.tree_.max_depth} < {clf.tree_.max_depth}"
26 | # assert np.all(clf.tree_.children_left != -1)
27 |
28 | # Access the original and pruned trees' attributes
29 | original_tree = nonprune_clf.tree_
30 | pruned_tree = clf.tree_
31 |
32 | # Ensure the pruned tree has fewer or equal nodes
33 | assert (
34 | pruned_tree.node_count < original_tree.node_count
35 | ), "Pruned tree has more nodes than the original tree"
36 |
37 |     # Ensure the pruned tree has no empty leaves; the unpruned tree may
38 |     # contain empty leaves, so its fraction of non-empty nodes can be lower
39 |     assert np.all(pruned_tree.value.sum(axis=(1, 2)) > 0), pruned_tree.value.sum(axis=(1, 2))
40 |     assert np.mean(pruned_tree.value.sum(axis=(1, 2)) > 0) >= np.mean(
41 |         original_tree.value.sum(axis=(1, 2)) > 0
42 |     )
43 |
44 | # test that the first three nodes are the same, since these are unlikely to be
45 | # pruned, and should remain invariant.
46 | #
47 | # Note: pruning the tree will have the node_ids change since the tree is
48 | # ordered via DFS.
49 | for pruned_node_id in range(3):
50 | pruned_left_child = pruned_tree.children_left[pruned_node_id]
51 | pruned_right_child = pruned_tree.children_right[pruned_node_id]
52 |
53 | # Check if the pruned node exists in the original tree
54 | assert (
55 | pruned_left_child in original_tree.children_left
56 | ), "Left child node of pruned tree not found in original tree"
57 | assert (
58 | pruned_right_child in original_tree.children_right
59 | ), "Right child node of pruned tree not found in original tree"
60 |
61 | # Check if the node's parameters match for non-leaf nodes
62 | if pruned_left_child != -1:
63 | assert (
64 | pruned_tree.feature[pruned_node_id] == original_tree.feature[pruned_node_id]
65 | ), "Feature does not match for node {}".format(pruned_node_id)
66 | assert (
67 | pruned_tree.threshold[pruned_node_id] == original_tree.threshold[pruned_node_id]
68 | ), "Threshold does not match for node {}".format(pruned_node_id)
69 | assert (
70 | pruned_tree.weighted_n_node_samples[pruned_node_id]
71 | == original_tree.weighted_n_node_samples[pruned_node_id]
72 | ), "Weighted n_node samples does not match for node {}".format(pruned_node_id)
73 |
--------------------------------------------------------------------------------
/treeple/tests/test_neighbors.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pytest
3 | from sklearn.datasets import make_blobs, make_classification
4 | from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
5 | from sklearn.neighbors import NearestNeighbors
6 | from sklearn.tree import (
7 | DecisionTreeClassifier,
8 | DecisionTreeRegressor,
9 | ExtraTreeClassifier,
10 | ExtraTreeRegressor,
11 | )
12 | from sklearn.utils.estimator_checks import parametrize_with_checks
13 |
14 | from treeple.ensemble import (
15 | ObliqueRandomForestClassifier,
16 | PatchObliqueRandomForestClassifier,
17 | UnsupervisedObliqueRandomForest,
18 | UnsupervisedRandomForest,
19 | )
20 | from treeple.neighbors import NearestNeighborsMetaEstimator
21 |
22 | FORESTS = [
23 | ObliqueRandomForestClassifier,
24 | PatchObliqueRandomForestClassifier,
25 | UnsupervisedRandomForest,
26 | UnsupervisedObliqueRandomForest,
27 | ]
28 |
29 |
30 | @pytest.mark.parametrize("forest", FORESTS)
31 | def test_similarity_matrix(forest):
32 | n_samples = 200
33 | n_classes = 2
34 | n_features = 5
35 |
36 | X, y = make_blobs(
37 | n_samples=n_samples, centers=n_classes, n_features=n_features, random_state=12345
38 | )
39 |
40 | clf = forest(random_state=12345)
41 | clf.fit(X, y)
42 | sim_mat = clf.compute_similarity_matrix(X)
43 |
44 | assert sim_mat.shape == (n_samples, n_samples)
45 | assert np.allclose(sim_mat, sim_mat.T)
46 | assert np.all((sim_mat.diagonal() == 1))
47 |
48 |
49 | @pytest.fixture
50 | def sample_data():
51 | # Generate sample data for testing
52 | X, y = make_classification(n_samples=100, n_features=10, random_state=42)
53 | return X, y
54 |
55 |
56 | @pytest.mark.parametrize(
57 | "estimator",
58 | [
59 | DecisionTreeClassifier(random_state=0),
60 | DecisionTreeRegressor(random_state=0),
61 | ExtraTreeClassifier(random_state=0),
62 | ExtraTreeRegressor(random_state=0),
63 | RandomForestClassifier(random_state=0, n_estimators=10),
64 | ExtraTreesClassifier(random_state=0, n_estimators=10),
65 | ],
66 | )
67 | def test_nearest_neighbors_meta_estimator(sample_data, estimator):
68 | X, y = sample_data
69 | estimator.fit(X, y)
70 |
71 | meta_estimator = NearestNeighborsMetaEstimator(estimator)
72 |
73 | # Fit the meta-estimator
74 | meta_estimator.fit(X, y)
75 |
76 | # Test the fitted estimator attribute
77 | assert hasattr(meta_estimator, "estimator_")
78 |
79 | # Test the nearest neighbors estimator
80 | assert isinstance(meta_estimator.neigh_est_, NearestNeighbors)
81 |
82 | # Test the kneighbors method
83 | neigh_dist, neigh_ind = meta_estimator.kneighbors()
84 | assert neigh_dist.shape == (X.shape[0], meta_estimator.n_neighbors)
85 | assert neigh_ind.shape == (X.shape[0], meta_estimator.n_neighbors)
86 |
87 | # Test the radius_neighbors method
88 | neigh_dist, neigh_ind = meta_estimator.radius_neighbors(radius=0.5)
89 | assert neigh_dist.shape == (X.shape[0],)
90 | assert neigh_ind.shape == (X.shape[0],)
91 |
92 |
93 | @parametrize_with_checks(
94 | [
95 | NearestNeighborsMetaEstimator(DecisionTreeClassifier(random_state=0)),
96 | ]
97 | )
98 | def test_sklearn_compatible_transformer(estimator, check):
99 | check(estimator)
100 |
--------------------------------------------------------------------------------
/examples/treeple/treeple_tutorial_1_1d_HD.py:
--------------------------------------------------------------------------------
1 | """
2 | ==============================
3 | Calculating Hellinger Distance
4 | ==============================
5 | """
6 |
7 | import matplotlib.pyplot as plt
8 | import numpy as np
9 | import seaborn as sns
10 |
11 | from treeple.datasets import make_trunk_classification
12 | from treeple.ensemble import HonestForestClassifier
13 | from treeple.stats import build_oob_forest
14 |
15 | sns.set(color_codes=True, style="white", context="talk", font_scale=1.5)
16 | PALETTE = sns.color_palette("Set1")
17 | sns.set_palette(PALETTE[1:5] + PALETTE[6:], n_colors=9)
18 | sns.set_style("white", {"axes.edgecolor": "#dddddd"})
19 |
20 | # %%
21 | # Hellinger Distance
22 | # ------------------
23 | #
24 | # Hellinger distance quantifies the dissimilarity between the two posterior
25 | # probability distributions (class zero and class one); it is zero when the two coincide.
26 | #
27 | # .. math:: H(\eta(X), 1-\eta(X)) = \frac{1}{\sqrt{2}} \; \bigl\|\sqrt{\eta(X)} - \sqrt{1-\eta(X)} \bigr\|_2
28 | #
29 | # With a binary class simulation as an example, this tutorial will show
30 | # how to use ``treeple`` to calculate the statistic.
31 |
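# %%
# As a quick sanity check, here is the distance between two toy posterior
# vectors (values chosen purely for illustration):

p_toy = np.array([0.5, 0.5])
q_toy = np.array([0.9, 0.1])
print(np.sqrt(np.sum((np.sqrt(p_toy) - np.sqrt(q_toy)) ** 2)) / np.sqrt(2))  # ~0.33
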
32 | # %%
33 | # Create a simulation with two gaussians
34 | # --------------------------------------
35 |
36 |
37 | # create a binary class simulation with two gaussians
38 | # 500 samples for each class, class zero is standard
39 | # gaussian, and class one has a mean at one
40 | X, y = make_trunk_classification(
41 | n_samples=1000,
42 | n_dim=1,
43 | mu_0=0,
44 | mu_1=1,
45 | n_informative=1,
46 | seed=1,
47 | )
48 |
49 |
50 | fig, ax = plt.subplots(figsize=(6, 6))
51 | fig.tight_layout()
52 | ax.tick_params(labelsize=15)
53 |
54 | # histogram plot the samples
55 | ax.hist(X[:500], bins=50, alpha=0.6, color=PALETTE[1], label="negative")
56 | ax.hist(X[500:], bins=50, alpha=0.3, color=PALETTE[0], label="positive")
57 | ax.set_xlabel("Variable One", fontsize=15)
58 | ax.set_ylabel("Likelihood", fontsize=15)
59 | plt.legend(frameon=False, fontsize=15)
60 | plt.show()
61 |
62 | # %%
63 | # Fit the model
64 | # -------------
65 |
66 |
67 | # initialize the forest with 100 trees
68 | est = HonestForestClassifier(
69 | n_estimators=100,
70 | max_samples=1.6,
71 | max_features=0.3,
72 | bootstrap=True,
73 | stratify=True,
74 | random_state=1,
75 | )
76 |
77 | # fit the model and obtain the tree posteriors
78 | _, observe_proba = build_oob_forest(est, X, y)
79 |
80 | # generate forest posteriors for the two classes
81 | observe_proba = np.nanmean(observe_proba, axis=0)
82 |
83 |
84 | fig, ax = plt.subplots(figsize=(6, 6))
85 | fig.tight_layout()
86 | ax.tick_params(labelsize=15)
87 |
88 | # histogram plot the posterior probabilities for class one
89 | ax.hist(observe_proba[:500][:, 1], bins=50, alpha=0.6, color=PALETTE[1], label="negative")
90 | ax.hist(observe_proba[500:][:, 1], bins=50, alpha=0.3, color=PALETTE[0], label="positive")
91 | ax.set_ylabel("# of Samples", fontsize=15)
92 | ax.set_xlabel("Class One Posterior", fontsize=15)
93 | plt.legend(frameon=False, fontsize=15)
94 | plt.show()
95 |
96 | # %%
97 | # Calculate the statistic
98 | # -----------------------
99 |
100 |
101 | def Calculate_hd(y_pred_proba) -> float:
102 | return np.sqrt(
103 | np.sum((np.sqrt(y_pred_proba[:, 1]) - np.sqrt(y_pred_proba[:, 0])) ** 2)
104 | ) / np.sqrt(2)
105 |
106 |
107 | hd = Calculate_hd(observe_proba)
108 | print("Hellinger distance =", round(hd, 2))
109 |
--------------------------------------------------------------------------------
/examples/sklearn_vs_treeple/plot_iris_dtc.py:
--------------------------------------------------------------------------------
1 | """
2 | =======================================================================
3 | Plot the decision surface of decision trees trained on the iris dataset
4 | =======================================================================
5 |
6 | Plot the decision surface of a decision tree and oblique decision tree
7 | trained on pairs of features of the iris dataset.
8 |
9 | See :ref:`decision tree ` for more information on the estimators.
10 |
11 | For each pair of iris features, the decision tree learns axis-aligned decision
12 | boundaries made of combinations of simple thresholding rules inferred from
13 | the training samples. The oblique decision tree learns oblique decision boundaries
14 | made from linear combinations of the features in the training samples, followed by
15 | the same thresholding rule as regular decision trees.
16 |
17 | We also show the tree structure of a model built on all of the features.
18 | """
19 |
20 | import matplotlib.pyplot as plt
21 | import numpy as np
22 | from sklearn.datasets import load_iris
23 | from sklearn.inspection import DecisionBoundaryDisplay
24 |
25 | from treeple._lib.sklearn.tree import DecisionTreeClassifier, plot_tree
26 | from treeple.tree import ObliqueDecisionTreeClassifier
27 |
28 | # %%
29 | # First load the copy of the Iris dataset shipped with scikit-learn:
30 | iris = load_iris()
31 |
32 | # Parameters
33 | n_classes = 3
34 | plot_colors = "ryb"
35 | plot_step = 0.02
36 |
37 | clf_labels = ["Axis-aligned", "Oblique"]
38 | random_state = 123456
39 |
40 | clfs = [
41 | DecisionTreeClassifier(random_state=random_state),
42 | ObliqueDecisionTreeClassifier(random_state=random_state),
43 | ]
44 |
45 | for clf, clf_label in zip(clfs, clf_labels):
46 | fig, axes = plt.subplots(2, 3)
47 | axes = axes.flatten()
48 |
49 | for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]):
50 | # We only take the two corresponding features
51 | X = iris.data[:, pair]
52 | y = iris.target
53 |
54 | # Train
55 | clf.fit(X, y)
56 |
57 | # Plot the decision boundary
58 | ax = axes[pairidx]
59 | plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)
60 | DecisionBoundaryDisplay.from_estimator(
61 | clf,
62 | X,
63 | cmap=plt.cm.RdYlBu,
64 | response_method="predict",
65 | ax=ax,
66 | xlabel=iris.feature_names[pair[0]],
67 | ylabel=iris.feature_names[pair[1]],
68 | )
69 |
70 | # Plot the training points
71 | for i, color in zip(range(n_classes), plot_colors):
72 | idx = np.where(y == i)
73 | ax.scatter(
74 | X[idx, 0],
75 | X[idx, 1],
76 | c=color,
77 | label=iris.target_names[i],
78 | cmap=plt.cm.RdYlBu,
79 | edgecolor="black",
80 | s=15,
81 | )
82 |
83 | fig.suptitle(f"Decision surface of {clf_label} decision trees trained on pairs of features")
84 | plt.legend(loc="lower right", borderpad=0, handletextpad=0)
85 | _ = plt.axis("tight")
86 | plt.show()
87 |
88 | # %%
89 | # Display the structure of a single decision tree trained on all the features
90 | # together.
91 |
92 | for clf, clf_label in zip(clfs, clf_labels):
93 | plt.figure()
94 | clf.fit(iris.data, iris.target)
95 | plot_tree(clf, filled=True)
96 | plt.title(f"{clf_label} decision tree trained on all the iris features")
97 | plt.show()
98 |
--------------------------------------------------------------------------------
/.github/workflows/pr_checks.yml:
--------------------------------------------------------------------------------
1 | name: "PR Checks"
2 |
3 | concurrency:
4 | group: ${{ github.workflow }}-${{ github.ref }}
5 | cancel-in-progress: true
6 |
7 | on:
8 | pull_request:
9 | branches:
10 | - main
11 | paths:
12 | - "treeple/**"
13 |
14 | jobs:
15 | changelog:
16 | name: CHANGELOG
17 | runs-on: ubuntu-latest
18 | # if: github.event_name == 'pull_request'
19 | if: ${{ contains(github.event.pull_request.labels.*.name, 'No Changelog Needed') == 0 }}
20 | steps:
21 | - name: Get PR number and milestone
22 | run: |
23 | echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> $GITHUB_ENV
24 | echo "TAGGED_MILESTONE=${{ github.event.pull_request.milestone.title }}" >> $GITHUB_ENV
25 | - uses: actions/checkout@v4
26 | with:
27 | fetch-depth: "0"
28 | - name: Check that CHANGELOG has been updated
29 | run: |
30 | # If this step fails, this means you haven't updated the CHANGELOG.md
31 | # file with notes on your contribution.
32 | # git diff --name-only $(git merge-base origin/main HEAD) | grep '^CHANGELOG.md$' && echo "Thanks for helping keep our CHANGELOG up-to-date!"
33 | set -xe
34 | changed_files=$(git diff --name-only origin/main)
35 |           # The changelog check is only enforced when test files have been modified
36 | if [[ ! "$changed_files" =~ tests ]]
37 | then
38 | exit 0
39 | fi
40 | all_changelogs=$(cat ./doc/whats_new/v*.rst)
41 | if [[ "$all_changelogs" =~ :pr:\`#$PR_NUMBER\` ]]
42 | then
43 | echo "Changelog has been updated."
44 | # If the pull request is milestoned check the correspondent changelog
45 |           if [ -f ./doc/whats_new/v${TAGGED_MILESTONE:0:4}.rst ]
46 | then
47 | expected_changelog=$(cat ./doc/whats_new/v${TAGGED_MILESTONE:0:4}.rst)
48 | if [[ "$expected_changelog" =~ :pr:\`#$PR_NUMBER\` ]]
49 | then
50 | echo "Changelog and milestone correspond."
51 | else
52 | echo "Changelog and milestone do not correspond."
53 | echo "If you see this error make sure that the tagged milestone for the PR"
54 | echo "and the edited changelog filename properly match."
55 | exit 1
56 | fi
57 | fi
58 | else
59 | echo "A Changelog entry is missing for :pr:\`#$PR_NUMBER\`"
60 | echo ""
61 | echo "Please add an entry to the changelog at 'doc/whats_new/v*.rst'"
62 | echo "to document your change assuming that the PR will be merged"
63 | echo "in time for the next release of treeple."
64 | echo ""
65 | echo "Look at other entries in that file for inspiration and please"
66 | echo "reference this pull request using the ':pr:' directive and"
67 | echo "credit yourself (and other contributors if applicable) with"
68 | echo "the ':user:' directive."
69 | echo ""
70 | echo "If you see this error and there is already a changelog entry,"
71 | echo "check that the PR number is correct."
72 | echo ""
73 | echo "If you believe that this PR does not warrant a changelog"
74 | echo "entry, say so in a comment so that a maintainer will label"
75 | echo "the PR with 'No Changelog Needed' to bypass this check."
76 | exit 1
77 | fi
78 |
--------------------------------------------------------------------------------
/benchmarks_nonasv/bench_plot_urf.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | from time import time
3 |
4 | import numpy as np
5 | from numpy import random as nr
6 |
7 | from treeple import UnsupervisedObliqueRandomForest, UnsupervisedRandomForest
8 |
9 |
10 | def compute_bench(samples_range, features_range):
11 | it = 0
12 | results = defaultdict(lambda: [])
13 |
14 | est_params = {"min_samples_split": 5, "criterion": "fastbic", "n_jobs": None}
15 |
16 | max_it = len(samples_range) * len(features_range)
17 | for n_samples in samples_range:
18 | for n_features in features_range:
19 | it += 1
20 |
21 | print("==============================")
22 | print("Iteration %03d of %03d" % (it, max_it))
23 | print("==============================")
24 | print()
25 | print(f"n_samples: {n_samples} and n_features: {n_features}")
26 | data = nr.randint(-50, 51, (n_samples, n_features))
27 |
28 | print("Unsupervised RF")
29 | tstart = time()
30 | est = UnsupervisedRandomForest(**est_params).fit(data)
31 |
32 | delta = time() - tstart
33 | max_depth = max(tree.get_depth() for tree in est.estimators_)
34 | print("Speed: %0.3fs" % delta)
35 | print("Max depth: %d" % max_depth)
36 | print()
37 |
38 | results["unsup_rf_speed"].append(delta)
39 | results["unsup_rf_depth"].append(max_depth)
40 |
41 | print("Unsupervised Oblique RF")
43 | est = UnsupervisedObliqueRandomForest(**est_params)
44 | tstart = time()
45 | est.fit(data)
46 | delta = time() - tstart
47 | max_depth = max(tree.get_depth() for tree in est.estimators_)
48 | print("Speed: %0.3fs" % delta)
49 | print("Max depth: %d" % max_depth)
50 | print()
51 | print()
52 |
53 | results["unsup_obliquerf_speed"].append(delta)
54 | results["unsup_obliquerf_depth"].append(max_depth)
55 |
56 | return results
57 |
58 |
59 | if __name__ == "__main__":
60 | import matplotlib.pyplot as plt
61 | from mpl_toolkits.mplot3d import axes3d # noqa register the 3d projection
62 |
63 | samples_range = np.linspace(50, 150, 5).astype(int)
64 | features_range = np.linspace(150, 50000, 5).astype(int)
66 |
67 | results = compute_bench(samples_range, features_range)
68 |
69 |     max_time = max(max(t) for (label, t) in results.items() if "speed" in label)
70 |     max_depth_val = max(
71 |         max(t) for (label, t) in results.items() if "speed" not in label
72 |     )
73 |
74 |     fig = plt.figure("treeple unsupervised (oblique and axis-aligned) RF benchmark results")
75 | for c, (label, timings) in zip("brcy", sorted(results.items())):
76 | if "speed" in label:
77 | ax = fig.add_subplot(2, 1, 1, projection="3d")
78 | ax.set_zlim3d(0.0, max_time * 1.1)
79 | else:
80 | ax = fig.add_subplot(2, 1, 2, projection="3d")
81 |             ax.set_zlim3d(0.0, max_depth_val * 1.1)
82 |
83 | X, Y = np.meshgrid(samples_range, features_range)
84 | Z = np.asarray(timings).reshape(samples_range.shape[0], features_range.shape[0])
85 | ax.plot_surface(X, Y, Z.T, cstride=1, rstride=1, color=c, alpha=0.5)
86 | ax.set_title(f"{label}")
87 | ax.set_xlabel("n_samples")
88 | ax.set_ylabel("n_features")
89 |
90 | plt.show()
91 |
--------------------------------------------------------------------------------
/examples/treeple/treeple_tutorial_1_1b_MI.py:
--------------------------------------------------------------------------------
1 | """
2 | ==============
3 | Calculating MI
4 | ==============
5 | """
6 |
7 | import matplotlib.pyplot as plt
8 | import numpy as np
9 | import seaborn as sns
10 | from scipy.stats import entropy
11 |
12 | from treeple.datasets import make_trunk_classification
13 | from treeple.ensemble import HonestForestClassifier
14 | from treeple.stats import build_oob_forest
15 |
16 | sns.set(color_codes=True, style="white", context="talk", font_scale=1.5)
17 | PALETTE = sns.color_palette("Set1")
18 | sns.set_palette(PALETTE[1:5] + PALETTE[6:], n_colors=9)
19 | sns.set_style("white", {"axes.edgecolor": "#dddddd"})
20 | # %%
21 | # MI
22 | # --
23 | #
24 | # Mutual Information (*MI*) measures the mutual dependence between *X* and
25 | # *Y*. It can be calculated as the difference between the class entropy
26 | # (``H(Y)``) and the conditional entropy (``H(Y | X)``):
27 | #
28 | # .. math:: I(X; Y) = H(Y) - H(Y\mid X)
29 | #
30 | # With a binary class simulation as an example, this tutorial will show
31 | # how to use ``treeple`` to calculate the statistic.
32 |
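# %%
# As a quick sanity check with toy values: if the posterior equals the class
# prior for every sample, the conditional entropy matches the class entropy,
# so the MI is zero.

toy_proba = np.full((4, 2), 0.5)  # posterior equals the uniform prior
toy_y = np.array([0, 1, 0, 1])
H_Y_toy = entropy(np.unique(toy_y, return_counts=True)[1], base=np.exp(1))
H_YX_toy = np.mean(entropy(toy_proba, base=np.exp(1), axis=1))
print(H_Y_toy - H_YX_toy)  # 0.0
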
33 | # %%
34 | # Create a simulation with two gaussians
35 | # --------------------------------------
36 |
37 |
38 | # create a binary class simulation with two gaussians
39 | # 500 samples for each class, class zero is standard
40 | # gaussian, and class one has a mean at one
41 | X, y = make_trunk_classification(
42 | n_samples=1000,
43 | n_dim=1,
44 | mu_0=0,
45 | mu_1=1,
46 | n_informative=1,
47 | seed=1,
48 | )
49 |
50 |
51 | fig, ax = plt.subplots(figsize=(6, 6))
52 | fig.tight_layout()
53 | ax.tick_params(labelsize=15)
54 |
55 | # histogram plot the samples
56 | ax.hist(X[:500], bins=50, alpha=0.6, color=PALETTE[1], label="negative")
57 | ax.hist(X[500:], bins=50, alpha=0.3, color=PALETTE[0], label="positive")
58 | ax.set_xlabel("Variable One", fontsize=15)
59 | ax.set_ylabel("Likelihood", fontsize=15)
60 | plt.legend(frameon=False, fontsize=15)
61 | plt.show()
62 |
63 |
64 | # %%
65 | # Fit the model
66 | # -------------
67 |
68 |
69 | # initialize the forest with 100 trees
70 | est = HonestForestClassifier(
71 | n_estimators=100,
72 | max_samples=1.6,
73 | max_features=0.3,
74 | bootstrap=True,
75 | stratify=True,
76 | random_state=1,
77 | )
78 |
79 | # fit the model and obtain the tree posteriors
80 | _, observe_proba = build_oob_forest(est, X, y)
81 |
82 | # generate forest posteriors for the two classes
83 | observe_proba = np.nanmean(observe_proba, axis=0)
84 |
85 |
86 | fig, ax = plt.subplots(figsize=(6, 6))
87 | fig.tight_layout()
88 | ax.tick_params(labelsize=15)
89 |
90 | # histogram plot the posterior probabilities for class one
91 | ax.hist(observe_proba[:500][:, 1], bins=50, alpha=0.6, color=PALETTE[1], label="negative")
92 | ax.hist(observe_proba[500:][:, 1], bins=50, alpha=0.3, color=PALETTE[0], label="positive")
93 | ax.set_ylabel("# of Samples", fontsize=15)
94 | ax.set_xlabel("Class One Posterior", fontsize=15)
95 | plt.legend(frameon=False, fontsize=15)
96 | plt.show()
97 |
98 |
99 | # %%
100 | # Calculate the statistic
101 | # -----------------------
102 | def Calculate_MI(y_true, y_pred_proba):
103 | # calculate the conditional entropy
104 | H_YX = np.mean(entropy(y_pred_proba, base=np.exp(1), axis=1))
105 |
106 | # empirical count of each class (n_classes)
107 | _, counts = np.unique(y_true, return_counts=True)
108 | # calculate the entropy of labels
109 | H_Y = entropy(counts, base=np.exp(1))
110 | return H_Y - H_YX
111 |
112 |
113 | mi = Calculate_MI(y, observe_proba)
114 | print("MI =", round(mi, 2))
115 |
--------------------------------------------------------------------------------
/treeple/_lib/meson.build:
--------------------------------------------------------------------------------
1 | fs = import('fs')
2 | if not fs.exists('sklearn')
3 | error('Missing the `sklearn` fork submodule! Run `git submodule update --init` to fix this.')
4 | endif
5 |
6 | # install tree/ submodule
7 | tree_extension_metadata = {
8 | '_tree':
9 | {'sources': ['./sklearn/tree/' + '_tree.pyx'],
10 | 'override_options': ['cython_language=cpp', 'optimization=3']},
11 | '_partitioner':
12 | {'sources': ['./sklearn/tree/' + '_partitioner.pyx'],
13 | 'override_options': ['cython_language=cpp', 'optimization=3']},
14 | '_splitter':
15 | {'sources': ['./sklearn/tree/' + '_splitter.pyx'],
16 | 'override_options': ['cython_language=cpp', 'optimization=3']},
17 | '_criterion':
18 | {'sources': ['./sklearn/tree/' + '_criterion.pyx'],
19 | 'override_options': ['cython_language=cpp', 'optimization=3']},
20 | '_utils':
21 | {'sources': ['./sklearn/tree/' + '_utils.pyx'],
22 | 'override_options': ['cython_language=cpp', 'optimization=3']},
23 | }
24 |
25 |
26 | foreach ext_name, ext_dict : tree_extension_metadata
27 | py.extension_module(
28 | ext_name,
29 | ext_dict.get('sources'),
30 | dependencies: [np_dep],
31 | override_options : ext_dict.get('override_options', []),
32 | cython_args: cython_c_args,
33 | subdir: 'treeple/_lib/sklearn/tree/',
34 | install: true
35 | )
36 | endforeach
37 |
38 | python_sources = [
39 | './sklearn/tree/__init__.py',
40 | './sklearn/tree/_classes.py',
41 | './sklearn/tree/_export.py',
42 | './sklearn/tree/_reingold_tilford.py',
43 | ]
44 |
45 | py.install_sources(
46 | python_sources,
47 | subdir: 'treeple/_lib/sklearn/tree' # Folder relative to site-packages to install to
48 | )
49 |
50 | # install ensemble/ submodule
51 | python_sources = [
52 | '_forest.py',
53 | ]
54 | foreach py_source: python_sources
55 | py.install_sources(
56 | './sklearn/ensemble/' + py_source,
57 | subdir: 'treeple/_lib/sklearn/ensemble'
58 | )
59 | endforeach
60 |
61 | # TODO: Can remove if included in scikit-learn eventually
62 | # install tree/ submodule
63 | extensions = [
64 | '_quad_tree',
65 | ]
66 |
67 | foreach ext: extensions
68 | py.extension_module(
69 | ext,
70 | ['./sklearn/neighbors/' + ext + '.pyx'],
71 | c_args: c_args,
72 | dependencies: [np_dep],
73 | cython_args: cython_c_args,
74 | override_options : ['optimization=3', 'cython_language=cpp'],
75 | install: true,
76 | subdir: 'treeple/_lib/sklearn/neighbors/',
77 | )
78 | endforeach
79 |
80 | # install tree/ submodule
81 | extensions = [
82 | '_typedefs',
83 | '_random',
84 | ]
85 |
86 | foreach ext: extensions
87 | py.extension_module(ext,
88 | ['./sklearn/utils/' + ext + '.pyx'],
89 | c_args: c_args,
90 | dependencies: [np_dep],
91 | cython_args: cython_c_args,
92 | override_options : ['optimization=3', 'cython_language=cpp'],
93 | install: true,
94 | subdir: 'treeple/_lib/sklearn/utils/',
95 | )
96 | endforeach
97 |
98 |
99 | # python_sources = [
100 | # '__init__.py',
101 | # ]
102 |
103 | # py.install_sources(
104 | # python_sources,
105 | # subdir: 'treeple/_lib' # Folder relative to site-packages to install to
106 | # )
107 |
108 | # tempita = files('./sklearn/_build_utils/tempita.py')
109 |
110 | # # Copy all the .py files to the install dir, rather than using
111 | # # py.install_sources and needing to list them explicitely one by one
112 | # # install_subdir('sklearn', install_dir: py.get_install_dir())
113 | # install_subdir('sklearn', install_dir: join_paths(py.get_install_dir(), 'treeple/_lib'))
114 |
115 | # subdir('sklearn')
116 |
--------------------------------------------------------------------------------
/treeple/tree/unsupervised/_unsup_oblique_splitter.pxd:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from libcpp.vector cimport vector
4 |
5 | from ..._lib.sklearn.tree._splitter cimport SplitRecord
6 | from ..._lib.sklearn.tree._tree cimport ParentInfo
7 | from ..._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t, uint32_t
8 | from .._oblique_splitter cimport ObliqueSplitRecord
9 | from ._unsup_splitter cimport UnsupervisedSplitter
10 |
11 | # cdef struct ObliqueSplitRecord:
12 | # # Data to track sample split
13 | # intp_t feature # Which feature to split on.
14 | # intp_t pos # Split samples array at the given position,
15 | # # # i.e. count of samples below threshold for feature.
16 | # # # pos is >= end if the node is a leaf.
17 | # float64_t threshold # Threshold to split at.
18 | # float64_t improvement # Impurity improvement given parent node.
19 | # float64_t impurity_left # Impurity of the left split.
20 | # float64_t impurity_right # Impurity of the right split.
21 | # intp_t n_constant_features # Number of constant features in the split.
22 |
23 | # vector[float32_t]* proj_vec_weights # weights of the vector (max_features,)
24 | # vector[intp_t]* proj_vec_indices # indices of the features (max_features,)
25 |
26 |
27 | cdef class UnsupervisedObliqueSplitter(UnsupervisedSplitter):
28 | """
29 | Notable changes wrt scikit-learn:
30 | 1. `weighted_n_node_samples` is used as a stopping criterion and just used to
31 | keep count of the "number of samples (weighted)". All samples have a default weight
32 | of '1'.
33 | 2. `X` array instead of `y` array is stored as the criterions are computed over the X
34 | array.
35 | """
36 |
37 | # Oblique Splitting extra parameters
38 | cdef public float64_t feature_combinations # Number of features to combine
39 | cdef intp_t n_non_zeros # Number of non-zero features
40 | cdef vector[vector[float32_t]] proj_mat_weights # nonzero weights of sparse proj_mat matrix
41 | cdef vector[vector[intp_t]] proj_mat_indices # nonzero indices of sparse proj_mat matrix
42 | cdef intp_t[::1] indices_to_sample # an array of indices to sample of size mtry X n_features
43 |
44 | # All oblique splitters (i.e. non-axis aligned splitters) require a
45 | # function to sample a projection matrix that is applied to the feature matrix
46 | # to quickly obtain the sampled projections for candidate splits.
47 | cdef void sample_proj_mat(self,
48 | vector[vector[float32_t]]& proj_mat_weights,
49 | vector[vector[intp_t]]& proj_mat_indices) noexcept nogil
50 |
51 | # Redefined here since the new logic requires calling sample_proj_mat
52 | cdef int node_reset(self, intp_t start, intp_t end,
53 | float64_t* weighted_n_node_samples) except -1 nogil
54 |
55 | cdef int node_split(
56 | self,
57 | ParentInfo* parent,
58 | SplitRecord* split,
59 | ) except -1 nogil
60 | cdef int init(
61 | self,
62 | const float32_t[:, :] X,
63 | const float64_t[:] sample_weight
64 | ) except -1
65 | cdef intp_t pointer_size(self) noexcept nogil
66 |
67 | cdef void compute_features_over_samples(
68 | self,
69 | intp_t start,
70 | intp_t end,
71 | const intp_t[:] samples,
72 | float32_t[:] feature_values,
73 | vector[float32_t]* proj_vec_weights, # weights of the vector (max_features,)
74 | vector[intp_t]* proj_vec_indices # indices of the features (max_features,)
75 | ) noexcept nogil
76 |
--------------------------------------------------------------------------------
/examples/sparse_oblique_trees/plot_oblique_axis_aligned_forests_sparse_parity.py:
--------------------------------------------------------------------------------
1 | """
2 | ==========================================================================================
3 | Plot oblique forest and axis-aligned random forest predictions on sparse parity simulation
4 | ==========================================================================================
5 | A performance comparison between oblique forest and standard axis-
6 | aligned random forest using sparse parity simulation dataset.
7 | Sparse parity is a variation of the noisy parity problem,
8 | which itself is a multivariate generalization of the noisy XOR problem.
9 | This is a binary classification task in high dimensions. The simulation
10 | generates `n_samples` sample points distributed uniformly between -1 and
11 | +1 with `p` features, of which only the first `p*` carry information
12 | about the class. The binary label is defined as 1 if an odd number of the
13 | first `p*` features of `X` are greater than 0, and 0 otherwise. The
14 | simulation is further detailed in this `publication
15 | <https://epubs.siam.org/doi/epdf/10.1137/1.9781611974973.56>`_.
17 | """
18 |
19 | from datetime import datetime
20 |
21 | import matplotlib.pyplot as plt
22 | import numpy as np
23 | import pandas as pd
24 | import seaborn as sns
25 | from sklearn.ensemble import RandomForestClassifier
26 | from sklearn.model_selection import RepeatedKFold, cross_validate
27 |
28 | from treeple import ObliqueRandomForestClassifier
29 |
30 | random_state = 123456
31 | t0 = datetime.now()
32 |
33 |
34 | def sparse_parity(n_samples, p=20, p_star=3, random_seed=None, **kwargs):
35 |     if random_seed is not None:
36 | np.random.seed(random_seed)
37 |
38 | X = np.random.uniform(-1, 1, (n_samples, p))
39 | y = np.zeros(n_samples)
40 |
41 | for i in range(0, n_samples):
42 | y[i] = sum(X[i, :p_star] > 0) % 2
43 |
44 | return X, y
45 |
46 |
47 | def get_scores(X, y, n_cv=5, n_repeats=1, random_state=1, kwargs=None):
48 | clfs = [
49 | RandomForestClassifier(**kwargs[0], random_state=random_state),
50 | ObliqueRandomForestClassifier(**kwargs[1], random_state=random_state),
51 | ]
52 |
53 | tmp = []
54 |
55 | for i, clf in enumerate(clfs):
56 | cv = RepeatedKFold(n_splits=n_cv, n_repeats=n_repeats, random_state=random_state)
57 | test_score = cross_validate(estimator=clf, X=X, y=y, cv=cv, scoring="accuracy")
58 |
59 | tmp.append([["RF", "OF"][i], test_score["test_score"], test_score["test_score"].mean()])
60 |
61 | df = pd.DataFrame(tmp, columns=["model", "score", "mean"])
62 | df = df.explode("score")
63 | df["score"] = df["score"].astype(float)
64 | df.reset_index(inplace=True, drop=True)
65 |
66 | return df
67 |
68 |
69 | # Grid searched hyper-parameters
70 | params = [
71 | {"max_features": None, "n_estimators": 100, "max_depth": None},
72 | {"max_features": 40, "n_estimators": 100, "max_depth": 20},
73 | ]
74 |
75 | X, y = sparse_parity(n_samples=1000, random_seed=random_state)
76 |
77 | df = get_scores(X=X, y=y, n_cv=3, n_repeats=1, random_state=random_state, kwargs=params)
78 | t_d = (datetime.now() - t0).seconds
79 | print(f"It took {t_d} seconds to run the script")
80 |
81 | # Draw a comparison plot
82 | fig, ax = plt.subplots(1, 1, figsize=(6, 6))
83 |
84 | sns.stripplot(data=df, x="model", y="score", ax=ax, dodge=True)
85 | sns.boxplot(data=df, x="model", y="score", ax=ax, color="white")
86 | ax.set_title("Sparse Parity")
87 |
88 | rf = df.query('model=="RF"')["mean"].iloc[0]
89 | rff = f"RF (Mean Test Score: {round(rf,3)})"
90 |
91 | of = df.query('model=="OF"')["mean"].iloc[0]
92 | off = f"OF (Mean Test Score: {round(of,3)})"
93 |
94 | ax.legend([rff, off], loc=4)
95 |
96 | plt.savefig(f"plot_sim_{t_d}s.jpg")
97 | plt.show()
98 |
--------------------------------------------------------------------------------
/treeple/__init__.py:
--------------------------------------------------------------------------------
1 | """Scikit manifold oblique random forests."""
2 |
3 | import logging
4 | import os
5 | import sys
6 |
7 | __version__ = "0.10.3"
8 | logger = logging.getLogger(__name__)
9 |
10 |
11 | # On OSX, we can get a runtime error due to multiple OpenMP libraries loaded
12 | # simultaneously. This can happen for instance when calling BLAS inside a
13 | # prange. Setting the following environment variable allows multiple OpenMP
14 | libraries to be loaded. It should not degrade performance since we manually
15 | take care of potential over-subscription performance issues, in sections of
16 | # the code where nested OpenMP loops can happen, by dynamically reconfiguring
17 | # the inner OpenMP runtime to temporarily disable it while under the scope of
18 | # the outer OpenMP parallel section.
19 | os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "True")
20 |
21 | # Workaround issue discovered in intel-openmp 2019.5:
22 | # https://github.com/ContinuumIO/anaconda-issues/issues/11294
23 | os.environ.setdefault("KMP_INIT_AT_FORK", "FALSE")
24 |
25 |
26 | try:
27 | # This variable is injected in the __builtins__ by the build
28 | # process. It is used to enable importing subpackages of sklearn when
29 | # the binaries are not built
30 | __treeple_SETUP__ # type: ignore
31 | except NameError:
32 | __treeple_SETUP__ = False
33 |
34 | if __treeple_SETUP__:
35 | sys.stderr.write("Running from treeple source directory.\n")
36 | sys.stderr.write("Partial import of treeple during the build process.\n")
37 | # We are not importing the rest of treeple during the build
38 | # process, as it may not be compiled yet
39 | else:
40 | try:
41 | from . import _lib, tree, ensemble, experimental, stats
42 | from ._lib.sklearn.ensemble._forest import (
43 | RandomForestClassifier,
44 | RandomForestRegressor,
45 | ExtraTreesClassifier,
46 | ExtraTreesRegressor,
47 | )
48 | from .neighbors import NearestNeighborsMetaEstimator
49 | from .ensemble import ExtendedIsolationForest, MultiViewRandomForestClassifier
50 | from .ensemble._unsupervised_forest import (
51 | UnsupervisedRandomForest,
52 | UnsupervisedObliqueRandomForest,
53 | )
54 | from .ensemble._supervised_forest import (
55 | ExtraObliqueRandomForestClassifier,
56 | ExtraObliqueRandomForestRegressor,
57 | ObliqueRandomForestClassifier,
58 | ObliqueRandomForestRegressor,
59 | PatchObliqueRandomForestClassifier,
60 | PatchObliqueRandomForestRegressor,
61 | )
62 | from .ensemble._honest_forest import HonestForestClassifier
63 |     except ImportError as e:
64 |         msg = """Error importing treeple: you cannot import treeple while
65 |         being in treeple source directory; please exit the treeple source
66 |         tree first and relaunch your Python interpreter."""
67 |         raise ImportError(msg) from e
71 | __all__ = [
72 | "_lib",
73 | "tree",
74 | "experimental",
75 | "ensemble",
76 | "stats",
77 | "ExtraObliqueRandomForestClassifier",
78 | "ExtraObliqueRandomForestRegressor",
79 | "NearestNeighborsMetaEstimator",
80 | "ObliqueRandomForestClassifier",
81 | "ObliqueRandomForestRegressor",
82 | "PatchObliqueRandomForestClassifier",
83 | "PatchObliqueRandomForestRegressor",
84 | "UnsupervisedRandomForest",
85 | "UnsupervisedObliqueRandomForest",
86 | "HonestForestClassifier",
87 | "RandomForestClassifier",
88 | "RandomForestRegressor",
89 | "ExtraTreesClassifier",
90 | "ExtraTreesRegressor",
91 | "ExtendedIsolationForest",
92 | "MultiViewRandomForestClassifier",
93 | ]
94 |
--------------------------------------------------------------------------------
/examples/quantile_predictions/plot_quantile_vs_standard_oblique_forest.py:
--------------------------------------------------------------------------------
1 | """
2 | ==============================================================
3 | Quantile regression with oblique regression forest
4 | ==============================================================
5 |
6 | An example to generate quantile predictions using an oblique random forest
7 | instance on a synthetic, right-skewed dataset.
8 |
9 | This example was heavily inspired by the `quantile-forest `_
10 | package. See their package `here `_.
11 | """
12 |
13 | from collections import defaultdict
14 |
15 | import matplotlib.pyplot as plt
16 | import numpy as np
17 | import scipy as sp
18 | from sklearn.model_selection import train_test_split
19 | from sklearn.utils.validation import check_random_state
20 |
21 | from treeple.ensemble import ObliqueRandomForestRegressor
22 |
23 | rng = check_random_state(0)
24 |
25 | # %%
26 | # Generate the data
27 | # -----------------
28 | # We use a synthetic dataset with 2 features and 5000 samples. The target is
29 | # generated from a skewed normal distribution. (The mean of the distribution
30 | # is to the right of the median.)
31 |
32 | n_samples = 5000
33 | a, loc, scale = 5, -1, 1
34 | skewnorm_rv = sp.stats.skewnorm(a, loc, scale)
35 | skewnorm_rv.random_state = rng
36 | y = skewnorm_rv.rvs(n_samples)
37 | X = rng.randn(n_samples, 2) * y.reshape(-1, 1)
38 |
39 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
40 |
41 | regr_orf = ObliqueRandomForestRegressor(n_estimators=10, random_state=0)
42 |
43 | regr_orf.fit(X_train, y_train)
44 |
45 | y_pred_orf = regr_orf.predict(X_test)
46 | # %%
47 | # Generate Quantile Predictions
48 | # -----------------------------
49 | # The idea is for each prediction, the training samples that fell into the same leaf nodes
50 | # are collected then used to generate the quantile statistics for the desired prediction.
51 |
52 | # Get the leaf-nodes the training samples fall into
53 | leaf_ids = regr_orf.apply(X_train)
54 | # create a list of dictionary that maps node to samples that fell into it
55 | # for each tree
56 | node_to_indices = []
57 | for tree in range(leaf_ids.shape[1]):
58 | d = defaultdict(list)
59 | for id, leaf in enumerate(leaf_ids[:, tree]):
60 | d[leaf].append(id)
61 | node_to_indices.append(d)
62 | # pass X_test through the trained trees and
63 | # get the leaf node that each test sample falls into
64 | leaf_ids_test = regr_orf.apply(X_test)
65 | # for each test sample, collect the indices of the training samples that
66 | # fell into the same leaf node in each tree
67 | y_pred_quantile = []
68 | for sample in range(leaf_ids_test.shape[0]):
69 | li = [
70 | node_to_indices[tree][leaf_ids_test[sample][tree]] for tree in range(leaf_ids_test.shape[1])
71 | ]
72 | # merge the list of indices into one
73 | idx = [item for sublist in li for item in sublist]
74 | # get the y_train for each corresponding id
75 | y_pred_quantile.append(y_train[idx])
76 | # get the quantile predictions for each predicted sample
77 | y_pred_quantile = [np.quantile(y_pred_quantile[i], 0.5) for i in range(len(y_pred_quantile))]
78 |
79 | # %%
80 | # Plot the results
81 | # ----------------
82 | # The plot shows the distribution of the actual target values and the predicted median
83 | # (i.e. 0.5 quantile), and the mean prediction by the regular random forest regressor.
84 | # In this skewed dataset, the median prediction using the quantile method works better at
85 | # predicting the off-centered target distribution than the regular mean prediction.
86 |
87 | colors = ["#c0c0c0", "#a6e5ff", "#e7a4f5"]
88 | names = ["Actual", "QRF (Median)", "ORF (Mean)"]
89 | plt.hist([y_test, y_pred_quantile, y_pred_orf], bins=50, color=colors, label=names)
90 | plt.xlabel("Actual and Predicted Target Values")
91 | plt.ylabel("Counts")
92 | plt.legend()
93 | plt.show()
94 |
--------------------------------------------------------------------------------
/examples/quantile_predictions/plot_quantile_toy_example_with_RF.py:
--------------------------------------------------------------------------------
1 | """
2 | ======================================================
3 | Quantile prediction with Random Forest Regressor class
4 | ======================================================
5 |
6 | An example that demonstrates how to use the Random Forest to generate
7 | quantile predictions such as conditional median and prediction intervals.
8 | The example compares the predictions to a ground truth function used
9 | to generate noisy samples.
10 |
11 | This example was heavily inspired by the `quantile-forest `_
12 | package. See their package `here `_.
13 | """
14 |
15 | from collections import defaultdict
16 |
17 | import matplotlib.pyplot as plt
18 | import numpy as np
19 | from sklearn.ensemble import RandomForestRegressor
20 | from sklearn.model_selection import train_test_split
21 |
22 | # %%
23 | # Generate the data
24 |
25 |
26 | def make_toy_dataset(n_samples, seed=0):
27 | rng = np.random.RandomState(seed)
28 |
29 | x = rng.uniform(0, 10, size=n_samples)
30 | f = x * np.sin(x)
31 |
32 | sigma = 0.25 + x / 10
33 | noise = rng.lognormal(sigma=sigma) - np.exp(sigma**2 / 2)
34 | y = f + noise
35 |
36 | return np.atleast_2d(x).T, y
37 |
38 |
39 | n_samples = 1000
40 | X, y = make_toy_dataset(n_samples)
41 |
42 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
43 |
44 | xx = np.atleast_2d(np.linspace(0, 10, n_samples)).T
45 |
46 |
47 | # %%
48 | # Fit the model to the training samples
49 | # -------------------------------------
50 |
51 | rf = RandomForestRegressor(max_depth=3, random_state=0)
52 | rf.fit(X_train, y_train)
53 |
54 | y_pred = rf.predict(xx)
55 |
56 | # get the leaf nodes that each sample fell into
57 | leaf_ids = rf.apply(X_train)
58 | # create one dictionary per tree that maps each leaf node to the
59 | # training samples that fell into it
60 | node_to_indices = []
61 | for tree in range(leaf_ids.shape[1]):
62 | d = defaultdict(list)
63 |     for sample_idx, leaf in enumerate(leaf_ids[:, tree]):
64 |         d[leaf].append(sample_idx)
65 | node_to_indices.append(d)
66 | # push the grid points ``xx`` down the trained trees and
67 | # get the ids of the leaf nodes each point falls into
68 | leaf_ids_test = rf.apply(xx)
69 | # for each grid point, collect the indices of the training samples that
70 | # fell into the same leaf node in each tree
71 | y_pred_quantile = []
72 | for sample in range(leaf_ids_test.shape[0]):
73 | li = [
74 | node_to_indices[tree][leaf_ids_test[sample][tree]] for tree in range(leaf_ids_test.shape[1])
75 | ]
76 | # merge the list of indices into one
77 | idx = [item for sublist in li for item in sublist]
78 | # get the y_train for each corresponding id
79 |     y_pred_quantile.append(y_train[idx])
80 | # compute the quantile predictions for each grid point
81 | y_pred_low = [np.quantile(samples, 0.025) for samples in y_pred_quantile]
82 | y_pred_med = [np.quantile(samples, 0.5) for samples in y_pred_quantile]
83 | y_pred_upp = [np.quantile(samples, 0.975) for samples in y_pred_quantile]
84 |
85 | # %%
86 | # Plot the results
87 | # ----------------
88 | # Plot the conditional median and prediction intervals.
89 | # The blue line is the predicted median and the shaded area indicates the 95% prediction
90 | # interval. The dots are the held-out test observations and the black line indicates the
91 | # function used to generate the samples.
92 |
93 | plt.plot(X_test, y_test, ".", c="#f2a619", label="Test Observations", ms=5)
94 | plt.plot(xx, (xx * np.sin(xx)), c="black", label=r"$f(x) = x\,\sin(x)$", lw=2)
95 | plt.plot(xx, y_pred_med, c="#006aff", label="Predicted Median", lw=3, ms=5)
96 | plt.fill_between(
97 | xx.ravel(),
98 | y_pred_low,
99 | y_pred_upp,
100 | color="#e0f2ff",
101 | label="Predicted 95% Interval",
102 | )
103 | plt.xlabel("$x$")
104 | plt.ylabel("$f(x)$")
105 | plt.legend(loc="upper left")
106 | plt.show()
107 |
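108 | # %%
109 | # As a minimal follow-up sketch, the empirical coverage of the 95% interval can
110 | # be checked by repeating the leaf-collection procedure on the held-out test
111 | # points (``X_test``, ``y_test``):
112 | 
113 | leaf_ids_val = rf.apply(X_test)
114 | covered = 0
115 | for sample in range(leaf_ids_val.shape[0]):
116 |     idx = [
117 |         i
118 |         for tree in range(leaf_ids_val.shape[1])
119 |         for i in node_to_indices[tree][leaf_ids_val[sample][tree]]
120 |     ]
121 |     lo, hi = np.quantile(y_train[idx], [0.025, 0.975])
122 |     covered += lo <= y_test[sample] <= hi
123 | print(f"Empirical 95% interval coverage: {covered / len(y_test):.3f}")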
--------------------------------------------------------------------------------
/doc/modules/ensemble.rst:
--------------------------------------------------------------------------------
1 | .. _oblique_forests:
2 |
3 | Oblique Random Forests
4 | ----------------------
5 |
6 | In oblique random forests (see the :class:`~treeple.ObliqueRandomForestClassifier` and
7 | :class:`~treeple.ObliqueRandomForestRegressor` classes), each tree in the ensemble is built
8 | from a sample drawn with replacement (i.e., a bootstrap sample) from the
9 | training set. An oblique random forest is otherwise the same as a random forest,
10 | except in how the candidate splits are computed in each tree.
11 |
12 | Similar to how random forests achieve reduced variance by combining diverse trees,
13 | sometimes at the cost of a slight increase in bias, oblique random forests aim for the
14 | same effect: they construct even more diverse trees, thereby improving model
15 | generalization. In practice the variance reduction is often significant, yielding an overall better model.
16 |
17 | In contrast to the original publication :footcite:`breiman2001random`, the treeple
18 | implementation allows the user to control the number of features to combine in computing
19 | candidate splits. This is done via the ``feature_combinations`` parameter; a short
20 | usage sketch is given at the end of this page. For more information and intuition, see
21 | :ref:`documentation on oblique decision trees `.
22 |
23 | .. topic:: Examples:
24 |
25 | * :ref:`sphx_glr_auto_examples_sparse_oblique_trees_plot_oblique_random_forest.py`
26 | * :ref:`sphx_glr_auto_examples_sparse_oblique_trees_plot_oblique_axis_aligned_forests_sparse_parity.py`
27 |
28 | .. topic:: References
29 |
30 | .. footbibliography::
31 |
32 | .. _oblique_forest_feature_importance:
33 |
34 | Feature importance evaluation
35 | -----------------------------
36 |
37 | The relative rank (i.e. depth) of a feature used as a decision node in a
38 | tree can be used to assess the relative importance of that feature with
39 | respect to the predictability of the target variable. Features used at
40 | the top of the tree contribute to the final prediction decision of a
41 | larger fraction of the input samples. The **expected fraction of the
42 | samples** they contribute to can thus be used as an estimate of the
43 | **relative importance of the features**. In treeple, the fraction of
44 | samples a feature contributes to is combined with the decrease in impurity
45 | from splitting on it to create a normalized estimate of the predictive power
46 | of that feature. This is essentially the same as how it is done in scikit-learn.
47 |
48 | By **averaging** the estimates of predictive ability over several randomized
49 | trees one can **reduce the variance** of such an estimate and use it
50 | for feature selection. This is known as the mean decrease in impurity, or MDI.
51 | Refer to [L2014]_ for more information on MDI and feature importance
52 | evaluation with Random Forests. We implement the approach taken in :footcite:`Li2023manifold`
53 | and :footcite:`TomitaSPORF2020`.
54 |
55 | .. warning::
56 |
57 | The impurity-based feature importances computed on tree-based models suffer
58 | from two flaws that can lead to misleading conclusions. First, they are
59 | computed on statistics derived from the training dataset and therefore **do
60 | not necessarily inform us on which features are most important to make good
61 | predictions on a held-out dataset**. Secondly, **they favor high cardinality
62 | features**, that is features with many unique values.
63 | :ref:`sklearn:permutation_importance` is an alternative to impurity-based feature
64 | importance that does not suffer from these flaws. These two methods of
65 | obtaining feature importance are explored in:
66 | :ref:`sklearn:sphx_glr_auto_examples_inspection_plot_permutation_importance.py`.
67 |
68 | In practice those estimates are stored as an attribute named
69 | ``feature_importances_`` on the fitted model. This is an array with shape
70 | ``(n_features,)`` whose values are positive and sum to 1.0. The higher
71 | the value, the more important the contribution of the matching feature is
72 | to the prediction function.
73 |
74 | .. topic:: References
75 |
76 | .. footbibliography::
77 |
78 | .. [L2014] Louppe, G. :arxiv:`"Understanding Random Forests: From Theory to
79 | Practice" <1407.7502>`,
80 | PhD Thesis, U. of Liege, 2014.
81 |
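82 | As a minimal usage sketch (the values below are illustrative, not tuned
83 | recommendations), the ``feature_combinations`` parameter discussed above and the
84 | resulting ``feature_importances_`` attribute can be exercised as follows::
85 | 
86 |     from sklearn.datasets import make_classification
87 | 
88 |     from treeple import ObliqueRandomForestClassifier
89 | 
90 |     X, y = make_classification(n_samples=200, n_features=10, random_state=0)
91 |     clf = ObliqueRandomForestClassifier(
92 |         n_estimators=100,
93 |         feature_combinations=2.0,  # average number of features combined per split
94 |         random_state=0,
95 |     ).fit(X, y)
96 |     print(clf.feature_importances_)  # shape (n_features,), positive, sums to 1.0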
--------------------------------------------------------------------------------
/treeple/tree/manifold/_morf_splitter.pxd:
--------------------------------------------------------------------------------
1 | # distutils: language = c++
2 |
3 | # Authors: Adam Li
4 | # Chester Huynh
5 | # Parth Vora
6 | #
7 | # License: BSD 3 clause
8 |
9 | # See _oblique_splitter.pyx for details.
10 |
11 | import numpy as np
12 |
13 | from libcpp.vector cimport vector
14 |
15 | from ..._lib.sklearn.tree._splitter cimport SplitRecord
16 | from ..._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int8_t, intp_t, uint8_t, uint32_t
17 | from .._oblique_splitter cimport BestObliqueSplitter, ObliqueSplitRecord
18 |
19 | # https://github.com/cython/cython/blob/master/Cython/Includes/libcpp/algorithm.pxd
20 | # shows how to include standard library functions in Cython
21 | # This includes the discrete_distribution C++ class, which can be used
22 | # to generate samples from a discrete distribution with non-uniform probabilities.
23 | # cdef extern from "" namespace "std" nogil:
24 | # cdef cppclass discrete_distribution[T]
25 | # ctypedef T int_type
26 | # ctypedef G generator_type
27 | # discrete_distribution(T first, T last) except +
28 | # operator()(&G) except +
29 |
30 | cdef class PatchSplitter(BestObliqueSplitter):
31 | # The PatchSplitter creates candidate feature values by sampling 2D patches from
32 | # an input data vector. The input data is vectorized, so `data_height` and
33 | # `data_width` are used to determine the vectorized indices corresponding to
34 | # (x,y) coordinates in the original un-vectorized data.
35 | cdef public intp_t ndim # The number of dimensions of the input data
36 |
37 | cdef const intp_t[:] data_dims # The dimensions of the input data
38 | cdef const intp_t[:] min_patch_dims # The minimum size of the patch to sample in each dimension
39 | cdef const intp_t[:] max_patch_dims # The maximum size of the patch to sample in each dimension
40 | cdef const uint8_t[:] dim_contiguous # A boolean array indicating whether each dimension is contiguous
41 |
42 | # TODO: check if this works and is necessary for discontiguous data
43 | # cdef intp_t[:] stride_offsets # The stride offsets for each dimension
44 | cdef bint _discontiguous
45 |
46 | cdef bytes boundary # how to sample the patch with boundary in mind
47 | cdef const float32_t[:, :] feature_weight # Whether or not to normalize each column of X when adding in a patch
48 |
49 | cdef intp_t[::1] _index_data_buffer
50 | cdef intp_t[::1] _index_patch_buffer
51 | cdef intp_t[:] patch_sampled_size # A buffer to store the dimensions of the sampled patch
52 | cdef intp_t[:] unraveled_patch_point # A buffer to store the unraveled patch point
53 |
54 | # All oblique splitters (i.e. non-axis aligned splitters) require a
55 | # function to sample a projection matrix that is applied to the feature matrix
56 | # to quickly obtain the sampled projections for candidate splits.
57 | cdef (intp_t, intp_t) sample_top_left_seed(
58 | self
59 | ) noexcept nogil
60 |
61 | cdef void sample_proj_mat(
62 | self,
63 | vector[vector[float32_t]]& proj_mat_weights,
64 | vector[vector[intp_t]]& proj_mat_indices
65 | ) noexcept nogil
66 |
67 |
68 | # cdef class UserKernelSplitter(PatchSplitter):
69 | # """A class to hold user-specified kernels."""
70 | # cdef vector[float32_t[:, ::1]] kernel_dictionary # A list of C-contiguous 2D kernels
71 |
72 |
73 | cdef class GaussianKernelSplitter(PatchSplitter):
74 | """A class to hold Gaussian kernels.
75 |
76 | Overrides the weights that are generated to be sampled from a Gaussian distribution.
77 | See: https://www.tutorialspoint.com/gaussian-filter-generation-in-cplusplus
78 | See: https://gist.github.com/thomasaarholt/267ec4fff40ca9dff1106490ea3b7567
79 | """
80 |
81 | cdef void sample_proj_mat(
82 | self,
83 | vector[vector[float32_t]]& proj_mat_weights,
84 | vector[vector[intp_t]]& proj_mat_indices
85 | ) noexcept nogil
86 |
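87 | # Illustration of the vectorized indexing described above (a sketch, not used by
88 | # the splitter itself): for 2D data of shape ``(data_height, data_width)``, the
89 | # (x, y) coordinate maps to vectorized index ``x * data_width + y``, i.e. what
90 | # ``np.ravel_multi_index((x, y), (data_height, data_width))`` computes.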
--------------------------------------------------------------------------------
/treeple/stats/tests/test_baseline.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pytest
3 | from numpy.testing import assert_array_equal
4 |
5 | from treeple import HonestForestClassifier
6 | from treeple.stats import (
7 | PermutationHonestForestClassifier,
8 | build_cv_forest,
9 | build_permutation_forest,
10 | )
11 |
12 | seed = 12345
13 | rng = np.random.default_rng(seed)
14 |
15 |
16 | @pytest.mark.parametrize("bootstrap, max_samples", [(True, 1.6), (False, None)])
17 | def test_build_cv_honest_forest(bootstrap, max_samples):
18 | n_estimators = 100
19 | est = HonestForestClassifier(
20 | n_estimators=n_estimators,
21 | random_state=0,
22 | bootstrap=bootstrap,
23 | max_samples=max_samples,
24 | honest_fraction=0.5,
25 | stratify=True,
26 | )
27 | X = rng.normal(0, 1, (100, 2))
28 | X[:50] *= -1
29 | y = np.array([0, 1] * 50)
30 | samples = np.arange(len(y))
31 |
32 | est_list, proba_list, train_idx_list, test_idx_list = build_cv_forest(
33 | est,
34 | X,
35 | y,
36 | return_indices=True,
37 | seed=seed,
38 | cv=3,
39 | )
40 |
41 | assert isinstance(est_list, list)
42 | assert isinstance(proba_list, list)
43 |
44 | for est, proba, train_idx, test_idx in zip(est_list, proba_list, train_idx_list, test_idx_list):
45 | assert len(train_idx) + len(test_idx) == len(samples)
46 | structure_samples = est.structure_indices_
47 | leaf_samples = est.honest_indices_
48 |
49 | if not bootstrap:
50 | oob_samples = [[] for _ in range(est.n_estimators)]
51 | else:
52 | oob_samples = est.oob_samples_
53 |
54 |         # compared to oob samples, the train samples now comprise the entire
55 |         # dataset seen over the entire forest, while the test dataset is disjoint
56 | for tree_idx in range(est.n_estimators):
57 | n_samples_in_tree = len(structure_samples[tree_idx]) + len(leaf_samples[tree_idx])
58 | assert n_samples_in_tree + len(oob_samples[tree_idx]) == len(train_idx), (
59 | f"For tree: "
60 | f"{tree_idx} {len(structure_samples[tree_idx])} + "
61 | f"{len(leaf_samples[tree_idx])} + {len(oob_samples[tree_idx])} "
62 | f"!= {len(train_idx)} {len(test_idx)}"
63 | )
64 |
65 |
66 | def test_build_permutation_forest():
67 | """Simple test for building a permutation forest."""
68 | n_estimators = 30
69 | n_samples = 100
70 | n_features = 3
71 | rng = np.random.default_rng(seed)
72 |
73 |     # draw samples for one cluster; a shifted copy forms the second cluster
74 | _X = rng.uniform(size=(n_samples // 2, n_features))
75 | X2 = _X + 10
76 | X = np.vstack([_X, X2])
77 | y = np.vstack(
78 | [np.zeros((n_samples // 2, 1)), np.ones((n_samples // 2, 1))]
79 | ) # Binary classification
80 |
81 | clf = HonestForestClassifier(
82 | n_estimators=n_estimators, random_state=seed, n_jobs=-1, honest_fraction=0.5, bootstrap=True
83 | )
84 | perm_clf = PermutationHonestForestClassifier(
85 | n_estimators=n_estimators, random_state=seed, n_jobs=-1, honest_fraction=0.5, bootstrap=True
86 | )
87 | with pytest.raises(
88 | RuntimeError, match="Permutation forest must be a PermutationHonestForestClassifier"
89 | ):
90 | build_permutation_forest(clf, clf, X, y, seed=seed)
91 |
92 | forest_result, orig_forest_proba, perm_forest_proba = build_permutation_forest(
93 | clf, perm_clf, X, y, metric="s@98", n_repeats=20, seed=seed
94 | )
95 |     assert forest_result.observe_test_stat > 0.1, f"{forest_result.observe_test_stat}"
96 | assert forest_result.pvalue <= 0.05, f"{forest_result.pvalue}"
97 | assert_array_equal(orig_forest_proba.shape, perm_forest_proba.shape)
98 |
99 | X = np.vstack([_X, _X])
100 | forest_result, _, _ = build_permutation_forest(
101 | clf, perm_clf, X, y, metric="s@98", n_repeats=10, seed=seed
102 | )
103 | assert forest_result.pvalue > 0.05, f"{forest_result.pvalue}"
104 | assert forest_result.observe_test_stat < 0.05, f"{forest_result.observe_test_stat}"
105 |
--------------------------------------------------------------------------------
/examples/quantile_predictions/plot_quantile_interpolation_with_RF.py:
--------------------------------------------------------------------------------
1 | """
2 | ========================================================
3 | Predicting with different quantile interpolation methods
4 | ========================================================
5 |
6 | An example comparison of interpolation methods that can be applied during
7 | prediction when the desired quantile lies between two data points.
8 |
9 | This example was heavily inspired by the `quantile-forest
10 | <https://github.com/zillow/quantile-forest>`_ package.
11 | """
12 |
13 | from collections import defaultdict
14 |
15 | import matplotlib.pyplot as plt
16 | import numpy as np
17 | from sklearn.ensemble import RandomForestRegressor
18 |
19 | # %%
20 | # Generate the data
21 | # -----------------
22 | # We use four simple data points to illustrate the difference between the intervals that are
23 | # generated using different interpolation methods.
24 |
25 | X = np.array([[-1, -1], [-1, -1], [-1, -1], [1, 1], [1, 1]])
26 | y = np.array([-2, -1, 0, 1, 2])
27 |
28 | # %%
29 | # The interpolation methods
30 | # -------------------------
31 | # The interpolation methods demonstrated here are linear, lower, higher,
32 | # midpoint, and nearest. Each defines how to interpolate between two data points
33 | # ``i`` and ``j`` (``i <= j``) when the desired quantile lies between them; for
34 | # more details, see ``numpy.quantile``. The differences are illustrated below:
35 |
36 | interpolations = ["linear", "lower", "higher", "midpoint", "nearest"]
37 | colors = ["#006aff", "#ffd237", "#0d4599", "#f2a619", "#a6e5ff"]
38 | quantiles = [0.025, 0.5, 0.975]
39 |
40 | y_medians = []
41 | y_errs = []
42 | est = RandomForestRegressor(
43 | n_estimators=1,
44 | random_state=0,
45 | )
46 | # fit the model
47 | est.fit(X, y)
48 | # get the leaf nodes that each sample fell into
49 | leaf_ids = est.apply(X)
51 | # create one dictionary per tree that maps each leaf node to the
52 | # training samples that fell into it
52 | node_to_indices = []
53 | for tree in range(leaf_ids.shape[1]):
54 | d = defaultdict(list)
55 |     for sample_idx, leaf in enumerate(leaf_ids[:, tree]):
56 |         d[leaf].append(sample_idx)
57 | node_to_indices.append(d)
58 | # push the training samples down the trained tree and
59 | # get the ids of the leaf nodes each sample falls into
60 | leaf_ids_test = est.apply(X)
61 | # for each sample, collect the indices of the samples that fell into
62 | # the same leaf node in each tree
63 | y_pred_quantile = []
64 | for sample in range(leaf_ids_test.shape[0]):
65 | li = [
66 | node_to_indices[tree][leaf_ids_test[sample][tree]] for tree in range(leaf_ids_test.shape[1])
67 | ]
68 | # merge the list of indices into one
69 | idx = [item for sublist in li for item in sublist]
70 |     # get the y value for each corresponding id
71 | y_pred_quantile.append(y[idx])
72 |
73 | for interpolation in interpolations:
74 |     # get the quantile predictions for each sample
75 | y_pred = [
76 | np.array(
77 | [
78 | np.quantile(y_pred_quantile[i], quantile, method=interpolation)
79 | for i in range(len(y_pred_quantile))
80 | ]
81 | )
82 | for quantile in quantiles
83 | ]
84 | y_medians.append(y_pred[1])
85 | y_errs.append(
86 | np.concatenate(
87 | (
88 | [y_pred[1] - y_pred[0]],
89 | [y_pred[2] - y_pred[1]],
90 | ),
91 | axis=0,
92 | )
93 | )
94 |
95 | sc = plt.scatter(np.arange(len(y)) - 0.35, y, color="k", zorder=10)
96 | ebs = []
97 | for i, (median, y_err) in enumerate(zip(y_medians, y_errs)):
98 | ebs.append(
99 | plt.errorbar(
100 | np.arange(len(y)) + (0.15 * (i + 1)) - 0.35,
101 | median,
102 | yerr=y_err,
103 | color=colors[i],
104 | ecolor=colors[i],
105 | fmt="o",
106 | )
107 | )
108 | plt.xlim([-0.75, len(y) - 0.25])
109 | plt.xticks(np.arange(len(y)), X.tolist())
110 | plt.xlabel("Samples (Feature Values)")
111 | plt.ylabel("Actual and Predicted Values")
112 | plt.legend([sc] + ebs, ["actual"] + interpolations, loc=2)
113 | plt.show()
114 |
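115 | # %%
116 | # As a minimal numeric sketch, the five methods only differ when the desired
117 | # quantile falls strictly between two order statistics of the collected values:
118 | 
119 | toy = np.array([-2, -1, 0, 1, 2])
120 | for interpolation in interpolations:
121 |     print(interpolation, np.quantile(toy, 0.975, method=interpolation))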
--------------------------------------------------------------------------------
/treeple/tests/test_extensions.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pytest
3 | from numpy.testing import assert_array_equal
4 | from sklearn.datasets import make_classification
5 |
6 | from treeple import (
7 | ExtraObliqueRandomForestClassifier,
8 | ExtraObliqueRandomForestRegressor,
9 | HonestForestClassifier,
10 | ObliqueRandomForestClassifier,
11 | ObliqueRandomForestRegressor,
12 | PatchObliqueRandomForestClassifier,
13 | PatchObliqueRandomForestRegressor,
14 | )
15 |
16 |
17 | @pytest.mark.parametrize("n_classes", [2, 3])
18 | @pytest.mark.parametrize(
19 | "Forest",
20 | [
21 | HonestForestClassifier,
22 | ExtraObliqueRandomForestClassifier,
23 | ObliqueRandomForestClassifier,
24 | PatchObliqueRandomForestClassifier,
25 | ],
26 | )
27 | def test_predict_proba_per_tree(Forest, n_classes):
28 |     # Create a multi-class dataset to exercise the per-tree posterior
29 |     # predictions of each forest model
30 | X, y = make_classification(
31 | n_samples=100, n_features=50, n_informative=20, n_classes=n_classes, random_state=0
32 | )
33 |
34 | # Call the method being tested
35 | if Forest == HonestForestClassifier:
36 | est = Forest(n_estimators=10, bootstrap=True, random_state=0, honest_prior="empirical")
37 | else:
38 | est = Forest(n_estimators=10, bootstrap=True, random_state=0)
39 | est.fit(X, y)
40 | proba_per_tree = est.predict_proba_per_tree(X)
41 |
42 | # Perform assertions to check the correctness of the output
43 | assert proba_per_tree.shape[0] == est.n_estimators
44 | assert proba_per_tree.shape[1] == X.shape[0]
45 | assert proba_per_tree.shape[2] == est.n_classes_
46 | assert not np.isnan(proba_per_tree).any()
47 |
48 | proba_per_tree = est.predict_proba_per_tree(X, est.oob_samples_)
49 | # Perform assertions to check the correctness of the output
50 | assert proba_per_tree.shape[0] == est.n_estimators
51 | assert proba_per_tree.shape[1] == X.shape[0]
52 | assert proba_per_tree.shape[2] == est.n_classes_
53 | assert np.isnan(proba_per_tree).any()
54 |
55 |
56 | @pytest.mark.parametrize(
57 | "Forest",
58 | [
59 | HonestForestClassifier,
60 | ExtraObliqueRandomForestClassifier,
61 | ObliqueRandomForestClassifier,
62 | PatchObliqueRandomForestClassifier,
63 | ObliqueRandomForestRegressor,
64 | PatchObliqueRandomForestRegressor,
65 | ExtraObliqueRandomForestRegressor,
66 | ],
67 | )
68 | @pytest.mark.parametrize("bootstrap", [True, False])
69 | @pytest.mark.parametrize("random_state", [None, 0])
70 | def test_forest_has_deterministic_sampling_for_oob_structure_and_leaves(
71 | Forest, bootstrap, random_state
72 | ):
73 | """Test that forest models can produce the oob and inbag samples deterministically.
74 |
75 |     When bootstrap is True, oob samples should be exclusive of the in-bag samples.
76 |     When bootstrap is False, there are no oob samples.
77 | """
78 | rng = np.random.default_rng(0)
79 |
80 | n_estimators = 5
81 | est = Forest(
82 | n_estimators=n_estimators,
83 | random_state=random_state,
84 | bootstrap=bootstrap,
85 | )
86 | X = rng.normal(0, 1, (100, 2))
87 | X[:50] *= -1
88 | y = [0, 1] * 50
89 | samples = np.arange(len(y))
90 |
91 | est.fit(X, y)
92 |
93 | inbag_samples = est.estimators_samples_
94 | oob_samples = [
95 | [idx for idx in samples if idx not in inbag_samples[jdx]] for jdx in range(n_estimators)
96 | ]
97 | if not bootstrap:
98 | assert all(oob_list_ == [] for oob_list_ in oob_samples)
99 |
100 | with pytest.raises(RuntimeError, match="Cannot extract out-of-bag samples"):
101 | est.oob_samples_
102 | else:
103 | oob_samples_ = est.oob_samples_
104 | for itree in range(n_estimators):
105 | assert len(oob_samples[itree]) > 1, oob_samples[itree]
106 | assert set(inbag_samples[itree]).intersection(set(oob_samples_[itree])) == set()
107 | assert set(inbag_samples[itree]).union(set(oob_samples_[itree])) == set(samples)
108 | assert_array_equal(oob_samples_[itree], oob_samples[itree])
109 |
--------------------------------------------------------------------------------
/treeple/stats/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | import os
3 |
4 | import numpy as np
5 | import pytest
6 | import scipy.sparse as sp
7 | from numpy.testing import assert_array_equal
8 |
9 | import treeple.stats.utils as utils
10 | from treeple import HonestForestClassifier
11 | from treeple.stats.utils import get_per_tree_oob_samples
12 |
13 | seed = 1234
14 | rng = np.random.default_rng(seed)
15 |
16 |
17 | @pytest.mark.parametrize("bootstrap", [True, False])
18 | def test_get_per_tree_oob_samples(bootstrap):
19 | n_estimators = 5
20 | est = HonestForestClassifier(n_estimators=n_estimators, random_state=0, bootstrap=bootstrap)
21 |
22 | X = rng.normal(0, 1, (100, 2))
23 | X[:50] *= -1
24 | y = [0, 1] * 50
25 | samples = np.arange(len(y))
26 | est.fit(X, y)
27 |
28 | if bootstrap:
29 | inbag_samples = est.estimators_samples_
30 | oob_samples = [
31 | [idx for idx in samples if idx not in inbag_samples[jdx]] for jdx in range(n_estimators)
32 | ]
33 | oob_samples_ = get_per_tree_oob_samples(est)
34 | for itree in range(n_estimators):
35 | assert len(oob_samples[itree]) > 1
36 | assert_array_equal(oob_samples_[itree], oob_samples[itree])
37 | else:
38 | with pytest.raises(RuntimeError, match="Cannot extract out-of-bag samples"):
39 | get_per_tree_oob_samples(est)
40 |
41 |
42 | @pytest.mark.parametrize("use_bottleneck", [True, False])
43 | def test_non_nan_samples(use_bottleneck: bool):
44 | if use_bottleneck and utils.DISABLE_BN_ENV_VAR in os.environ:
45 | del os.environ[utils.DISABLE_BN_ENV_VAR]
46 | importlib.reload(utils)
47 | else:
48 | os.environ[utils.DISABLE_BN_ENV_VAR] = "1"
49 | importlib.reload(utils)
50 |
51 | posterior_array = np.array(
52 | [
53 | # tree 1
54 | [
55 | [0, 1],
56 | [np.nan, np.nan],
57 | [np.nan, np.nan],
58 | ],
59 | # tree 2
60 | [
61 | [0, 1],
62 | [np.nan, np.nan],
63 | [1, 0],
64 | ],
65 | ]
66 | ) # [2, 3, 2]
67 |
68 | expected = np.array([0, 2])
69 | actual = utils._non_nan_samples(posterior_array)
70 | np.testing.assert_array_equal(expected, actual)
71 |
72 |
73 | @pytest.mark.parametrize("use_bottleneck", [True, False])
74 | def test_nanmean_f(use_bottleneck: bool):
75 | if use_bottleneck and utils.DISABLE_BN_ENV_VAR in os.environ:
76 | del os.environ[utils.DISABLE_BN_ENV_VAR]
77 | importlib.reload(utils)
78 | else:
79 | os.environ[utils.DISABLE_BN_ENV_VAR] = "1"
80 | importlib.reload(utils)
81 |
82 | posterior_array = np.array(
83 | [
84 | [1, 2, np.nan],
85 | [3, 4, np.nan],
86 | ]
87 | )
88 |
89 | expected = np.array([1.5, 3.5])
90 | actual = utils.nanmean_f(posterior_array, axis=1)
91 | np.testing.assert_array_equal(expected, actual)
92 |
93 |
94 | @pytest.mark.parametrize(
95 | ("forest_indices", "expected"),
96 | [
97 | (np.arange(3), np.array([0.375, 0.75, 0.25])),
98 | (np.arange(3) + 2, np.array([0.10, 0.05, 0.25])),
99 | (np.arange(3) + 3, np.array([0.10, 0.45, np.nan])),
100 | ],
101 | )
102 | def test_get_forest_preds_sparse(
103 | forest_indices,
104 | expected,
105 | ):
106 |
107 | all_y_pred = sp.csc_matrix(
108 | np.array(
109 | [
110 | [0.50, 0.00, 0.00],
111 | [0.25, 0.75, 0.00],
112 | [0.00, 0.00, 0.25],
113 | [0.10, 0.00, 0.00],
114 | [0.00, 0.05, 0.00],
115 | [0.00, 0.85, 0.00],
116 | ]
117 | )
118 | )
119 |
120 | all_y_indicator = sp.csc_matrix(
121 | np.array(
122 | [
123 | [1, 0, 0],
124 | [1, 1, 0],
125 | [0, 0, 1],
126 | [1, 0, 0],
127 | [0, 1, 0],
128 | [0, 1, 0],
129 | ]
130 | )
131 | )
132 |
133 | np.testing.assert_array_equal(
134 | utils._get_forest_preds_sparse(all_y_pred, all_y_indicator, forest_indices),
135 | expected,
136 | )
137 |
--------------------------------------------------------------------------------
/treeple/tests/test_unsupervised_forest.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pytest
3 | from sklearn import datasets
4 | from sklearn.cluster import AgglomerativeClustering
5 | from sklearn.datasets import make_blobs
6 | from sklearn.metrics import adjusted_rand_score
7 | from sklearn.utils.estimator_checks import parametrize_with_checks
8 |
9 | from treeple.ensemble import UnsupervisedObliqueRandomForest, UnsupervisedRandomForest
10 |
11 | CLUSTER_CRITERIONS = ("twomeans", "fastbic")
12 |
13 | FOREST_CLUSTERS = {
14 | "UnsupervisedRandomForest": UnsupervisedRandomForest,
15 | "UnsupervisedObliqueRandomForest": UnsupervisedObliqueRandomForest,
16 | }
17 |
18 | # load iris dataset
19 | iris = datasets.load_iris()
20 | rng = np.random.RandomState(1)
21 | perm = rng.permutation(iris.target.size)
22 | iris.data = iris.data[perm]
23 | iris.target = iris.target[perm]
24 |
25 |
26 | @parametrize_with_checks(
27 | [
28 | UnsupervisedRandomForest(random_state=12345, n_estimators=50),
29 | UnsupervisedObliqueRandomForest(random_state=12345, n_estimators=50),
30 | ]
31 | )
32 | def test_sklearn_compatible_estimator(estimator, check):
33 | if check.func.__name__ in [
34 | # Cannot apply agglomerative clustering on < 2 samples
35 | "check_methods_subset_invariance",
36 | # sample weights do not necessarily imply a sample is not used in clustering
37 | "check_sample_weight_equivalence",
38 | "check_sample_weight_equivalence_on_dense_data",
39 | "check_sample_weight_equivalence_on_sparse_data",
40 | # sample order is not preserved in predict
41 | "check_methods_sample_order_invariance",
42 | ]:
43 | pytest.skip()
44 | check(estimator)
45 |
46 |
47 | @pytest.mark.parametrize("name, forest", FOREST_CLUSTERS.items())
48 | @pytest.mark.parametrize("criterion", CLUSTER_CRITERIONS)
49 | def test_check_simulation(name, forest, criterion):
50 | n_samples = 200
51 | n_classes = 2
52 |
53 |     # expected scores depend on the forest type and criterion
54 | if name == "UnsupervisedRandomForest":
55 | n_features = 5
56 | if criterion == "twomeans":
57 | expected_score = 0.05
58 | elif criterion == "fastbic":
59 | expected_score = 0.35
60 | else:
61 | n_features = 20
62 |
63 | # in the forest setting, we can overfit the training dataset perfectly
64 | expected_score = 1.0
65 | X, y = make_blobs(
66 | n_samples=n_samples, centers=n_classes, n_features=n_features, random_state=12345
67 | )
68 |
69 | clf = forest(criterion=criterion, random_state=12345)
70 | clf.fit(X)
71 | sim_mat = clf.compute_similarity_matrix(X)
72 |
73 | # all ones along the diagonal
74 | assert np.array_equal(sim_mat.diagonal(), np.ones(n_samples))
75 |
76 |     cluster = AgglomerativeClustering(n_clusters=n_classes)
77 |     predict_labels = cluster.fit_predict(sim_mat)
78 | score = adjusted_rand_score(y, predict_labels)
79 |
80 | # XXX: This should be > 0.9 according to the UReRF. However, that could be because they used
81 | # the oblique projections by default
82 | assert (
83 | score >= expected_score
84 | ), f"{name}-blobs failed with criterion {criterion} and score = {score}"
85 |
86 |
87 | @pytest.mark.parametrize("name, forest", FOREST_CLUSTERS.items())
88 | @pytest.mark.parametrize("criterion", CLUSTER_CRITERIONS)
89 | def test_check_iris(name, forest, criterion):
90 | # Check consistency on dataset iris.
91 | n_classes = 3
92 | est = forest(criterion=criterion, random_state=12345)
93 | est.fit(iris.data, iris.target)
94 | sim_mat = est.compute_similarity_matrix(iris.data)
95 |
96 | if criterion == "twomeans":
97 | if "oblique" in name.lower():
98 | expected_score = 0.21
99 | else:
100 | expected_score = 0.2
101 | elif criterion == "fastbic":
102 | if "oblique" in name.lower():
103 | expected_score = 0.55
104 | else:
105 | expected_score = 0.3
106 |
107 |     cluster = AgglomerativeClustering(n_clusters=n_classes)
108 |     predict_labels = cluster.fit_predict(sim_mat)
109 | score = adjusted_rand_score(iris.target, predict_labels)
110 |
111 |     # Two-means and fastBIC criteria perform similarly here
112 | assert (
113 | score > expected_score
114 | ), f"{name}-iris failed with criterion {criterion} and score = {score}"
115 |
--------------------------------------------------------------------------------
/treeple/meson.build:
--------------------------------------------------------------------------------
1 | # Platform detection
2 | is_windows = host_machine.system() == 'windows'
3 | is_mingw = is_windows and cc.get_id() == 'gcc'
4 |
5 | c_args = []
6 | cython_c_args = []
7 | if is_windows
8 | # For mingw-w64, link statically against the UCRT.
9 | gcc_link_args = ['-lucrt', '-static']
10 | if is_mingw
11 | add_project_link_arguments(gcc_link_args, language: ['c', 'cpp'])
12 | # Force gcc to float64 long doubles for compatibility with MSVC
13 | # builds, for C only.
14 | add_project_arguments('-mlong-double-64', language: 'c')
15 | # Make fprintf("%zd") work (see https://github.com/rgommers/scipy/issues/118)
16 | add_project_arguments('-D__USE_MINGW_ANSI_STDIO=1', language: ['c', 'cpp'])
17 | # Manual add of MS_WIN64 macro when not using MSVC.
18 | # https://bugs.python.org/issue28267
19 | bitness = run_command(
20 | '_build_utils/gcc_build_bitness.py',
21 | check: true
22 | ).stdout().strip()
23 | if bitness == '64'
24 | add_project_arguments('-DMS_WIN64', language: ['c', 'cpp'])
25 | endif
26 | # Silence warnings emitted by PyOS_snprintf for (%zd), see
27 | # https://github.com/rgommers/scipy/issues/118.
28 | # Use as c_args for extensions containing Cython code
29 | c_args += ['-Wno-format-extra-args', '-Wno-format']
30 | endif
31 | endif
32 |
33 | openmp_dep = dependency('OpenMP', language: 'c', required: false)
34 |
35 | if not openmp_dep.found()
36 | warning(
37 | '''
38 | ***********
39 | * WARNING *
40 | ***********
41 |
42 | It seems that treeple cannot be built with OpenMP.
43 |
44 | - Make sure you have followed the installation instructions:
45 |
46 | https://scikit-learn.org/dev/developers/advanced_installation.html
47 |
48 | - If your compiler supports OpenMP but you still see this
49 | message, please submit a bug report at:
50 |
51 | https://github.com/treeple/treeple/issues
52 |
53 | - The build will continue with OpenMP-based parallelism
54 | disabled. Note however that some estimators will run in
55 | sequential mode instead of leveraging thread-based
56 | parallelism.
57 |
58 | ***
59 | ''')
60 | endif
61 |
62 | # NumPy include directory - needed in all submodules
63 | incdir_numpy = meson.get_external_property('numpy-include-dir', 'not-given')
64 | if incdir_numpy == 'not-given'
65 | incdir_numpy = run_command(py,
66 | [
67 | '-c',
68 | '''
69 | import os
70 | import numpy as np
71 | try:
72 | incdir = os.path.relpath(np.get_include())
73 | except Exception:
74 | incdir = np.get_include()
75 | print(incdir)
76 | '''
77 | ],
78 | check: true
79 | ).stdout().strip()
80 | endif
81 |
82 | inc_np = include_directories(incdir_numpy)
83 | # Don't use the deprecated NumPy C API. Define this to a fixed version instead of
84 | # NPY_API_VERSION in order not to break compilation for released versions
85 | # when NumPy introduces a new deprecation.
86 | numpy_no_deprecated_api = ['-DNPY_NO_DEPRECATED_API=NPY_1_9_API_VERSION']
87 | np_dep = declare_dependency(include_directories: inc_np, compile_args: numpy_no_deprecated_api)
88 |
89 | cc = meson.get_compiler('c')
90 |
91 | # Don't use the deprecated NumPy C API. Define this to a fixed version instead of
92 | # NPY_API_VERSION in order not to break compilation for released versions
93 | # when NumPy introduces a new deprecation. Use in a meson.build file::
94 | #
95 | # py.extension_module('_name',
96 | # 'source_fname',
97 | # numpy_nodepr_api)
98 |
99 | # TODO XXX: ENABLE WHEN DEBUGGING
100 | boundscheck = 'False'
101 |
102 | scikit_learn_cython_args = [
103 | '-X language_level=3', '-X boundscheck=' + boundscheck, '-X wraparound=False',
104 | '-X initializedcheck=False', '-X nonecheck=False', '-X cdivision=True',
105 | '-X profile=False',
106 | '-X embedsignature=True',
107 | # Needed for cython imports across subpackages, e.g. cluster pyx that
108 | # cimports metrics pxd
109 | '--include-dir', meson.global_build_root(),
110 | ]
111 | cython_c_args += scikit_learn_cython_args
112 |
113 | python_sources = [
114 | '__init__.py',
115 | 'neighbors.py',
116 | 'conftest.py',
117 | ]
118 |
119 | py.install_sources(
120 | python_sources,
121 | subdir: 'treeple'
122 | )
123 |
124 | subdir('_lib')
125 | subdir('ensemble')
126 | subdir('experimental')
127 | subdir('stats')
128 | subdir('tests')
129 | subdir('tree')
130 | subdir('datasets')
131 |
--------------------------------------------------------------------------------
/treeple/experimental/tests/test_sdf.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pytest
3 | from sklearn import datasets
4 | from sklearn.metrics import accuracy_score, r2_score
5 | from sklearn.utils.estimator_checks import parametrize_with_checks
6 |
7 | from treeple.experimental import StreamDecisionForest
8 |
9 | CLF_CRITERIONS = ("gini", "entropy")
10 |
11 | # also load the iris dataset
12 | # and randomly permute it
13 | iris = datasets.load_iris()
14 | rng = np.random.RandomState(1)
15 | perm = rng.permutation(iris.target.size)
16 | iris.data = iris.data[perm]
17 | iris.target = iris.target[perm]
18 |
19 |
20 | def test_toy_accuracy():
21 | clf = StreamDecisionForest(n_estimators=10)
22 | X = np.ones((20, 4))
23 | X[10:] *= -1
24 | y = [0] * 10 + [1] * 10
25 | clf = clf.fit(X, y)
26 | np.testing.assert_array_equal(clf.predict(X), y)
27 |
28 |
29 | def test_first_fit():
30 | clf = StreamDecisionForest(n_estimators=10)
31 | with pytest.raises(
32 | ValueError, match="classes must be passed on the first call to partial_fit."
33 | ):
34 | clf.partial_fit(iris.data, iris.target)
35 |
36 |
37 | @pytest.mark.parametrize("criterion", ["gini", "entropy"])
38 | @pytest.mark.parametrize("max_features", [None, 2])
39 | def test_iris(criterion, max_features):
40 | # Check consistency on dataset iris.
41 | clf = StreamDecisionForest(
42 | criterion=criterion,
43 | random_state=0,
44 | max_features=max_features,
45 | n_estimators=10,
46 | )
47 |
48 | clf.partial_fit(iris.data, iris.target, classes=np.unique(iris.target))
49 | score = accuracy_score(clf.predict(iris.data), iris.target)
50 |
51 | assert score > 0.5 and score <= 1.0, "Failed with {0}, criterion = {1} and score = {2}".format(
52 | "SDF", criterion, score
53 | )
54 |
55 | score = accuracy_score(clf.predict(iris.data), clf.predict_proba(iris.data).argmax(1))
56 | assert score == 1.0, "Failed with {0}, criterion = {1} and score = {2}".format(
57 | "SDF", criterion, score
58 | )
59 |
60 | clf.partial_fit(iris.data, iris.target)
61 | score = accuracy_score(clf.predict(iris.data), iris.target)
62 |
63 | assert (
64 | score > 0.5 and score <= 1.0
65 | ), "Failed partial_fit with {0}, criterion = {1} and score = {2}".format(
66 | "SDF", criterion, score
67 | )
68 |
69 | score = accuracy_score(clf.predict(iris.data), clf.predict_proba(iris.data).argmax(1))
70 | assert score == 1.0, "Failed partial_fit with {0}, criterion = {1} and score = {2}".format(
71 | "SDF", criterion, score
72 | )
73 |
74 |
75 | @pytest.mark.parametrize("criterion", ["gini", "entropy"])
76 | @pytest.mark.parametrize("max_features", [None, 2])
77 | def test_iris_multi(criterion, max_features):
78 | # Check consistency on dataset iris.
79 | clf = StreamDecisionForest(
80 | criterion=criterion,
81 | random_state=0,
82 | max_features=max_features,
83 | n_estimators=10,
84 | )
85 |
86 | second_y = np.concatenate([(np.ones(50) * 3), (np.ones(50) * 4), (np.ones(50) * 5)])
87 |
88 | X = iris.data
89 | y = np.stack((iris.target, second_y[perm])).T
90 |
91 | clf.fit(X, y)
92 |     score = r2_score(y, clf.predict(X))
93 | assert score > 0.9 and score <= 1.0, "Failed with {0}, criterion = {1} and score = {2}".format(
94 | "SDF", criterion, score
95 | )
96 |
97 |
98 | def test_max_samples():
99 | max_samples_list = [8, 0.5, None]
100 | depths = []
101 | X = rng.normal(0, 1, (100, 2))
102 | X[:50] *= -1
103 | y = [0, 1] * 50
104 | for ms in max_samples_list:
105 | uf = StreamDecisionForest(n_estimators=2, random_state=0, max_samples=ms, bootstrap=True)
106 | uf = uf.fit(X, y)
107 | depths.append(uf.estimators_[0].get_depth())
108 |
109 | assert all(np.diff(depths) > 0)
110 |
111 |
112 | @parametrize_with_checks([StreamDecisionForest(n_estimators=10, random_state=0)])
113 | def test_sklearn_compatible_estimator(estimator, check):
114 | # 1. check_class_weight_classifiers is not supported since it requires sample weight
115 | # XXX: can include this "generalization" in the future if it's useful
116 | if check.func.__name__ in [
117 | "check_class_weight_classifiers",
118 | "check_sample_weight_equivalence",
119 | "check_sample_weight_equivalence_on_dense_data",
120 | "check_sample_weight_equivalence_on_sparse_data",
121 | ]:
122 | pytest.skip()
123 | check(estimator)
124 |
--------------------------------------------------------------------------------
/examples/sparse_oblique_trees/plot_oblique_random_forest.py:
--------------------------------------------------------------------------------
1 | """
2 | ===============================================================================
3 | Plot oblique forest and axis-aligned random forest predictions on cc18 datasets
4 | ===============================================================================
5 |
6 | A performance comparison between oblique forest and standard axis-
7 | aligned random forest using three datasets from OpenML benchmarking suites.
8 |
9 | Two of these datasets, namely
10 | [WDBC](https://www.openml.org/search?type=data&sort=runs&id=1510)
11 | and [Phishing Website](https://www.openml.org/search?type=data&sort=runs&id=4534)
12 | datasets, consist of 31 features each, where the former dataset is entirely
13 | numeric and the latter is entirely nominal. The third dataset, dubbed
14 | [cnae-9](https://www.openml.org/search?type=data&status=active&id=1468), is a
15 | numeric dataset with a notably large feature space of 857 features. As you
16 | will notice, among these three datasets, the oblique forest outperforms the
17 | axis-aligned random forest on cnae-9 by utilizing its sparse random projection
18 | mechanism. All datasets are subsampled due to computational constraints.
19 |
20 | For an example of using extra-oblique trees/forests in practice on data, see the following
21 | example :ref:`sphx_glr_auto_examples_sparse_oblique_trees_plot_extra_oblique_random_forest.py`.
22 | """
23 |
24 | from datetime import datetime
25 |
26 | import matplotlib.pyplot as plt
27 | import pandas as pd
28 | import seaborn as sns
29 | from sklearn.datasets import fetch_openml
30 | from sklearn.ensemble import RandomForestClassifier
31 | from sklearn.model_selection import RepeatedKFold, cross_validate
32 |
33 | from treeple import ObliqueRandomForestClassifier
34 |
35 | random_state = 123456
36 | t0 = datetime.now()
37 | data_ids = [4534, 1510, 1468] # openml dataset id
38 | df = pd.DataFrame()
39 |
40 |
41 | def load_cc18(data_id):
42 | df = fetch_openml(data_id=data_id, as_frame=True, parser="pandas")
43 |
44 | # extract the dataset name
45 | d_name = df.details["name"]
46 |
47 | # Subsampling large datasets
48 | if data_id == 1468:
49 | n = 100
50 | else:
51 | n = int(df.frame.shape[0] * 0.8)
52 |
53 | df = df.frame.sample(n, random_state=random_state)
54 | X, y = df.iloc[:, :-1], df.iloc[:, -1]
55 |
56 | return X, y, d_name
57 |
58 |
59 | def get_scores(X, y, d_name, n_cv=5, n_repeats=1, **kwargs):
60 | clfs = [RandomForestClassifier(**kwargs), ObliqueRandomForestClassifier(**kwargs)]
61 |
62 | tmp = []
63 |
64 | for i, clf in enumerate(clfs):
65 | cv = RepeatedKFold(n_splits=n_cv, n_repeats=n_repeats, random_state=kwargs["random_state"])
66 | test_score = cross_validate(estimator=clf, X=X, y=y, cv=cv, scoring="accuracy")
67 |
68 | tmp.append(
69 | [
70 | d_name,
71 | ["RF", "OF"][i],
72 | test_score["test_score"],
73 | test_score["test_score"].mean(),
74 | ]
75 | )
76 |
77 | df = pd.DataFrame(
78 | tmp, columns=["dataset", "model", "score", "mean"]
79 | ) # dtype=[('model',object), ('score',float), ('mean',float)])
80 | df = df.explode("score")
81 | df["score"] = df["score"].astype(float)
82 | df.reset_index(inplace=True, drop=True)
83 |
84 | return df
85 |
86 |
87 | params = {
88 | "max_features": None,
89 | "n_estimators": 50,
90 | "max_depth": None,
91 | "random_state": random_state,
92 | "n_cv": 2,
93 | "n_repeats": 1,
94 | }
95 |
96 | for data_id in data_ids:
97 | X, y, d_name = load_cc18(data_id=data_id)
98 | print(f"Loading [{d_name}] dataset..")
99 | tmp = get_scores(X=X, y=y, d_name=d_name, **params)
100 | df = pd.concat([df, tmp])
101 |
102 | print(f"It took {(datetime.now()-t0).seconds} seconds to run the script")
103 |
104 | # Draw a comparison plot
105 | d_names = df.dataset.unique()
106 | N = d_names.shape[0]
107 |
108 | fig, ax = plt.subplots(1, N)
109 | fig.set_size_inches(6 * N, 6)
110 |
111 | for i, name in enumerate(d_names):
112 | sns.stripplot(
113 | data=df.query(f'dataset == "{name}"'),
114 | x="model",
115 | y="score",
116 | ax=ax[i],
117 | dodge=True,
118 | )
119 | sns.boxplot(
120 | data=df.query(f'dataset == "{name}"'),
121 | x="model",
122 | y="score",
123 | ax=ax[i],
124 | color="white",
125 | )
126 | ax[i].set_title(name)
127 | if i != 0:
128 | ax[i].set_ylabel("")
129 | ax[i].set_xlabel("")
130 |
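131 | # %%
132 | # As a short follow-up sketch, the mean accuracy per dataset and model can be
133 | # summarized directly from the collected ``df``:
134 | 
135 | print(df.groupby(["dataset", "model"], sort=False)["score"].mean().unstack())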
--------------------------------------------------------------------------------
/examples/treeple/treeple_tutorial_1_1a_SA98.py:
--------------------------------------------------------------------------------
1 | """
2 | ================
3 | Calculating S@98
4 | ================
5 | """
6 |
7 | import matplotlib.pyplot as plt
8 | import numpy as np
9 | import seaborn as sns
10 | from sklearn.metrics import roc_curve
11 |
12 | from treeple.datasets import make_trunk_classification
13 | from treeple.ensemble import HonestForestClassifier
14 | from treeple.stats import build_oob_forest
15 |
16 | sns.set(color_codes=True, style="white", context="talk", font_scale=1.5)
17 | PALETTE = sns.color_palette("Set1")
18 | sns.set_palette(PALETTE[1:5] + PALETTE[6:], n_colors=9)
19 | sns.set_style("white", {"axes.edgecolor": "#dddddd"})
20 |
21 | # %%
22 | # S@98
23 | # ----
24 | #
25 | # Sensitivity at 98% specificity (*S@98*) measures the true
26 | # positive rate (*TPR*) when the false positive rate (*FPR*) is at 2%.
27 | #
28 | # .. math:: S@r = \mathbb{P}[\eta(X) > T_r \mid Y=1]
29 | #
30 | # With a binary class simulation as an example, this tutorial will show
31 | # how to use ``treeple`` to calculate the statistic.
32 |
33 | # %%
34 | # Create a simulation with two gaussians
35 | # --------------------------------------
36 |
37 |
38 | # create a binary class simulation with two Gaussians:
39 | # 500 samples for each class; class zero is a standard
40 | # Gaussian, and class one has a mean at one
41 | X, y = make_trunk_classification(
42 | n_samples=1000,
43 | n_dim=1,
44 | mu_0=0,
45 | mu_1=1,
46 | n_informative=1,
47 | seed=1,
48 | )
49 |
50 |
51 | fig, ax = plt.subplots(figsize=(6, 6))
52 | fig.tight_layout()
53 | ax.tick_params(labelsize=15)
54 |
55 | # histogram plot the samples
56 | ax.hist(X[:500], bins=50, alpha=0.6, color=PALETTE[1], label="negative")
57 | ax.hist(X[500:], bins=50, alpha=0.3, color=PALETTE[0], label="positive")
58 | ax.set_xlabel("Variable One", fontsize=15)
59 | ax.set_ylabel("Likelihood", fontsize=15)
60 | plt.legend(frameon=False, fontsize=15)
61 | plt.show()
62 |
63 | # %%
64 | # Fit the model
65 | # -------------
66 |
67 |
68 | # initialize the forest with 100 trees
69 | est = HonestForestClassifier(
70 | n_estimators=100,
71 | max_samples=1.6,
72 | max_features=0.3,
73 | bootstrap=True,
74 | stratify=True,
75 | random_state=1,
76 | )
77 |
78 | # fit the model and obtain the tree posteriors
79 | _, observe_proba = build_oob_forest(est, X, y)
80 |
81 | # generate forest posteriors for the two classes
82 | observe_proba = np.nanmean(observe_proba, axis=0)
83 |
84 |
85 | fig, ax = plt.subplots(figsize=(6, 6))
86 | fig.tight_layout()
87 | ax.tick_params(labelsize=15)
88 |
89 | # histogram plot the posterior probabilities for class one
90 | ax.hist(observe_proba[:500][:, 1], bins=50, alpha=0.6, color=PALETTE[1], label="negative")
91 | ax.hist(observe_proba[500:][:, 1], bins=50, alpha=0.3, color=PALETTE[0], label="positive")
92 | ax.set_ylabel("# of Samples", fontsize=15)
93 | ax.set_xlabel("Class One Posterior", fontsize=15)
94 | plt.legend(frameon=False, fontsize=15)
95 | plt.show()
96 |
97 | # %%
98 | # Calculate the statistic
99 | # -----------------------
100 |
101 |
102 | def calculate_sa(y_true, y_pred_proba, max_fpr=0.02) -> float:
103 | """Calculate the sensitivity at a specific specificity"""
104 | # check the shape of true labels
105 | if y_true.squeeze().ndim != 1:
106 | raise ValueError(f"y_true must be 1d, not {y_true.shape}")
107 |
108 | # find the positive class and calculate fpr and tpr
109 | if 0 in y_true or -1 in y_true:
110 | fpr, tpr, thresholds = roc_curve(
111 | y_true, y_pred_proba[:, 1], pos_label=1, drop_intermediate=False
112 | )
113 | else:
114 | fpr, tpr, thresholds = roc_curve(
115 | y_true, y_pred_proba[:, 1], pos_label=2, drop_intermediate=False
116 | )
117 |     sa98 = max(tpr_ for fpr_, tpr_ in zip(fpr, tpr) if fpr_ <= max_fpr)
118 |
119 | fig, ax = plt.subplots(figsize=(6, 6))
120 | fig.tight_layout()
121 | ax.tick_params(labelsize=15)
122 | ax.set_xlim([-0.005, 1.005])
123 | ax.set_ylim([-0.005, 1.005])
124 | ax.set_xlabel("False Positive Rate", fontsize=15)
125 | ax.set_ylabel("True Positive Rate", fontsize=15)
126 |
127 | ax.plot(fpr, tpr, label="ROC curve", color=PALETTE[1])
128 |
129 | spec = int((1 - max_fpr) * 100)
130 | ax.axvline(
131 | x=max_fpr,
132 | color=PALETTE[0],
133 | ymin=0,
134 | ymax=sa98,
135 | label="S@" + str(spec) + " = " + str(round(sa98, 2)),
136 | linestyle="--",
137 | )
138 | ax.axhline(y=sa98, xmin=0, xmax=max_fpr, color="r", linestyle="--")
139 | ax.legend(frameon=False, fontsize=15)
140 |
141 | return sa98
142 |
143 |
144 | sa98 = calculate_sa(y, observe_proba, max_fpr=0.02)
145 | print("S@98 =", round(sa98, 2))
146 | # sphinx_gallery_thumbnail_number = -1
147 |
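148 | # %%
149 | # As a quick sanity-check sketch, S@98 can also be computed directly by
150 | # thresholding at the 98th percentile of the negative-class posteriors
151 | # (the first 500 samples are the negative class in this simulation):
152 | 
153 | threshold = np.nanquantile(observe_proba[:500, 1], 0.98)
154 | s_at_98_direct = float(np.mean(observe_proba[500:, 1] > threshold))
155 | print("S@98 (direct) =", round(s_at_98_direct, 2))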
--------------------------------------------------------------------------------
/examples/calibration/plot_honest_tree.py:
--------------------------------------------------------------------------------
1 | """
2 | ===========================================
3 | Comparison of Decision Tree and Honest Tree
4 | ===========================================
5 |
6 | This example compares the :class:`treeple.tree.HonestTreeClassifier` from the
7 | ``treeple`` library with the :class:`sklearn.tree.DecisionTreeClassifier`
8 | from scikit-learn on the Iris dataset.
9 |
10 | Both classifiers are fitted on the same dataset and their decision trees
11 | are plotted side by side.
12 | """
13 |
14 | import matplotlib.pyplot as plt
15 | from sklearn import config_context
16 | from sklearn.datasets import load_iris
17 | from sklearn.model_selection import train_test_split
18 | from sklearn.tree import DecisionTreeClassifier, plot_tree
19 |
20 | from treeple.tree import HonestTreeClassifier
21 |
22 | # Load the iris dataset
23 | iris = load_iris()
24 | X, y = iris.data, iris.target
25 | X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=0)
26 |
27 | # Initialize classifiers
28 | max_features = 0.3
29 |
30 | dishonest_clf = HonestTreeClassifier(
31 | honest_method=None,
32 | max_features=max_features,
33 | random_state=0,
34 | honest_prior="ignore",
35 | )
36 | honest_noprune_clf = HonestTreeClassifier(
37 | honest_method="apply",
38 | max_features=max_features,
39 | random_state=0,
40 | honest_prior="ignore",
41 | )
42 | honest_clf = HonestTreeClassifier(honest_method="prune", max_features=max_features, random_state=0)
43 | sklearn_clf = DecisionTreeClassifier(max_features=max_features, random_state=0)
44 |
45 | # Fit classifiers
46 | dishonest_clf.fit(X_train, y_train)
47 | honest_noprune_clf.fit(X_train, y_train)
48 | honest_clf.fit(X_train, y_train)
49 | sklearn_clf.fit(X_train, y_train)
50 |
51 | # Plotting the trees
52 | fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(15, 5))
53 |
54 | # .. note:: We skip parameter validation because internally the `plot_tree`
55 | # function checks if the estimator is a DecisionTreeClassifier
56 | # instance from scikit-learn, but the ``HonestTreeClassifier`` is
57 | # a subclass of a forked version of the DecisionTreeClassifier.
58 |
59 | # Plot the pruned honest tree
60 | ax = axes[2]
61 | with config_context(skip_parameter_validation=True):
62 | plot_tree(honest_clf, filled=True, ax=ax)
63 | ax.set_title("HonestTreeClassifier")
64 |
65 | # Plot the honest tree without pruning
66 | ax = axes[1]
67 | with config_context(skip_parameter_validation=True):
68 | plot_tree(honest_noprune_clf, filled=False, ax=ax)
69 | ax.set_title("HonestTreeClassifier (No pruning)")
70 |
71 | # Plot the dishonest tree (honesty disabled)
72 | ax = axes[0]
73 | with config_context(skip_parameter_validation=True):
74 | plot_tree(dishonest_clf, filled=False, ax=ax)
75 | ax.set_title("HonestTreeClassifier (Dishonest)")
76 |
77 |
78 | # Plot scikit-learn DecisionTreeClassifier tree
79 | plot_tree(sklearn_clf, filled=True, ax=axes[3])
80 | axes[3].set_title("DecisionTreeClassifier")
81 |
82 | plt.show()
83 |
84 | # %%
85 | # Discussion
86 | # ----------
87 | # The HonestTreeClassifier is a variant of the DecisionTreeClassifier that
88 | # provides honest inference. The honest inference is achieved by splitting the
89 | # dataset into two parts: the training set and the validation set. The training
90 | # set is used to build the tree, while the validation set is used to fit the
91 | # leaf nodes for posterior prediction. This results in calibrated posteriors
92 | # (see :ref:`sphx_glr_auto_examples_calibration_plot_overlapping_gaussians.py`).
93 | #
94 | # Compared to the ``honest_method='apply'`` setting, the ``honest_method='prune'``
95 | # method builds a tree that will not contain empty leaves, and also leverages
96 | # the validation set to check split conditions. Thus we see that the pruned
97 | # honest tree is significantly smaller than the regular decision tree.
98 |
99 | # %%
100 | # Evaluate predictions of the trees
101 | # ---------------------------------
102 | # When we do not prune, note that the honest tree will have empty leaves
103 | # that predict the prior. In this case, ``honest_prior='ignore'`` is used
104 | # to ignore these leaves when computing the posteriors, which will result
105 | # in a posterior that is ``np.nan``.
106 |
107 | # this is the same as a decision tree classifier that is trained on less data
108 | print("\nDishonest posteriors: ", dishonest_clf.predict_proba(X_val))
109 |
110 | # this is the honest tree with empty leaves that predict the prior
111 | print("\nHonest tree without pruning: ", honest_noprune_clf.predict_proba(X_val))
112 |
113 | # this is the honest tree that is pruned
114 | print("\nHonest tree with pruning: ", honest_clf.predict_proba(X_val))
115 |
116 | # this is a regular decision tree classifier from sklearn
117 | print("\nDTC: ", sklearn_clf.predict_proba(X_val))
118 |
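119 | # %%
120 | # As a final sketch, the ``honest_fraction`` parameter (the share of training
121 | # samples reserved for fitting the leaves; 0.5 below is an illustrative choice,
122 | # not a recommendation) controls the structure/leaf split of the honest tree:
123 | 
124 | half_honest_clf = HonestTreeClassifier(
125 |     honest_method="prune",
126 |     honest_fraction=0.5,
127 |     max_features=max_features,
128 |     random_state=0,
129 | )
130 | half_honest_clf.fit(X_train, y_train)
131 | print("\nHonest tree (honest_fraction=0.5): ", half_honest_clf.predict_proba(X_val))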
--------------------------------------------------------------------------------