├── treeple
│   ├── _lib
│   │   ├── __init__.py
│   │   └── meson.build
│   ├── tests
│   │   ├── __init__.py
│   │   ├── meson.build
│   │   ├── test_neighbors.py
│   │   ├── test_extensions.py
│   │   └── test_unsupervised_forest.py
│   ├── stats
│   │   ├── tests
│   │   │   ├── __init__.py
│   │   │   ├── meson.build
│   │   │   ├── test_permuteforest.py
│   │   │   ├── test_baseline.py
│   │   │   └── test_utils.py
│   │   ├── meson.build
│   │   └── __init__.py
│   ├── tree
│   │   ├── honesty
│   │   │   ├── __init__.py
│   │   │   ├── meson.build
│   │   │   └── _honest_prune.pxd
│   │   ├── manifold
│   │   │   ├── __init__.py
│   │   │   ├── meson.build
│   │   │   └── _morf_splitter.pxd
│   │   ├── tests
│   │   │   ├── __init__.py
│   │   │   ├── meson.build
│   │   │   └── test_honest_prune.py
│   │   ├── unsupervised
│   │   │   ├── __init__.py
│   │   │   ├── meson.build
│   │   │   ├── _unsup_oblique_tree.pxd
│   │   │   ├── _unsup_splitter.pxd
│   │   │   ├── _unsup_criterion.pxd
│   │   │   ├── _unsup_tree.pxd
│   │   │   └── _unsup_oblique_splitter.pxd
│   │   ├── _sklearn_splitter.pxd
│   │   ├── _marginal.pxd
│   │   ├── kernels.py
│   │   ├── _utils.pxd
│   │   ├── __init__.py
│   │   ├── meson.build
│   │   ├── _oblique_tree.pxd
│   │   └── _neighbors.py
│   ├── datasets
│   │   ├── tests
│   │   │   ├── __init__.py
│   │   │   └── meson.build
│   │   ├── meson.build
│   │   └── __init__.py
│   ├── experimental
│   │   ├── distributions.py
│   │   ├── tests
│   │   │   ├── __init__.py
│   │   │   ├── meson.build
│   │   │   ├── test_simulate.py
│   │   │   ├── test_mutual_info.py
│   │   │   └── test_sdf.py
│   │   ├── meson.build
│   │   └── __init__.py
│   ├── conftest.py
│   ├── ensemble
│   │   ├── meson.build
│   │   └── __init__.py
│   ├── _build_utils
│   │   └── gcc_build_bitness.py
│   ├── __init__.py
│   └── meson.build
├── .codespellignore
├── benchmarks
│   ├── __init__.py
│   ├── utils.py
│   ├── config.json
│   └── ensemble_supervised.py
├── doc
│   ├── sphinxext
│   │   ├── MANIFEST.in
│   │   ├── doi_role.py
│   │   ├── allow_nan_estimators.py
│   │   └── github_link.py
│   ├── _templates
│   │   ├── autosummary
│   │   │   ├── function.rst
│   │   │   └── class.rst
│   │   └── layout.html
│   ├── use.rst
│   ├── user_guide.rst
│   ├── whats_new
│   │   ├── changelog_legend.inc
│   │   ├── v0.10.rst
│   │   ├── v0.9.rst
│   │   ├── v0.5.rst
│   │   ├── _contributors.rst
│   │   ├── v0.3.rst
│   │   ├── v0.8.rst
│   │   ├── v0.4.rst
│   │   ├── v0.2.rst
│   │   ├── v0.1.rst
│   │   ├── v0.6.rst
│   │   └── v0.7.rst
│   ├── whats_new.rst
│   ├── make.bat
│   ├── _static
│   │   ├── style.css
│   │   └── versions.json
│   ├── install.rst
│   ├── index.rst
│   └── modules
│       ├── unsupervised_tree.rst
│       └── ensemble.rst
├── examples
│   ├── README.txt
│   ├── outlier_detection
│   │   └── README.txt
│   ├── splitters
│   │   └── README.txt
│   ├── multiview
│   │   └── README.txt
│   ├── calibration
│   │   ├── README.txt
│   │   └── plot_honest_tree.py
│   ├── quantile_predictions
│   │   ├── README.txt
│   │   ├── plot_quantile_vs_standard_oblique_forest.py
│   │   ├── plot_quantile_toy_example_with_RF.py
│   │   └── plot_quantile_interpolation_with_RF.py
│   ├── sklearn_vs_treeple
│   │   ├── README.txt
│   │   └── plot_iris_dtc.py
│   ├── sparse_oblique_trees
│   │   ├── README.txt
│   │   ├── plot_oblique_axis_aligned_forests_sparse_parity.py
│   │   └── plot_oblique_random_forest.py
│   └── treeple
│       ├── README.txt
│       ├── treeple_tutorial_1_1d_HD.py
│       ├── treeple_tutorial_1_1b_MI.py
│       └── treeple_tutorial_1_1a_SA98.py
├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── blank.md
│   │   ├── feature_request.md
│   │   └── bug_report.md
│   ├── dependabot.yml
│   ├── label-globs.yml
│   ├── workflows
│   │   ├── pull_request_labeler.yml
│   │   ├── cffconvert.yml
│   │   ├── circle_artifacts.yml
│   │   ├── style.yml
│   │   ├── release.yml
│   │   └── pr_checks.yml
│   ├── FUNDING.yml
│   └── PULL_REQUEST_TEMPLATE.md
├── test_requirements.txt
├── benchmarks_nonasv
│   ├── README.md
│   └── bench_plot_urf.py
├── .gitmodules
├── style_requirements.txt
├── .yamllint.yml
├── spin
├── .flake8
├── .gitignore
├── CITATION.cff
├── .pre-commit-config.yaml
├── meson.build
└── Makefile

--------------------------------------------------------------------------------
/treeple/_lib/__init__.py:
--------------------------------------------------------------------------------

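# NOTE (descriptive comment, based on .gitmodules and the imports under
# ``treeple._lib.sklearn``): this package hosts the vendored scikit-learn fork,
# which is pulled in as a git submodule and presumably built by the adjacent
# meson.build.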
--------------------------------------------------------------------------------
/treeple/tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/treeple/stats/tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/treeple/tree/honesty/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/treeple/tree/manifold/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/treeple/tree/tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/treeple/datasets/tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/treeple/experimental/distributions.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/treeple/experimental/tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/treeple/tree/unsupervised/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.codespellignore:
--------------------------------------------------------------------------------
raison
nd
parth
ot
fpr
master

--------------------------------------------------------------------------------
/benchmarks/__init__.py:
--------------------------------------------------------------------------------
"""Benchmark suite for treeple using ASV"""

--------------------------------------------------------------------------------
/doc/sphinxext/MANIFEST.in:
--------------------------------------------------------------------------------
recursive-include tests *.py
include *.txt

--------------------------------------------------------------------------------
/examples/README.txt:
--------------------------------------------------------------------------------
Examples
========

Examples demonstrating how to use treeple algorithms.

--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/blank.md:
--------------------------------------------------------------------------------
---
name: Blank issue
about: Create an issue without a template.

---

--------------------------------------------------------------------------------
/test_requirements.txt:
--------------------------------------------------------------------------------
joblib
pandas
pytest
pytest-cov
memory_profiler
flaky
tqdm
bottleneck

--------------------------------------------------------------------------------
/benchmarks_nonasv/README.md:
--------------------------------------------------------------------------------
A set of scripts that can be run to analyze runtime and performance of treeple
estimators.

--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
version: 2
updates:
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "weekly"

--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
[submodule "treeple/_lib/sklearn"]
    path = treeple/_lib/sklearn_fork
    url = https://github.com/neurodata/scikit-learn
    branch = submodulev3

--------------------------------------------------------------------------------
/examples/outlier_detection/README.txt:
--------------------------------------------------------------------------------
.. _outlier_examples:

Outlier-detection
-----------------

Examples concerning how to do outlier detection with decision trees.

--------------------------------------------------------------------------------
/examples/splitters/README.txt:
--------------------------------------------------------------------------------
.. _splitter_examples:

Decision-tree splitters
-----------------------

Examples demonstrating different node-splitting strategies for decision trees.

--------------------------------------------------------------------------------
/style_requirements.txt:
--------------------------------------------------------------------------------
mypy
black
isort
flake8
bandit
pydocstyle
codespell
toml
cython-lint
pre-commit
yamllint
toml-sort
ruff
rstcheck

--------------------------------------------------------------------------------
/examples/multiview/README.txt:
--------------------------------------------------------------------------------
.. _multiview_examples:

Multi-view learning with Decision-trees
---------------------------------------

Examples demonstrating multi-view learning using random forest variants.

--------------------------------------------------------------------------------
/examples/calibration/README.txt:
--------------------------------------------------------------------------------
.. _calibration_examples:

Calibrated decision trees via honesty
-------------------------------------

Examples demonstrating the usage of honest decision trees to obtain calibrated predictions.

--------------------------------------------------------------------------------
/examples/quantile_predictions/README.txt:
--------------------------------------------------------------------------------
.. _quantile_examples:

Quantile Predictions with Random Forest
---------------------------------------

Examples demonstrating how to generate quantile predictions using Random Forest variants.

--------------------------------------------------------------------------------
/examples/sklearn_vs_treeple/README.txt:
--------------------------------------------------------------------------------
.. _sklearn_examples:

Comparing sklearn and treeple decision trees
--------------------------------------------

Examples demonstrating the difference between sklearn and treeple decision trees.

--------------------------------------------------------------------------------
/examples/sparse_oblique_trees/README.txt:
--------------------------------------------------------------------------------
.. _sporf_examples:

Sparse oblique projections with oblique decision-trees
------------------------------------------------------

Examples demonstrating learning using oblique random forests.

--------------------------------------------------------------------------------
/treeple/datasets/tests/meson.build:
--------------------------------------------------------------------------------
python_sources = [
  '__init__.py',
  'test_hyppo.py',
  'test_multiview.py',
]

py.install_sources(
  python_sources,
  pure: false,
  subdir: 'treeple/datasets/tests'
)

--------------------------------------------------------------------------------
/treeple/datasets/meson.build:
--------------------------------------------------------------------------------
python_sources = [
  '__init__.py',
  'multiview.py',
  'hyppo.py',
]

py.install_sources(
  python_sources,
  pure: false,
  subdir: 'treeple/datasets'
)

subdir('tests')

--------------------------------------------------------------------------------
/doc/_templates/autosummary/function.rst:
--------------------------------------------------------------------------------
{{ fullname | escape | underline}}

.. currentmodule:: {{ module }}

.. autofunction:: {{ objname }}

.. _sphx_glr_backreferences_{{ fullname }}:

.. minigallery:: {{ fullname }}
    :add-heading:

--------------------------------------------------------------------------------
/treeple/stats/meson.build:
--------------------------------------------------------------------------------
python_sources = [
  '__init__.py',
  'forest.py',
  'utils.py',
  'permuteforest.py',
  'baseline.py',
]

py.install_sources(
  python_sources,
  pure: false,
  subdir: 'treeple/stats'
)

subdir('tests')

--------------------------------------------------------------------------------
/.yamllint.yml:
--------------------------------------------------------------------------------
extends: default

ignore: |
  treeple/_lib/
  .asv/

rules:
  line-length: disable
  document-start: disable
  truthy: disable
  comments: disable
  braces:
    forbid: false
    min-spaces-inside: 0
    max-spaces-inside: 1

--------------------------------------------------------------------------------
/treeple/experimental/meson.build:
--------------------------------------------------------------------------------
python_sources = [
  '__init__.py',
  'mutual_info.py',
  'simulate.py',
  'sdf.py',
  'monte_carlo.py',
]

py.install_sources(
  python_sources,
  pure: false,
  subdir: 'treeple/experimental'
)

subdir('tests')

--------------------------------------------------------------------------------
/treeple/experimental/tests/meson.build:
--------------------------------------------------------------------------------
python_sources = [
  '__init__.py',
  'test_mutual_info.py',
  'test_simulate.py',
  'test_sdf.py',
  'test_monte_carlo.py',
]

py.install_sources(
  python_sources,
  pure: false,
  subdir: 'treeple/experimental/tests'
)

--------------------------------------------------------------------------------
/doc/use.rst:
--------------------------------------------------------------------------------
:orphan:

Examples using treeple
======================

To use treeple effectively, look through the examples here
to learn everything you need!

.. rstcheck: ignore-next-code-block
.. include:: auto_examples/index.rst
   :start-after: :orphan:

--------------------------------------------------------------------------------
/treeple/stats/tests/meson.build:
--------------------------------------------------------------------------------
python_sources = [
  '__init__.py',
  'test_forest.py',
  'test_baseline.py',
  'test_coleman.py',
  'test_utils.py',
  'test_permuteforest.py',
]

py.install_sources(
  python_sources,
  pure: false,
  subdir: 'treeple/stats/tests'
)

--------------------------------------------------------------------------------
/treeple/conftest.py:
--------------------------------------------------------------------------------
import pytest

# With the following global module marker,
# monitoring is disabled by default:
pytestmark = [pytest.mark.monitor_skip_test]


def pytest_configure(config):
    """Set up pytest markers."""
    config.addinivalue_line("markers", "slowtest: mark test as slow")

--------------------------------------------------------------------------------
/examples/treeple/README.txt:
--------------------------------------------------------------------------------
.. _treeple:

Treeple for Hypothesis Testing
------------------------------

Examples concerning how to use treeple as a hypothesis-testing tool.
Tutorials include estimating true statistics with true posterior functions,
using forests to calculate statistic estimates, and calculating p-values.

--------------------------------------------------------------------------------
/treeple/ensemble/meson.build:
--------------------------------------------------------------------------------
python_sources = [
  '__init__.py',
  '_supervised_forest.py',
  '_unsupervised_forest.py',
  '_honest_forest.py',
  '_eiforest.py',
  '_multiview.py',
  '_extensions.py',
]

py.install_sources(
  python_sources,
  pure: false,
  subdir: 'treeple/ensemble'
)

--------------------------------------------------------------------------------
/.github/label-globs.yml:
--------------------------------------------------------------------------------
Cython:
  - treeple/**/*.pyx.*
  - treeple/**/*.pxd.*
  - treeple/**/*.pxi.*

C/C++:
  - treeple/**/*.c
  - treeple/**/*.c.in
  - treeple/**/*.c.old
  - treeple/**/*.h
  - treeple/**/*.h.in
  - treeple/**/*.cpp
  - treeple/**/*.cc
  - treeple/**/*.cxx
  - treeple/**/*.hpp

--------------------------------------------------------------------------------
/doc/user_guide.rst:
--------------------------------------------------------------------------------
.. Places parent toc into the sidebar

:parenttoc: True

.. title:: User guide: contents

.. _user_guide:

==========
User Guide
==========

.. toctree::
   :numbered:
   :maxdepth: 3

   modules/supervised_tree
   modules/unsupervised_tree
   modules/ensemble

--------------------------------------------------------------------------------
/treeple/datasets/__init__.py:
--------------------------------------------------------------------------------
from .hyppo import (
    approximate_clf_mutual_information,
    approximate_clf_mutual_information_with_monte_carlo,
    make_marron_wand_classification,
    make_quadratic_classification,
    make_trunk_classification,
    make_trunk_mixture_classification,
)
from .multiview import make_gaussian_mixture, make_joint_factor_model

--------------------------------------------------------------------------------
/treeple/experimental/__init__.py:
--------------------------------------------------------------------------------
from . import mutual_info, sdf, simulate
from .monte_carlo import conditional_resample
from .mutual_info import (
    cmi_from_entropy,
    cmi_gaussian,
    entropy_gaussian,
    entropy_weibull,
    mi_from_entropy,
    mi_gamma,
    mi_gaussian,
    mutual_info_ksg,
)
from .sdf import StreamDecisionForest

--------------------------------------------------------------------------------
/treeple/tree/_sklearn_splitter.pxd:
--------------------------------------------------------------------------------
from .._lib.sklearn.utils._typedefs cimport float32_t, int32_t, intp_t

# This defines c-importable functions for other cython files

# TODO: remove these files when sklearn merges the refactor defining these in pxd files
# https://github.com/scikit-learn/scikit-learn/pull/25606
cdef void sort(float32_t* Xf, intp_t* samples, intp_t n) noexcept nogil

--------------------------------------------------------------------------------
/treeple/tests/meson.build:
--------------------------------------------------------------------------------
python_sources = [
  '__init__.py',
  'test_supervised_forest.py',
  'test_unsupervised_forest.py',
  'test_neighbors.py',
  'test_honest_forest.py',
  'test_eiforest.py',
  'test_multiview_forest.py',
  'test_extensions.py',
]

py.install_sources(
  python_sources,
  pure: false,
  subdir: 'treeple/tests'
)

--------------------------------------------------------------------------------
/treeple/tree/tests/meson.build:
--------------------------------------------------------------------------------
python_sources = [
  '__init__.py',
  'test_tree.py',
  'test_utils.py',
  'test_honest_tree.py',
  'test_honest_prune.py',
  'test_marginal.py',
  'test_all_trees.py',
  'test_unsupervised_tree.py',
  'test_multiview.py',
]

py.install_sources(
  python_sources,
  pure: false,
  subdir: 'treeple/tree/tests'
)

--------------------------------------------------------------------------------
/treeple/stats/__init__.py:
--------------------------------------------------------------------------------
from .baseline import build_cv_forest, build_permutation_forest
from .forest import build_coleman_forest, build_oob_forest
from .permuteforest import PermutationHonestForestClassifier

__all__ = [
    "build_cv_forest",
    "build_oob_forest",
    "build_coleman_forest",
    "build_permutation_forest",
    "PermutationHonestForestClassifier",
]

--------------------------------------------------------------------------------
/treeple/tree/_marginal.pxd:
--------------------------------------------------------------------------------
import numpy as np

cimport numpy as cnp

from .._lib.sklearn.tree._tree cimport BaseTree, Node
from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t, uint8_t, uint32_t


cpdef apply_marginal_tree(
    BaseTree tree,
    object X,
    const intp_t[:] marginal_indices,
    intp_t traversal_method,
    uint8_t use_sample_weight,
    object random_state
)

--------------------------------------------------------------------------------
/treeple/tree/kernels.py:
--------------------------------------------------------------------------------
import numpy as np


def gaussian_kernel(shape, sigma=1.0, mu=0.0):
    """N-dimensional Gaussian kernel for the given shape.
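
    Illustrative doctest (added for clarity; the expected values follow
    directly from the normalization in the implementation below):

    >>> kernel = gaussian_kernel((5, 5), sigma=1.0)
    >>> kernel.shape
    (5, 5)
    >>> float(round(kernel.sum(), 6))  # values are normalized to sum to 1
    1.0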

    See: https://gist.github.com/liob/e784775e882b83749cb3bbcef480576e
    """
    m = np.meshgrid(*[np.linspace(-1, 1, s) for s in shape])
    d = np.sqrt(np.sum([x * x for x in m], axis=0))
    g = np.exp(-((d - mu) ** 2 / (2.0 * sigma**2)))
    return g / np.sum(g)

--------------------------------------------------------------------------------
/spin:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#
# Example stub for running `python -m spin`
#
# Copy this into your project root.

import os
import runpy
import sys

sys.path.remove(os.path.abspath(os.path.dirname(sys.argv[0])))
try:
    runpy.run_module("spin", run_name="__main__")
except ImportError:
    print("Cannot import spin; please install it using")
    print()
    print("  pip install spin")
    print()
    sys.exit(1)

--------------------------------------------------------------------------------
/doc/_templates/autosummary/class.rst:
--------------------------------------------------------------------------------
..
    The empty line below should not be removed. It is added such that the `rst_prolog`
    is added before the :mod: directive. Otherwise, the rendering will show as a
    paragraph instead of a header.

:mod:`{{module}}`.{{objname}}
{{ underline }}==============

.. currentmodule:: {{ module }}

.. autoclass:: {{ objname }}

.. _sphx_glr_backreferences_{{ fullname }}:

.. raw:: html

18 | -------------------------------------------------------------------------------- /.github/workflows/pull_request_labeler.yml: -------------------------------------------------------------------------------- 1 | name: "Pull Request Labeler" 2 | on: 3 | pull_request_target: 4 | types: [created] 5 | 6 | permissions: 7 | contents: write # to add labels 8 | 9 | jobs: 10 | label_pull_request: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: thomasjpfan/labeler@v2.5.1 14 | continue-on-error: true 15 | if: github.repository == 'neurodata/treeple' 16 | with: 17 | repo-token: "${{ secrets.GITHUB_TOKEN }}" 18 | configuration-path: ".github/label-globs.yml" 19 | -------------------------------------------------------------------------------- /treeple/ensemble/__init__.py: -------------------------------------------------------------------------------- 1 | from ._eiforest import ExtendedIsolationForest 2 | from ._honest_forest import HonestForestClassifier 3 | from ._multiview import MultiViewRandomForestClassifier 4 | from ._supervised_forest import ( 5 | ExtraObliqueRandomForestClassifier, 6 | ExtraObliqueRandomForestRegressor, 7 | ObliqueRandomForestClassifier, 8 | ObliqueRandomForestRegressor, 9 | PatchObliqueRandomForestClassifier, 10 | PatchObliqueRandomForestRegressor, 11 | ) 12 | from ._unsupervised_forest import UnsupervisedObliqueRandomForest, UnsupervisedRandomForest 13 | -------------------------------------------------------------------------------- /.github/workflows/cffconvert.yml: -------------------------------------------------------------------------------- 1 | name: cffconvert 2 | 3 | on: 4 | push: 5 | paths: 6 | - CITATION.cff 7 | pull_request: 8 | paths: 9 | - CITATION.cff 10 | 11 | jobs: 12 | validate: 13 | name: "validate" 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Check out a copy of the repository 17 | uses: actions/checkout@v4 18 | 19 | - name: Check whether the citation metadata from CITATION.cff is valid 20 | uses: citation-file-format/cffconvert-github-action@2.0.0 21 | with: 22 | args: "--validate" 23 | -------------------------------------------------------------------------------- /doc/whats_new/changelog_legend.inc: -------------------------------------------------------------------------------- 1 | Legend for changelogs 2 | --------------------- 3 | 4 | - |MajorFeature|: something big that you couldn't do before. 5 | - |Feature|: something that you couldn't do before. 6 | - |Efficiency|: an existing feature now may not require as much computation or 7 | memory. 8 | - |Enhancement|: a miscellaneous minor improvement. 9 | - |Fix|: something that previously didn't work as documentated -- or according 10 | to reasonable expectations -- should now work. 11 | - |API|: you will need to change your code to have the same effect in the 12 | future; or a feature will be removed in the future. 13 | -------------------------------------------------------------------------------- /treeple/_build_utils/gcc_build_bitness.py: -------------------------------------------------------------------------------- 1 | #!python 2 | """ Detect bitness (32 or 64) of Mingw-w64 gcc build target on Windows. 
"""

import re
from subprocess import run


def main():
    res = run(["gcc", "-v"], check=True, text=True, capture_output=True)
    target = re.search(r"^Target: (.*)$", res.stderr, flags=re.M).groups()[0]
    if target.startswith("i686"):
        print("32")
    elif target.startswith("x86_64"):
        print("64")
    else:
        raise RuntimeError("Could not detect Mingw-w64 bitness")


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/treeple/tree/honesty/meson.build:
--------------------------------------------------------------------------------
tree_extension_metadata = {
  '_honest_prune':
    {'sources': ['_honest_prune.pyx'],
     'override_options': ['cython_language=cpp', 'optimization=3']},
}

foreach ext_name, ext_dict : tree_extension_metadata
  py.extension_module(
    ext_name,
    ext_dict.get('sources'),
    dependencies: [np_dep],
    override_options : ext_dict.get('override_options', []),
    c_args: c_args,
    cython_args: cython_c_args,
    subdir: 'treeple/tree/honesty',
    install: true,
  )
endforeach


py.install_sources(
  subdir: 'treeple/tree/honesty'  # Folder relative to site-packages to install to
)

--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: 'Feature request'
assignees: ''

---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.

**Additional context**
Add any other context or screenshots about the feature request here.

--------------------------------------------------------------------------------
/treeple/tree/manifold/meson.build:
--------------------------------------------------------------------------------
tree_extension_metadata = {
  '_morf_splitter':
    {'sources': ['_morf_splitter.pyx'],
     'override_options': ['cython_language=cpp', 'optimization=3']},
}

foreach ext_name, ext_dict : tree_extension_metadata
  py.extension_module(
    ext_name,
    ext_dict.get('sources'),
    dependencies: [np_dep],
    override_options : ext_dict.get('override_options', []),
    c_args: c_args,
    cython_args: cython_c_args,
    subdir: 'treeple/tree/manifold',
    install: true,
  )
endforeach


py.install_sources(
  subdir: 'treeple/tree/manifold'  # Folder relative to site-packages to install to
)

--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
[flake8]
max-line-length = 100

ignore =
    # these rules don't play well with black
    # whitespace before ':'
    E203
    # line break before binary operator
    W503
    E241,E305,W504,W605,E731
    E402

exclude =
    .git
    .github
    .venv
    .mypy_cache
    .pytest_cache
    .circleci
    paper
    doc/_build
    doc/generated
    doc/auto_examples
    validation
    build
    build-install
    dist
    treeple/_lib/
    .asv
    env

per-file-ignores =
    # __init__.py files are allowed to have unused imports
    */__init__.py:F401
    */**/__init__.py:F401

--------------------------------------------------------------------------------
/doc/whats_new.rst:
--------------------------------------------------------------------------------
:orphan:

.. _whats_new:

.. include:: whats_new/_contributors.rst

Release History
===============

Release notes for all treeple releases are linked on this page.

**Tip:** `Subscribe to treeple releases `__
on libraries.io to be notified when new versions are released.

.. toctree::
   :maxdepth: 1

   Version 0.1
   Version 0.2
   Version 0.3
   Version 0.4
   Version 0.5
   Version 0.6
   Version 0.7
   Version 0.8
   Version 0.9
   Version 0.10 (Unreleased)

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Distribution / packaging
.Python
dist/
*.egg*
build
build-install/
coverage
*.xml
.venv
.pymon
.coverage.*

commit.txt
treeple/_lib/sklearn/

*.png
_data

# Sphinx documentation
doc/_build/
doc/generated/
doc/auto_examples/
doc/auto_tutorials/
doc/modules/generated/
doc/sphinxext/cachedir
pip-log.txt
.coverage
tags
doc/coverages
doc/samples
cover
examples/*.jpg
examples/**/*.jpg

env/
html/
results/
scikit-learn/
benchmarks/cache/

# Pycharm
.idea/

*.pyc

*.so
*.cpp
*.c

.cache
.pytest_cache
.ipynb_checkpoints
.DS_Store
.vscode/

__pycache__

# Profiling
profiling/
*.prof

--------------------------------------------------------------------------------
/doc/make.bat:
--------------------------------------------------------------------------------
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build
set SPHINXPROJ=treeple

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%

:end
popd

--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
# These are supported funding model platforms

github: [adam2392, PSSF23, sampan501, SUKI-O]  # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
patreon:  # Replace with a single Patreon username
open_collective:  # Replace with a single Open Collective username
ko_fi:  # Replace with a single Ko-fi username
tidelift:  # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
community_bridge:  # Replace with a single Community Bridge project-name e.g., cloud-foundry
liberapay:  # Replace with a single Liberapay username
issuehunt:  # Replace with a single IssueHunt username
lfx_crowdfunding:  # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
polar:  # Replace with a single Polar username
buy_me_a_coffee: adam2392  # Replace with a single Buy Me a Coffee username
custom:  # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']

--------------------------------------------------------------------------------
/doc/_templates/layout.html:
--------------------------------------------------------------------------------
{%- extends "pydata_sphinx_theme/layout.html" %}

{% block fonts %}




{% endblock %}

{% block extrahead %}



{{ super() }}
{% endblock %}

--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------

#### Reference Issues/PRs


#### What does this implement/fix? Explain your changes.


#### Any other comments?

--------------------------------------------------------------------------------
/doc/_static/style.css:
--------------------------------------------------------------------------------
a[class^="sphx-glr-backref-module-scikit_tree"] {
    /* make all treeple backrefs bold */
    font-weight: 800;
}

/* Disable hyphenation in API reference table for Webkit-based browsers
   to work around alignment bug */
#api-documentation table p {
    -webkit-hyphens: none;
}

/* Hide version number from top-left location in the navbar */
.navbar-version {
    display: none;
}

html {
    font-size: 16px;
}

h1 {
    font-size: 1.6rem;
}

h2 {
    font-size: 1.3rem;
}

h3 {
    font-size: 1rem;
    font-weight: bold;
}

h4 {
    font-size: 1rem;
}

.footer {
    margin-top: 3em;
    padding-top: 1em;
}

/* Links in the Note boxes */
.note a {
    color: blue;
    text-decoration: underline;
}

.note a:hover {
    color: blue;
    font-weight: bold;
    text-decoration: underline;
}

/* Links in "Note" boxes */
.alert-info a code span {
    color: blue;
}

--------------------------------------------------------------------------------
/doc/whats_new/v0.10.rst:
--------------------------------------------------------------------------------
:orphan:

.. include:: _contributors.rst
.. currentmodule:: treeple

.. _current:

Version 0.10
============

**In Development**

Changelog
---------

- |Feature| Calculations involving nans in ``treeple.stats.utils`` now use the
  ``bottleneck`` library for faster computation. By `Ryan Hausen`_ (:pr:`#306`)
- |Feature| Added a sparse implementation of `treeple.stats.forest.build_coleman_forest`
  that uses the `scipy.sparse` module. By `Ryan Hausen`_ (:pr:`#317`)
- |Feature| :class:`treeple.tree.HonestTreeClassifier` now has a ``honest_method`` parameter
  that enables the user to turn on pruning of the tree, such that there are no
  empty leaf predictions. This brings the model closer to the implementation in GRF in R.
  By `Adam Li`_ (:pr:`#286`)


Code and Documentation Contributors
-----------------------------------

Thanks to everyone who has contributed to the maintenance and improvement of
the project since version inception, including:

* `Adam Li`_
* `Ryan Hausen`_

--------------------------------------------------------------------------------
/.github/workflows/circle_artifacts.yml:
--------------------------------------------------------------------------------
name: CircleCI artifacts redirector
on: [status]

# Restrict the permissions granted to the use of secrets.GITHUB_TOKEN in this
# github actions workflow:
# https://docs.github.com/en/actions/security-guides/automatic-token-authentication
permissions: read-all

jobs:
  circleci_artifacts_redirector_job:
    runs-on: ubuntu-20.04
    if: "github.repository == 'neurodata/treeple' && github.event.context == 'ci/circleci: build_docs'"
    permissions:
      statuses: write
    name: Run CircleCI artifacts redirector
    steps:
      - name: GitHub Action step
        # The final step references `steps.step1.outputs.url`, so this step
        # needs the matching id.
        id: step1
        uses: larsoner/circleci-artifacts-redirector-action@master
        with:
          repo-token: ${{ secrets.GITHUB_TOKEN }}
          api-token: ${{ secrets.CIRCLECI_TOKEN }}
          artifact-path: 0/dev/index.html
          circleci-jobs: build_docs
          job-title: Check the rendered docs here!

      - name: Check the URL
        if: github.event.status != 'pending'
        run: |
          curl --fail ${{ steps.step1.outputs.url }} | grep $GITHUB_SHA

--------------------------------------------------------------------------------
/treeple/tree/_utils.pxd:
--------------------------------------------------------------------------------
from libcpp.vector cimport vector

import numpy as np

cimport numpy as cnp

cnp.import_array()

from .._lib.sklearn.tree._splitter cimport SplitRecord
from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int32_t, intp_t, uint32_t

ctypedef fused vector_or_memview:
    vector[intp_t]
    intp_t[::1]
    intp_t[:]


cdef void fisher_yates_shuffle(
    vector_or_memview indices_to_sample,
    intp_t grid_size,
    uint32_t* random_state,
) noexcept nogil


cdef int rand_weighted_binary(
    float64_t p0,
    uint32_t* random_state
) noexcept nogil

cpdef unravel_index(
    intp_t index,
    cnp.ndarray[intp_t, ndim=1] shape
)

cpdef ravel_multi_index(
    intp_t[:] coords,
    const intp_t[:] shape
)

cdef void unravel_index_cython(
    intp_t index,
    const intp_t[:] shape,
    vector_or_memview coords
) noexcept nogil

cdef intp_t ravel_multi_index_cython(
    vector_or_memview coords,
    const intp_t[:] shape
) noexcept nogil

--------------------------------------------------------------------------------
/treeple/experimental/tests/test_simulate.py:
--------------------------------------------------------------------------------
from treeple.experimental.simulate import (
    simulate_helix,
    simulate_multivariate_gaussian,
    simulate_sphere,
)


# Test simulate_helix function
def test_simulate_helix():
    P, X, Y, Z = simulate_helix(n_samples=1000)
    assert len(P) == 1000
    assert len(X) == 1000
    assert len(Y) == 1000
    assert len(Z) == 1000

    # Add more specific tests if necessary


# Test simulate_sphere function
def test_simulate_sphere():
    latitude, longitude, Y1, Y2, Y3 = simulate_sphere(n_samples=1000)
    assert len(latitude) == 1000
    assert len(longitude) == 1000
    assert len(Y1) == 1000
    assert len(Y2) == 1000
    assert len(Y3) == 1000

    # Add more specific tests if necessary


# Test simulate_multivariate_gaussian function
def test_simulate_multivariate_gaussian():
    data, mean, cov = simulate_multivariate_gaussian(d=2, n_samples=1000)
    assert data.shape == (1000, 2)
    assert mean.shape == (2,)
    assert cov.shape == (2, 2)

    # Add more specific tests if necessary

--------------------------------------------------------------------------------
/doc/whats_new/v0.9.rst:
--------------------------------------------------------------------------------
:orphan:

.. include:: _contributors.rst
.. currentmodule:: treeple

.. _v0_9:

Version 0.9
===========

This release includes a rename of the package from ``scikit-tree`` to ``treeple``.
Users can replace their previous usage as follows:
``import sktree`` becomes ``import treeple``,
``from sktree import tree`` becomes ``from treeple import tree``, and
``from sktree import ...`` becomes ``from treeple import ...``.

Note that the previous version of the package will still be available under the name ``scikit-tree`` on PyPI.

Changelog
---------

- |API| Rename the package to ``treeple``. By `SUKI-O`_ (:pr:`#292`)
- |Fix| Fixed a bug in the predict_proba function of the :class:`treeple.HonestForestClassifier` where posteriors
  estimated on an empty leaf with the ``ignore`` prior would result in ``np.nan``
  values for all trees on that sample.
  By `Haoyin Xu`_ (:pr:`#291`)

Code and Documentation Contributors
-----------------------------------

Thanks to everyone who has contributed to the maintenance and improvement of
the project since version inception, including:

* `Adam Li`_
* `SUKI-O`_
* `Haoyin Xu`_

--------------------------------------------------------------------------------
/treeple/tree/unsupervised/meson.build:
--------------------------------------------------------------------------------
tree_extension_metadata = {
  '_unsup_criterion':
    {'sources': ['_unsup_criterion.pyx'],
     'override_options': ['cython_language=cpp', 'optimization=3']},
  '_unsup_splitter':
    {'sources': ['_unsup_splitter.pyx'],
     'override_options': ['cython_language=cpp', 'optimization=3']},
  '_unsup_tree':
    {'sources': ['_unsup_tree.pyx'],
     'override_options': ['cython_language=cpp', 'optimization=3']},
  '_unsup_oblique_splitter':
    {'sources': ['_unsup_oblique_splitter.pyx'],
     'override_options': ['cython_language=cpp', 'optimization=3']},
  '_unsup_oblique_tree':
    {'sources': ['_unsup_oblique_tree.pyx'],
     'override_options': ['cython_language=cpp', 'optimization=3']},
}

foreach ext_name, ext_dict : tree_extension_metadata
  py.extension_module(
    ext_name,
    ext_dict.get('sources'),
    dependencies: [np_dep],
    override_options : ext_dict.get('override_options', []),
    c_args: c_args,
    cython_args: cython_c_args,
    subdir: 'treeple/tree/unsupervised',
    install: true,
  )
endforeach


py.install_sources(
  subdir: 'treeple/tree/unsupervised'  # Folder relative to site-packages to install to
)
--------------------------------------------------------------------------------
/treeple/tree/__init__.py:
--------------------------------------------------------------------------------
from .._lib.sklearn.tree import (
    DecisionTreeClassifier,
    DecisionTreeRegressor,
    ExtraTreeClassifier,
    ExtraTreeRegressor,
)
from ._classes import (
    ExtraObliqueDecisionTreeClassifier,
    ExtraObliqueDecisionTreeRegressor,
    ObliqueDecisionTreeClassifier,
    ObliqueDecisionTreeRegressor,
    PatchObliqueDecisionTreeClassifier,
    PatchObliqueDecisionTreeRegressor,
    UnsupervisedDecisionTree,
    UnsupervisedObliqueDecisionTree,
)
from ._honest_tree import HonestTreeClassifier
from ._multiview import MultiViewDecisionTreeClassifier
from ._neighbors import compute_forest_similarity_matrix

__all__ = [
    "ExtraObliqueDecisionTreeClassifier",
    "ExtraObliqueDecisionTreeRegressor",
    "compute_forest_similarity_matrix",
    "UnsupervisedDecisionTree",
    "UnsupervisedObliqueDecisionTree",
    "ObliqueDecisionTreeClassifier",
    "ObliqueDecisionTreeRegressor",
    "PatchObliqueDecisionTreeClassifier",
    "PatchObliqueDecisionTreeRegressor",
    "HonestTreeClassifier",
    "DecisionTreeClassifier",
    "DecisionTreeRegressor",
    "ExtraTreeClassifier",
    "ExtraTreeRegressor",
    "MultiViewDecisionTreeClassifier",
]

--------------------------------------------------------------------------------
/.github/workflows/style.yml:
--------------------------------------------------------------------------------
name: "Style checks"

concurrency:
  group: ${{ github.workflow }}-${{ github.event.number }}-${{ github.event.type }}
  cancel-in-progress: true

on:
  pull_request:
    paths:
      - "**.py"
      - "**.pxd"
      - "**.pyx"
  push:
    branches: [main]
    paths:
      - "**.py"
    tags:
      - "v*.*.*"
  workflow_dispatch:

permissions:
  contents: read  # to fetch code (actions/checkout)

jobs:
  style:
    name: Formatting, lint, style, and type-checks
    timeout-minutes: 10
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Setup Python 3.11
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
          architecture: "x64"

      - name: Install packages for Ubuntu
        run: |
          sudo apt-get update
          sudo apt-get install -y libopenblas-dev libatlas-base-dev liblapack-dev

      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -r style_requirements.txt

      # check formatting of the code style
      - name: Check code formatting
        run: make pre-commit

--------------------------------------------------------------------------------
/doc/_static/versions.json:
--------------------------------------------------------------------------------
[
  {
    "name": "0.10",
    "version": "dev",
    "url": "https://docs.neurodata.io/treeple/dev/"
  },
  {
    "name": "0.9",
    "version": "0.9",
    "url": "https://docs.neurodata.io/treeple/v0.9/"
  },
  {
    "name": "0.8",
    "version": "0.8",
    "url": "https://docs.neurodata.io/treeple/v0.8/"
  },
  {
    "name": "0.7",
    "version": "0.7",
    "url": "https://docs.neurodata.io/treeple/v0.7/"
  },
  {
    "name": "0.6",
    "version": "0.6",
    "url": "https://docs.neurodata.io/treeple/v0.6/"
  },
  {
"name": "0.5", 29 | "version": "0.5", 30 | "url": "https://docs.neurodata.io/treeple/v0.5/" 31 | }, 32 | { 33 | "name": "0.4", 34 | "version": "0.4", 35 | "url": "https://docs.neurodata.io/treeple/v0.4/" 36 | }, 37 | { 38 | "name": "0.3", 39 | "version": "0.3", 40 | "url": "https://docs.neurodata.io/treeple/v0.3/" 41 | }, 42 | { 43 | "name": "0.2", 44 | "version": "0.2", 45 | "url": "https://docs.neurodata.io/treeple/v0.2/" 46 | }, 47 | { 48 | "name": "0.1", 49 | "version": "0.1", 50 | "url": "https://docs.neurodata.io/treeple/v0.1/" 51 | } 52 | ] 53 | -------------------------------------------------------------------------------- /benchmarks/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import balanced_accuracy_score, r2_score 3 | 4 | 5 | def neg_mean_inertia(X, labels, centers): 6 | return -(np.asarray(X - centers[labels]) ** 2).sum(axis=1).mean() 7 | 8 | 9 | def make_gen_classif_scorers(caller): 10 | caller.train_scorer = balanced_accuracy_score 11 | caller.test_scorer = balanced_accuracy_score 12 | 13 | 14 | def make_gen_reg_scorers(caller): 15 | caller.test_scorer = r2_score 16 | caller.train_scorer = r2_score 17 | 18 | 19 | def neg_mean_data_error(X, U, V): 20 | return -np.sqrt(((X - U.dot(V)) ** 2).mean()) 21 | 22 | 23 | def make_dict_learning_scorers(caller): 24 | caller.train_scorer = lambda _, __: ( 25 | neg_mean_data_error( 26 | caller.X, caller.estimator.transform(caller.X), caller.estimator.components_ 27 | ) 28 | ) 29 | caller.test_scorer = lambda _, __: ( 30 | neg_mean_data_error( 31 | caller.X_val, 32 | caller.estimator.transform(caller.X_val), 33 | caller.estimator.components_, 34 | ) 35 | ) 36 | 37 | 38 | def explained_variance_ratio(Xt, X): 39 | return np.var(Xt, axis=0).sum() / np.var(X, axis=0).sum() 40 | 41 | 42 | def make_pca_scorers(caller): 43 | caller.train_scorer = lambda _, __: caller.estimator.explained_variance_ratio_.sum() 44 | caller.test_scorer = lambda _, __: ( 45 | explained_variance_ratio(caller.estimator.transform(caller.X_val), caller.X_val) 46 | ) 47 | -------------------------------------------------------------------------------- /treeple/tree/meson.build: -------------------------------------------------------------------------------- 1 | tree_extension_metadata = { 2 | '_sklearn_splitter': 3 | {'sources': ['_sklearn_splitter.pyx'], 4 | 'override_options': ['cython_language=cpp', 'optimization=3']}, 5 | '_oblique_splitter': 6 | {'sources': ['_oblique_splitter.pyx'], 7 | 'override_options': ['cython_language=cpp', 'optimization=3']}, 8 | '_oblique_tree': 9 | {'sources': ['_oblique_tree.pyx'], 10 | 'override_options': ['cython_language=cpp', 'optimization=3']}, 11 | '_utils': 12 | {'sources': ['_utils.pyx'], 13 | 'override_options': ['cython_language=cpp', 'optimization=3']}, 14 | '_marginal': 15 | {'sources': ['_marginal.pyx'], 16 | 'override_options': ['cython_language=cpp', 'optimization=3']}, 17 | } 18 | 19 | foreach ext_name, ext_dict : tree_extension_metadata 20 | py.extension_module( 21 | ext_name, 22 | ext_dict.get('sources'), 23 | dependencies: [np_dep], 24 | override_options : ext_dict.get('override_options', []), 25 | c_args: c_args, 26 | cython_args: cython_c_args, 27 | subdir: 'treeple/tree', 28 | install: true, 29 | ) 30 | endforeach 31 | 32 | python_sources = [ 33 | '__init__.py', 34 | '_classes.py', 35 | '_multiview.py', 36 | '_neighbors.py', 37 | '_honest_tree.py', 38 | '_marginalize.py', 39 | ] 40 | 41 | py.install_sources( 42 | 
  python_sources,
  subdir: 'treeple/tree'  # Folder relative to site-packages to install to
)

subdir('tests')
subdir('unsupervised')
subdir('manifold')
subdir('honesty')

--------------------------------------------------------------------------------
/treeple/tree/unsupervised/_unsup_oblique_tree.pxd:
--------------------------------------------------------------------------------
# distutils: language = c++

# Authors: Adam Li
#
# License: BSD 3 clause

# See _unsup_oblique_tree.pyx for details.

import numpy as np

cimport numpy as cnp
from libcpp.vector cimport vector

from ..._lib.sklearn.tree._splitter cimport SplitRecord
from ..._lib.sklearn.tree._tree cimport Node
from ..._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t
from .._oblique_splitter cimport ObliqueSplitRecord
from ._unsup_tree cimport UnsupervisedTree


cdef class UnsupervisedObliqueTree(UnsupervisedTree):
    cdef vector[vector[float32_t]] proj_vec_weights  # (capacity, n_features) array of projection vectors
    cdef vector[vector[intp_t]] proj_vec_indices  # (capacity, n_features) array of projection vectors

    # overridden methods
    cdef int _resize_c(
        self,
        intp_t capacity=*
    ) except -1 nogil
    cdef int _set_split_node(
        self,
        SplitRecord* split_node,
        Node *node,
        intp_t node_id,
    ) except -1 nogil
    cdef float32_t _compute_feature(
        self,
        const float32_t[:, :] X_ndarray,
        intp_t sample_index,
        Node *node
    ) noexcept nogil
    cdef void _compute_feature_importances(
        self,
        float64_t[:] importances,
        Node* node
    ) noexcept nogil

    cpdef cnp.ndarray get_projection_matrix(self)

--------------------------------------------------------------------------------
/treeple/tree/_oblique_tree.pxd:
--------------------------------------------------------------------------------
# distutils: language = c++

# Authors: Adam Li
#          Chester Huynh
#          Parth Vora
#
# License: BSD 3 clause

# See _oblique_tree.pyx for details.
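#
# Descriptive note (inferred from the declarations below): ObliqueTree extends
# scikit-learn's ``Tree`` with per-node projection vectors
# (``proj_vec_weights`` / ``proj_vec_indices``), so a split can test a linear
# combination of input features rather than a single feature.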
10 | 11 | import numpy as np 12 | 13 | cimport numpy as cnp 14 | from libcpp.vector cimport vector 15 | 16 | from .._lib.sklearn.tree._splitter cimport SplitRecord 17 | from .._lib.sklearn.tree._tree cimport Node, Tree, TreeBuilder 18 | from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t 19 | from ._oblique_splitter cimport ObliqueSplitRecord 20 | 21 | 22 | cdef class ObliqueTree(Tree): 23 | cdef vector[vector[float32_t]] proj_vec_weights # (capacity, n_features) array of projection vectors 24 | cdef vector[vector[intp_t]] proj_vec_indices # (capacity, n_features) array of projection vectors 25 | 26 | # overridden methods 27 | cdef int _resize_c( 28 | self, 29 | intp_t capacity=* 30 | ) except -1 nogil 31 | cdef int _set_split_node( 32 | self, 33 | SplitRecord* split_node, 34 | Node *node, 35 | intp_t node_id 36 | ) except -1 nogil 37 | cdef float32_t _compute_feature( 38 | self, 39 | const float32_t[:, :] X_ndarray, 40 | intp_t sample_index, 41 | Node *node 42 | ) noexcept nogil 43 | cdef void _compute_feature_importances( 44 | self, 45 | float64_t[:] importances, 46 | Node* node 47 | ) noexcept nogil 48 | 49 | cpdef cnp.ndarray get_projection_matrix(self) 50 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | # YAML 1.2 2 | --- 3 | # Metadata for citation of this software according to the CFF format (https://citation-file-format.github.io/) 4 | cff-version: 1.2.0 5 | title: "treeple: Modern decision-trees compatible with scikit-learn in Python." 6 | abstract: "treeple is a scikit-learn compatible API for building state-of-the-art decision trees. These include unsupervised trees, oblique trees, uncertainty trees, quantile trees and causal trees." 7 | authors: 8 | - given-names: Adam 9 | family-names: Li 10 | affiliation: "Department of Computer Science, Columbia University, New York, NY, USA" 11 | orcid: "https://orcid.org/0000-0001-8421-365X" 12 | - given-names: Sambit 13 | family-names: Panda 14 | affiliation: "Department of Biomedical Engineering, Johns Hopkins University, Baltimore, MD, USA" 15 | orcid: "https://orcid.org/0000-0001-8455-4243" 16 | - given-names: Haoyin 17 | family-names: Xu 18 | affiliation: "Department of Biomedical Engineering, Johns Hopkins University, Baltimore, MD, USA" 19 | orcid: "https://orcid.org/0000-0001-8235-4950" 20 | - given-names: Itsuki 21 | family-names: Ogihara 22 | affiliation: "Department of Biomedical Engineering, Johns Hopkins University, Baltimore, MD, USA" 23 | type: software 24 | repository-code: "https://github.com/neurodata/treeple" 25 | license: 'PolyForm-Noncommercial-1.0.0' 26 | keywords: 27 | - random forest 28 | - oblique trees 29 | - honest forests 30 | - statisical learning 31 | - machine learning 32 | message: >- 33 | Please cite this software using the metadata from 34 | 'preferred-citation' in the CITATION.cff file. 35 | -------------------------------------------------------------------------------- /doc/whats_new/v0.5.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | .. include:: _contributors.rst 4 | .. currentmodule:: treeple 5 | 6 | .. _v0_5: 7 | 8 | Version 0.5 9 | =========== 10 | 11 | This release includes a number of enhancements and bug fixes, mainly 12 | to the :class:`treeple.tree.MultiViewDecisionTreeClassifier`. 
Most notably, 13 | the ``max_features`` argument now supports an array of values, which 14 | applies a different ``max_features`` argument per feature view. 15 | 16 | Changelog 17 | --------- 18 | 19 | - |Enhancement| :class:`treeple.tree.MultiViewDecisionTreeClassifier` now 20 | rounds up the number of features to split on to the nearest integer when 21 | applying ``max_features`` to each feature view, by `Adam Li`_ (:pr:`#183`). 22 | - |Feature| :class:`treeple.tree.MultiViewDecisionTreeClassifier` now 23 | supports an array passed in for ``max_features``, which applies a different 24 | max_features argument per view, by `Adam Li`_ (:pr:`#183`). 25 | - |Fix| :class:`treeple.tree.MultiViewDecisionTreeClassifier` now correctly 26 | handles the case where there is one feature view that is exhausted, and 27 | another that is not for ``apply_max_features_per_feature_set = False``, 28 | by `Adam Li`_ (:pr:`#183`). 29 | - |Fix| ``treeple.stats.FeatureImportanceForestClassifier`` now correctly passes 30 | metric kwargs to the null distribution function, by `Adam Li`_ (:pr:`#183`). 31 | 32 | Code and Documentation Contributors 33 | ----------------------------------- 34 | 35 | Thanks to everyone who has contributed to the maintenance and improvement of 36 | the project since version inception, including: 37 | 38 | * `Adam Li`_ 39 | 40 | -------------------------------------------------------------------------------- /doc/whats_new/_contributors.rst: -------------------------------------------------------------------------------- 1 | 2 | .. 3 | This file maps contributor names to their URLs. It should mostly be used 4 | for core contributors, and occasionally for contributors who do not want 5 | their github page to be their URL target. Historically it was used to 6 | hyperlink all contributors' names, and ``:user:`` should now be preferred. 7 | It also defines other ReST substitutions. 8 | 9 | .. role:: raw-html(raw) 10 | :format: html 11 | 12 | .. role:: raw-latex(raw) 13 | :format: latex 14 | 15 | .. |MajorFeature| replace:: :raw-html:`Major Feature` :raw-latex:`{\small\sc [Major Feature]}` 16 | .. |Feature| replace:: :raw-html:`Feature` :raw-latex:`{\small\sc [Feature]}` 17 | .. |Efficiency| replace:: :raw-html:`Efficiency` :raw-latex:`{\small\sc [Efficiency]}` 18 | .. |Enhancement| replace:: :raw-html:`Enhancement` :raw-latex:`{\small\sc [Enhancement]}` 19 | .. |Fix| replace:: :raw-html:`Fix` :raw-latex:`{\small\sc [Fix]}` 20 | .. |API| replace:: :raw-html:`API Change` :raw-latex:`{\small\sc [API Change]}` 21 | 22 | 23 | .. _Adam Li: https://adam2392.github.io 24 | .. _Jong Shin: https://github.com/jshinm 25 | .. _Sambit Panda: https://sampan.me 26 | .. _SUKI-O : https://github.com/SUKI-O 27 | .. _Ronan Perry : https://rflperry.github.io/ 28 | .. _Haoyin Xu : https://github.com/PSSF23 29 | .. _Yuxin Bai : https://github.com/YuxinB 30 | .. _Ryan Hausen : https://ryanhausen.github.io 31 | -------------------------------------------------------------------------------- /benchmarks/config.json: -------------------------------------------------------------------------------- 1 | { 2 | // "regular": Benchmarks are run on small to medium datasets. Each benchmark 3 | // is run multiple times and averaged. 4 | // "fast": Benchmarks are run on small to medium datasets. Each benchmark 5 | // is run only once. May provide unstable benchmarks. 6 | // "large_scale": Benchmarks are run on large datasets. Each benchmark is 7 | // run multiple times and averaged.
This profile is meant to 8 | // benchmark scalability and will take hours on single core. 9 | // Can be overridden by environment variable SKLBENCH_PROFILE. 10 | "profile": "regular", 11 | 12 | // List of values of n_jobs to use for estimators which accept this 13 | // parameter (-1 means all cores). An empty list means all values from 1 to 14 | // the maximum number of available cores. 15 | // Can be overridden by environment variable SKLBENCH_NJOBS. 16 | "n_jobs_vals": [1], 17 | 18 | // If true, fitted estimators are saved in ./cache/estimators/ 19 | // Can be overridden by environment variable SKLBENCH_SAVE_ESTIMATORS. 20 | "save_estimators": false, 21 | 22 | // Commit hash to compare estimator predictions with. 23 | // If null, predictions are not compared. 24 | // Can be overridden by environment variable SKLBENCH_BASE_COMMIT. 25 | "base_commit": null, 26 | 27 | // If false, the predict (resp. transform) method of the estimators won't 28 | // be benchmarked. 29 | // Can be overridden by environment variables SKLBENCH_PREDICT and 30 | // SKLBENCH_TRANSFORM. 31 | "bench_predict": true, 32 | "bench_transform": true 33 | } -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: "Release to PyPI" 2 | 3 | concurrency: 4 | group: ${{ github.workflow }}-${{ github.event.number }}-${{ github.event.type }} 5 | cancel-in-progress: true 6 | 7 | on: 8 | release: 9 | types: [published] 10 | workflow_run: 11 | workflows: [Build_Wheels] 12 | branches: [main] 13 | types: [completed] # This ensures it triggers only after the workflow completes 14 | workflow_dispatch: 15 | 16 | permissions: 17 | contents: read 18 | 19 | jobs: 20 | pypi: 21 | runs-on: ubuntu-latest 22 | if: github.event_name == 'release' 23 | permissions: 24 | id-token: write 25 | steps: 26 | - name: Get run ID of "Build_Wheels" workflow 27 | id: get-run-id 28 | run: | 29 | OTHER_REPO="${{ github.repository }}" 30 | WF_NAME="Build_Wheels" 31 | RUN_ID=`gh run --repo ${OTHER_REPO} list --workflow ${WF_NAME} --json databaseId --jq .[0].databaseId` 32 | echo "Detected latest run id of ${RUN_ID} for workflow ${WF_NAME}" 33 | echo "run-id=${RUN_ID}" >> "$GITHUB_OUTPUT" 34 | env: 35 | GH_TOKEN: ${{ github.token }} 36 | 37 | - name: Download artifact from "Build_Wheels" workflow 38 | uses: actions/download-artifact@v4 39 | with: 40 | name: dist # Match name used in build_wheels.yml upload artifact step 41 | path: dist 42 | github-token: ${{ github.token }} 43 | repository: ${{ github.repository }} 44 | run-id: ${{ steps.get-run-id.outputs.run-id }} 45 | 46 | - name: Show downloaded files 47 | run: ls -la 48 | 49 | - name: Publish to PyPI 50 | uses: pypa/gh-action-pypi-publish@release/v1 51 | -------------------------------------------------------------------------------- /doc/whats_new/v0.3.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | .. include:: _contributors.rst 4 | .. currentmodule:: treeple 5 | 6 | .. _v0_3: 7 | 8 | Version 0.3 9 | =========== 10 | 11 | This release includes a number of bug fixes and enhancements related to hypothesis testing with decision trees. 12 | Moreover, we have added an experimental multi-view decision tree / random forest, which considers multiple views 13 | of the data when building trees. 
The documentation page has also undergone an organizational overhaul, 14 | making it easier for users to find examples related to specific use cases. 15 | 16 | Changelog 17 | --------- 18 | - |Fix| Fixes a bug in consistency of train/test samples when ``random_state`` is not set in FeatureImportanceForestClassifier and FeatureImportanceForestRegressor, by `Adam Li`_ (:pr:`135`) 19 | - |Fix| Fixes a bug where covariate indices were not shuffled by default when running FeatureImportanceForestClassifier and FeatureImportanceForestRegressor test methods, by `Sambit Panda`_ (:pr:`140`) 20 | - |Enhancement| Add multi-view splitter for axis-aligned decision trees, by `Adam Li`_ (:pr:`129`) 21 | - |Enhancement| Add stratified sampling option to ``FeatureImportance*`` via the ``stratify`` keyword argument, by `Yuxin Bai`_ (:pr:`143`) 22 | - |Fix| Fixed usage of ``feature_importances_`` property in ``HonestForestClassifier``, by `Adam Li`_ (:pr:`156`) 23 | - |Fix| Fixed ``HonestForestClassifier`` to allow decision-trees from sklearn, albeit with a limited API, by `Adam Li`_ (:pr:`158`) 24 | 25 | Code and Documentation Contributors 26 | ----------------------------------- 27 | 28 | Thanks to everyone who has contributed to the maintenance and improvement of 29 | the project since version inception, including: 30 | 31 | * `Adam Li`_ 32 | * `Sambit Panda`_ 33 | * `Yuxin Bai`_ 34 | -------------------------------------------------------------------------------- /treeple/experimental/tests/test_mutual_info.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def nonlinear_gaussian_with_additive_noise(): 5 | """Nonlinear no-noise function with additive Gaussian noise. 6 | 7 | See: https://github.com/BiuBiuBiLL/NPEET_LNC/issues/4 8 | """ 9 | # first simulate multivariate Gaussian without noise 10 | 11 | # then add the noise 12 | 13 | # compute MI by computing the H(Y|X) and H(X) 14 | # H(Y|X) = np.log(noise_std) 15 | # H(X) = kNN K-L estimate with large # of samples 16 | pass 17 | 18 | 19 | def main(): 20 | d1 = [1, 1, 0] 21 | d2 = [1, 0, 1] 22 | d3 = [0, 1, 1] 23 | mat = [d1, d2, d3] 24 | tmat = np.transpose(mat) 25 | diag = [[3, 0, 0], [0, 1, 0], [0, 0, 1]] 26 | # mean = np.array([0, 0, 0]) 27 | cov = np.dot(tmat, np.dot(diag, mat)) 28 | print("covariance matrix") 29 | print(cov) 30 | print(tmat) 31 | 32 | 33 | def test_mi(): 34 | d1 = [1, 1, 0] 35 | d2 = [1, 0, 1] 36 | d3 = [0, 1, 1] 37 | mat = [d1, d2, d3] 38 | tmat = np.transpose(mat) 39 | diag = [[3, 0, 0], [0, 1, 0], [0, 0, 1]] 40 | # mean = np.array([0, 0, 0]) 41 | cov = np.dot(tmat, np.dot(diag, mat)) 42 | print("covariance matrix") 43 | print(cov) 44 | # true CMI I(x;y|z) = H(xz) + H(yz) - H(xyz) - H(z) for a joint Gaussian 45 | trueent = -0.5 * (3 + np.log(8.0 * np.pi * np.pi * np.pi * np.linalg.det(cov))) 46 | trueent += -0.5 * (1 + np.log(2.0 * np.pi * cov[2][2]))  # z sub 47 | trueent += 0.5 * ( 48 | 2 49 | + np.log( 50 | 4.0 * np.pi * np.pi * np.linalg.det([[cov[0][0], cov[0][2]], [cov[2][0], cov[2][2]]]) 51 | ) 52 | )  # xz sub 53 | trueent += 0.5 * ( 54 | 2 55 | + np.log( 56 | 4.0 * np.pi * np.pi * np.linalg.det([[cov[1][1], cov[1][2]], [cov[2][1], cov[2][2]]]) 57 | ) 58 | )  # yz sub 59 | print("true CMI(x:y|z)", trueent / np.log(2)) 60 | -------------------------------------------------------------------------------- /doc/whats_new/v0.8.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | .. include:: _contributors.rst 4 | .. currentmodule:: treeple 5 | 6 | ..
_v0_8: 7 | 8 | Version 0.8 9 | =========== 10 | 11 | This release fixes a major bug with (CO)MIGHT, where low sample sizes produced biased tree 12 | posteriors; the fix stratifies the sampling of the dataset to ensure that each class 13 | is represented in the bootstrap sample. Additionally, the release includes a number of bug fixes 14 | and improvements to the codebase. 15 | 16 | Changelog 17 | --------- 18 | 19 | - |Fix| Previously, missing values in the ``X`` input array for treeple estimators 20 | did not raise an error and were silently accepted, assuming the missing values were 21 | encoded as infinity values. This is now fixed, and the estimators will raise a 22 | ValueError if missing values are encountered in the ``X`` input array. 23 | By `Adam Li`_ (:pr:`#264`) 24 | - |Feature| Simulations in ``treeple.datasets.hyppo`` now throw a warning instead 25 | of an error when the number of samples is less than the number of dimensions. 26 | By `Sambit Panda`_ (:pr:`#279`) 27 | - |API| :class:`treeple.HonestForestClassifier` now has ``bootstrap=True`` as the default 28 | argument. By `Adam Li`_ (:pr:`#274`) 29 | - |API| Removed all instances of ``FeatureImportanceForestClassifier`` and outdated 30 | MIGHT code. By `Adam Li`_ (:pr:`#274`) 31 | - |Fix| Fixed a bug in the ``treeple.HonestForestClassifier`` where posteriors 32 | estimated on oob samples were biased when there was a low number of samples 33 | due to imbalance in the classes when ``bootstrap=True``. 34 | By `Adam Li`_ (:pr:`#283`) 35 | 36 | Code and Documentation Contributors 37 | ----------------------------------- 38 | 39 | Thanks to everyone who has contributed to the maintenance and improvement of 40 | the project since version inception, including: 41 | 42 | * `Adam Li`_ 43 | * `Sambit Panda`_ 44 | -------------------------------------------------------------------------------- /doc/sphinxext/doi_role.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | doilinks 4 | ~~~~~~~~ 5 | Extension to add links to DOIs. With this extension you can use e.g. 6 | :doi:`10.1016/S0022-2836(05)80360-2` in your documents. This will 7 | create a link to a DOI resolver 8 | (``https://doi.org/10.1016/S0022-2836(05)80360-2``). 9 | The link caption will be the raw DOI. 10 | You can also give an explicit caption, e.g. 11 | :doi:`Basic local alignment search tool <10.1016/S0022-2836(05)80360-2>`. 12 | 13 | :copyright: Copyright 2015 Jon Lund Steffensen. Based on extlinks by 14 | the Sphinx team. 15 | :license: BSD.
16 | """ 17 | 18 | from docutils import nodes, utils 19 | from sphinx.util.nodes import split_explicit_title 20 | 21 | 22 | def reference_role(typ, rawtext, text, lineno, inliner, options={}, content=[]): 23 | text = utils.unescape(text) 24 | has_explicit_title, title, part = split_explicit_title(text) 25 | if typ in ["arXiv", "arxiv"]: 26 | full_url = "https://arxiv.org/abs/" + part 27 | if not has_explicit_title: 28 | title = "arXiv:" + part 29 | pnode = nodes.reference(title, title, internal=False, refuri=full_url) 30 | return [pnode], [] 31 | if typ in ["doi", "DOI"]: 32 | full_url = "https://doi.org/" + part 33 | if not has_explicit_title: 34 | title = "DOI:" + part 35 | pnode = nodes.reference(title, title, internal=False, refuri=full_url) 36 | return [pnode], [] 37 | 38 | 39 | def setup_link_role(app): 40 | app.add_role("arxiv", reference_role, override=True) 41 | app.add_role("arXiv", reference_role, override=True) 42 | app.add_role("doi", reference_role, override=True) 43 | app.add_role("DOI", reference_role, override=True) 44 | 45 | 46 | def setup(app): 47 | app.connect("builder-inited", setup_link_role) 48 | return {"version": "0.1", "parallel_read_safe": True} 49 | -------------------------------------------------------------------------------- /doc/whats_new/v0.4.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | .. include:: _contributors.rst 4 | .. currentmodule:: treeple 5 | 6 | .. _v0_4: 7 | 8 | Version 0.4 9 | =========== 10 | 11 | This version patches some issues with the ``FeatureImportance*`` classes and also adds a feature to the 12 | `MultiViewDecisionTreeClassifier` class that allows one to scale the number of split candidates sampled per feature-set 13 | equally. 14 | 15 | Changelog 16 | --------- 17 | 18 | - |API| ``FeatureImportanceForest*`` now has a hyperparameter, ``permute_per_forest_fraction``, to control the number of permutations done per forest, by `Adam Li`_ (:pr:`145`) 19 | - |Enhancement| Add dataset generators for regression and classification and hypothesis testing, by `Adam Li`_ (:pr:`169`) 20 | - |Fix| Fixes a bug where ``FeatureImportanceForest*`` was unable to be run when calling ``statistic`` with ``covariate_index`` defined for MI, AUC metrics, by `Adam Li`_ (:pr:`164`) 21 | - |Enhancement| Add :func:`treeple.experimental.conditional_resample`, which allows conditional resampling of rows based on nearest-neighbors defined via a feature set, by `Adam Li`_ (:pr:`170`) 22 | - |Enhancement| Multi-view trees are now able to scale the sampling of split candidates at the same rate per feature-set, which means 'sqrt' would sample split candidates equal to the square root of each feature-set size, by `Adam Li`_ (:pr:`152`) 23 | - |Fix| Fixes a bug in :class:`treeple.tree.MultiViewDecisionTreeClassifier` where the max_features argument applied over 24 | more than two views with ``apply_max_features_per_set`` set to ``True`` resulted in an incorrect and oversampled 25 | number of max_features in the views after the first two, by `Adam Li`_ (:pr:`172`) 26 | 27 | Code and Documentation Contributors 28 | ----------------------------------- 29 | 30 | Thanks to everyone who has contributed to the maintenance and improvement of 31 | the project since version inception, including: 32 | 33 | * `Adam Li`_ 34 | 35 | -------------------------------------------------------------------------------- /doc/whats_new/v0.2.rst: -------------------------------------------------------------------------------- 1 |
:orphan: 2 | 3 | .. include:: _contributors.rst 4 | .. currentmodule:: treeple 5 | 6 | .. _v0_2: 7 | 8 | Version 0.2 9 | =========== 10 | 11 | This is a major release, with many new features and improvements. 12 | For instance, we have added a new implementation of the extended isolation forest, and 13 | enabled all decision trees to take advantage of ``partial_fit``, meaning trees have streaming 14 | capabilities. Moreover, we have added an analogous implementation of extra-trees for oblique-trees. 15 | Finally, this release includes a highly experimental feature for multivariate high-dimensional 16 | hypothesis testing using permutation forests and a feature importance testing forest. 17 | 18 | Changelog 19 | --------- 20 | - |Efficiency| Upgraded build process to rely on Cython 3.0+, by `Adam Li`_ (:pr:`109`) 21 | - |Feature| Allow decision trees to take advantage of ``partial_fit`` and ``monotonic_cst`` when available, by `Adam Li`_ (:pr:`109`) 22 | - |Feature| Implementation of ExtraObliqueDecisionTreeClassifier, ExtraObliqueDecisionTreeRegressor by `SUKI-O`_ (:pr:`75`) 23 | - |Efficiency| Around 1.5-2x speed improvement for unsupervised forests, by `Adam Li`_ (:pr:`114`) 24 | - |API| Allow ``sqrt`` and ``log2`` keywords to be used for ``min_samples_split`` parameter in unsupervised forests, by `Adam Li`_ (:pr:`114`) 25 | - |Feature| Implement extended isolation forest, by `Adam Li`_ (:pr:`101`) 26 | - |Feature| Implementation of StreamDecisionForest, by `Haoyin Xu`_ and `Adam Li`_ (:pr:`116`) 27 | - |Feature| Implementation of Permutation forests and a feature importance testing forest, by `Haoyin Xu`_, `Adam Li`_, `Sambit Panda`_ (:pr:`125`) 28 | 29 | Code and Documentation Contributors 30 | ----------------------------------- 31 | 32 | Thanks to everyone who has contributed to the maintenance and improvement of 33 | the project since version inception, including: 34 | 35 | * `Adam Li`_ 36 | * `SUKI-O`_ 37 | * `Haoyin Xu`_ 38 | * `Sambit Panda`_ 39 | -------------------------------------------------------------------------------- /treeple/tree/unsupervised/_unsup_splitter.pxd: -------------------------------------------------------------------------------- 1 | from ..._lib.sklearn.tree._splitter cimport BaseSplitter, SplitRecord 2 | from ..._lib.sklearn.tree._tree cimport ParentInfo 3 | from ..._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t, uint32_t 4 | from ._unsup_criterion cimport UnsupervisedCriterion 5 | 6 | 7 | cdef class UnsupervisedSplitter(BaseSplitter): 8 | """ 9 | Notable changes wrt scikit-learn: 10 | 1. `weighted_n_node_samples` is used as a stopping criterion and just used to 11 | keep count of the "number of samples (weighted)". All samples have a default weight 12 | of '1'. 13 | 2. `X` array instead of `y` array is stored as the criterions are computed over the X 14 | array. 15 | 3. The feature_values memoryview is a feature vector with shared memory among the splitter 16 | and the criterion object. This enables the splitter to assign values to it within the 17 | `node_split` function, and then `criterion` can automatically compute relevant statistics 18 | on the shared memoryview.
19 | """ 20 | 21 | # XXX: requires BaseSplitter to not define "criterion" 22 | cdef public UnsupervisedCriterion criterion # criterion computer 23 | cdef const float32_t[:, :] X # feature matrix 24 | cdef intp_t n_total_samples # store the total number of samples 25 | 26 | # Initialization method for unsupervised splitters 27 | cdef int init( 28 | self, 29 | const float32_t[:, :] X, 30 | const float64_t[:] sample_weight 31 | ) except -1 32 | 33 | # Overridden Methods from base class 34 | cdef int node_reset( 35 | self, 36 | intp_t start, 37 | intp_t end, 38 | float64_t* weighted_n_node_samples 39 | ) except -1 nogil 40 | cdef int node_split( 41 | self, 42 | ParentInfo* parent, 43 | SplitRecord* split, 44 | ) except -1 nogil 45 | cdef void node_value( 46 | self, 47 | float64_t* dest 48 | ) noexcept nogil 49 | cdef float64_t node_impurity( 50 | self 51 | ) noexcept nogil 52 | -------------------------------------------------------------------------------- /doc/sphinxext/allow_nan_estimators.py: -------------------------------------------------------------------------------- 1 | from contextlib import suppress 2 | 3 | from docutils import nodes 4 | from docutils.parsers.rst import Directive 5 | from sklearn.utils import all_estimators 6 | from sklearn.utils._test_common.instance_generator import _construct_instances 7 | from sklearn.utils._testing import SkipTest 8 | 9 | 10 | class AllowNanEstimators(Directive): 11 | @staticmethod 12 | def make_paragraph_for_estimator_type(estimator_type): 13 | intro = nodes.list_item() 14 | intro += nodes.strong(text="Estimators that allow NaN values for type ") 15 | intro += nodes.literal(text=f"{estimator_type}") 16 | intro += nodes.strong(text=":\n") 17 | exists = False 18 | lst = nodes.bullet_list() 19 | for name, est_class in all_estimators(type_filter=estimator_type): 20 | with suppress(SkipTest): 21 | # _construct_instances is a generator; take the first instance 22 | est = next(_construct_instances(est_class)) 23 | 24 | if est._get_tags().get("allow_nan"): 25 | module_name = ".".join(est_class.__module__.split(".")[:2]) 26 | class_title = f"{est_class.__name__}" 27 | class_url = f"./generated/{module_name}.{class_title}.html" 28 | item = nodes.list_item() 29 | para = nodes.paragraph() 30 | para += nodes.reference( 31 | class_title, text=class_title, internal=False, refuri=class_url 32 | ) 33 | exists = True 34 | item += para 35 | lst += item 36 | intro += lst 37 | return [intro] if exists else None 38 | 39 | def run(self): 40 | lst = nodes.bullet_list() 41 | for i in ["cluster", "regressor", "classifier", "transformer"]: 42 | item = self.make_paragraph_for_estimator_type(i) 43 | if item is not None: 44 | lst += item 45 | return [lst] 46 | 47 | 48 | def setup(app): 49 | app.add_directive("allow_nan_estimators", AllowNanEstimators) 50 | 51 | return { 52 | "version": "0.1", 53 | "parallel_read_safe": True, 54 | "parallel_write_safe": True, 55 | } 56 | -------------------------------------------------------------------------------- /doc/whats_new/v0.1.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | .. include:: _contributors.rst 4 | .. currentmodule:: treeple 5 | 6 | ..
_v0_1: 7 | 8 | Version 0.1 9 | =========== 10 | 11 | Changelog 12 | --------- 13 | - |Feature| Implementation of the two-means Unsupervised Random Forest, by `Adam Li`_ (:pr:`9`) 14 | - |Feature| Implementation of oblique Unsupervised Random Forest, by `Adam Li`_ (:pr:`11`) 15 | - |Feature| Implementation of manifold oblique Random Forest, by `Adam Li`_ (:pr:`21`) 16 | - |Feature| Implementation of fastBIC criterion for unsupervised tree models, by `Adam Li`_ and `Jong Shin`_ (:pr:`45`) 17 | - |Fix| Fix a bug in Patch oblique random forest that samples outside the data boundaries and adds a user guide, by `Adam Li`_ (:pr:`61`) 18 | - |Feature| MORF trees now can sample n-dimensional patches inside an n-dimensional structure sample and make any arbitrary axis discontinuous, by `Adam Li`_ (:pr:`63`) 19 | - |Feature| All tree types can compute similarity and dissimilarity matrices, by `Sambit Panda`_ and `Adam Li`_ (:pr:`64`) 20 | - |Feature| MORF trees now can normalize by feature weight per sample per feature column, by `Adam Li`_ (:pr:`67`) 21 | - |Feature| A general-kernel MORF is now implemented where users can pass in a kernel library, by `Adam Li`_ (:pr:`70`) 22 | - |Feature| Implementation of ObliqueDecisionTreeRegressor, PatchObliqueDecisionTreeRegressor, ObliqueRandomForestRegressor, PatchObliqueRandomForestRegressor, by `SUKI-O`_ (:pr:`72`) 23 | - |Feature| Implementation of HonestTreeClassifier, HonestForestClassifier, by `Sambit Panda`_, `Adam Li`_, `Ronan Perry`_ and `Haoyin Xu`_ (:pr:`57`) 24 | - |Feature| Implementation of (conditional) mutual information estimation via unsupervised tree models and added NearestNeighborsMetaEstimator by `Adam Li`_ (:pr:`83`) 25 | - |Feature| Add multi-output support to HonestTreeClassifier, HonestForestClassifier, by `Ronan Perry`_, `Haoyin Xu`_ and `Adam Li`_ (:pr:`86`) 26 | 27 | Code and Documentation Contributors 28 | ----------------------------------- 29 | 30 | Thanks to everyone who has contributed to the maintenance and improvement of 31 | the project since version inception, including: 32 | 33 | * `Adam Li`_ 34 | * `Sambit Panda`_ 35 | * `Ronan Perry`_ 36 | * `Haoyin Xu`_ 37 | -------------------------------------------------------------------------------- /treeple/tree/honesty/_honest_prune.pxd: -------------------------------------------------------------------------------- 1 | from ..._lib.sklearn.tree._criterion cimport Criterion 2 | from ..._lib.sklearn.tree._partitioner cimport shift_missing_values_to_left_if_required 3 | from ..._lib.sklearn.tree._splitter cimport SplitRecord, Splitter 4 | from ..._lib.sklearn.tree._tree cimport Node, ParentInfo, Tree 5 | from ..._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int8_t, intp_t, uint8_t, uint32_t 6 | 7 | 8 | # for each node, keep track of the node index and the parent index 9 | # within the tree's node array 10 | cdef struct PruningRecord: 11 | intp_t node_idx 12 | intp_t start 13 | intp_t end 14 | float64_t lower_bound 15 | float64_t upper_bound 16 | 17 | 18 | # TODO: this may break the notion of feature importances, as we don't set the node's impurity 19 | # at the child nodes. 
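#
# Note: the pruner reuses the Splitter machinery so that the honest (held-out)
# samples can be routed down the already-fitted ``tree``; helpers such as
# ``check_node_partition_conditions``, ``n_left_samples`` and ``n_right_samples``
# then decide, node by node, whether the learned split is still supported by
# those samples, and nodes that fail the check are pruned into leaves.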
20 | cdef class HonestPruner(Splitter): 21 | cdef Tree tree # The tree to be pruned 22 | cdef intp_t capacity # The maximum number of nodes in the pruned tree 23 | cdef intp_t pos # The current position to split left/right children 24 | cdef intp_t n_missing # The number of missing values in the feature currently considered 25 | cdef uint8_t missing_go_to_left 26 | 27 | # TODO: only supports sparse for now. 28 | cdef const float32_t[:, :] X 29 | 30 | cdef int init( 31 | self, 32 | object X, 33 | const float64_t[:, ::1] y, 34 | const float64_t[:] sample_weight, 35 | const uint8_t[::1] missing_values_in_feature_mask, 36 | ) except -1 37 | 38 | # This function is not used, and should be disabled for pruners 39 | cdef int node_split( 40 | self, 41 | ParentInfo* parent_record, 42 | SplitRecord* split, 43 | ) except -1 nogil 44 | 45 | cdef bint check_node_partition_conditions( 46 | self, 47 | SplitRecord* current_split, 48 | float64_t lower_bound, 49 | float64_t upper_bound 50 | ) noexcept nogil 51 | 52 | cdef inline intp_t n_left_samples( 53 | self 54 | ) noexcept nogil 55 | cdef inline intp_t n_right_samples( 56 | self 57 | ) noexcept nogil 58 | 59 | cdef int partition_samples( 60 | self, 61 | intp_t node_idx, 62 | ) noexcept nogil 63 | -------------------------------------------------------------------------------- /benchmarks/ensemble_supervised.py: -------------------------------------------------------------------------------- 1 | from treeple.ensemble import ObliqueRandomForestClassifier 2 | 3 | from .common import Benchmark, Estimator, Predictor 4 | from .datasets import ( 5 | _20newsgroups_highdim_dataset, 6 | _20newsgroups_lowdim_dataset, 7 | _synth_classification_dataset, 8 | ) 9 | from .utils import make_gen_classif_scorers 10 | 11 | 12 | class ObliqueRandomForestClassifierBenchmark(Predictor, Estimator, Benchmark): 13 | """ 14 | Benchmarks for ObliqueRandomForestClassifier. 15 | """ 16 | 17 | param_names = ["representation", "n_jobs"] 18 | params = (["dense", "sparse"], Benchmark.n_jobs_vals) 19 | 20 | def setup_cache(self): 21 | super().setup_cache() 22 | 23 | def make_data(self, params): 24 | representation, n_jobs = params 25 | 26 | if representation == "sparse": 27 | data = _20newsgroups_highdim_dataset() 28 | else: 29 | data = _20newsgroups_lowdim_dataset() 30 | 31 | return data 32 | 33 | def make_estimator(self, params): 34 | representation, n_jobs = params 35 | 36 | n_estimators = 500 if Benchmark.data_size == "large" else 100 37 | 38 | estimator = ObliqueRandomForestClassifier( 39 | n_estimators=n_estimators, 40 | min_samples_split=10, 41 | max_features="log2", 42 | n_jobs=n_jobs, 43 | random_state=0, 44 | ) 45 | 46 | return estimator 47 | 48 | def make_scorers(self): 49 | make_gen_classif_scorers(self) 50 | 51 | 52 | class ObliqueRandomForestClassifierBenchmarkSynth(Predictor, Estimator, Benchmark): 53 | """ 54 | Benchmarks for Oblique RF Classifier using synthetic classification data.
55 | """ 56 | 57 | param_names = [] 58 | params = () 59 | 60 | def setup_cache(self): 61 | super().setup_cache() 62 | 63 | def make_data(self, params): 64 | data = _synth_classification_dataset(n_samples=10000, n_features=100, n_classes=5) 65 | 66 | return data 67 | 68 | def make_estimator(self, params): 69 | estimator = ObliqueRandomForestClassifier(max_leaf_nodes=15, random_state=0) 70 | 71 | return estimator 72 | 73 | def make_scorers(self): 74 | make_gen_classif_scorers(self) 75 | -------------------------------------------------------------------------------- /treeple/tree/_neighbors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def compute_forest_similarity_matrix(forest, X): 5 | """Compute the similarity matrix of samples in X using a trained forest. 6 | 7 | As an intermediate calculation, the forest applies the dataset and gets 8 | the leaves for each sample. Then, the similarity matrix is computed by 9 | counting the number of times each pair of samples ends up in the same leaf. 10 | 11 | Parameters 12 | ---------- 13 | forest : BaseForest or BaseDecisionTree 14 | The fitted forest. 15 | X : array-like of shape (n_samples, n_features) 16 | The input data. 17 | 18 | Returns 19 | ------- 20 | aff_matrix : array-like of shape (n_samples, n_samples) 21 | The estimated similarity matrix. 22 | """ 23 | if hasattr(forest, "estimator_"): 24 | # apply to the leaves 25 | X_leaves = forest.apply(X) 26 | 27 | n_est = forest.n_estimators 28 | else: 29 | # apply to the leaves for a single tree 30 | X_leaves = forest.apply(X)[:, np.newaxis] 31 | n_est = 1 32 | 33 | aff_matrix = sum(np.equal.outer(X_leaves[:, i], X_leaves[:, i]) for i in range(n_est)) 34 | # normalize by the number of trees 35 | aff_matrix = np.divide(aff_matrix, n_est) 36 | return aff_matrix 37 | 38 | 39 | def _compute_distance_matrix(aff_matrix): 40 | """Private function to compute distance matrix after `compute_similarity_matrix`.""" 41 | dists = 1.0 - aff_matrix 42 | return dists 43 | 44 | 45 | # ported from https://github.com/neurodata/hyppo/blob/main/hyppo/independence/_utils.py 46 | class SimMatrixMixin: 47 | """Mixin class to calculate similarity and dissimilarity matrices. 48 | 49 | This augments tree/forest models with sklearn's nearest-neighbors API. 50 | """ 51 | 52 | def compute_similarity_matrix(self, X): 53 | """ 54 | Compute the similarity matrix of samples in X. 55 | 56 | Parameters 57 | ---------- 58 | X : array-like of shape (n_samples, n_features) 59 | The input data. 60 | 61 | Returns 62 | ------- 63 | sim_matrix : array-like of shape (n_samples, n_samples) 64 | The similarity matrix among the samples.
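
        Examples
        --------
        A minimal sketch of the underlying computation; this method simply
        forwards to :func:`compute_forest_similarity_matrix` defined above.
        Scikit-learn's ``RandomForestClassifier`` stands in purely for
        illustration, since any fitted forest exposing ``apply`` behaves the
        same way.

        >>> from sklearn.datasets import make_classification
        >>> from sklearn.ensemble import RandomForestClassifier
        >>> from treeple.tree._neighbors import compute_forest_similarity_matrix
        >>> X, y = make_classification(n_samples=10, random_state=0)
        >>> forest = RandomForestClassifier(n_estimators=5, random_state=0).fit(X, y)
        >>> compute_forest_similarity_matrix(forest, X).shape
        (10, 10)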
65 | """ 66 | return compute_forest_similarity_matrix(self, X) 67 | -------------------------------------------------------------------------------- /treeple/stats/tests/test_permuteforest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from numpy.testing import assert_array_equal 4 | from sklearn import datasets 5 | 6 | from treeple.stats import PermutationHonestForestClassifier 7 | 8 | # load the iris dataset (n_samples, 4) 9 | # and randomly permute it 10 | iris = datasets.load_iris() 11 | seed = 12345 12 | rng = np.random.default_rng(seed) 13 | 14 | # remove third class 15 | iris_X = iris.data[iris.target != 2] 16 | iris_y = iris.target[iris.target != 2] 17 | 18 | p = rng.permutation(iris_X.shape[0]) 19 | iris_X = iris_X[p] 20 | iris_y = iris_y[p] 21 | 22 | 23 | def test_permutationforest_errors(): 24 | """Test permutation forest errors when training.""" 25 | n_samples = 10 26 | est = PermutationHonestForestClassifier(n_estimators=10, random_state=0) 27 | 28 | # covariate index must be an iterable 29 | with pytest.raises(RuntimeError, match="covariate_index must be an iterable"): 30 | est.fit(iris_X[:n_samples], iris_y[:n_samples], covariate_index=0) 31 | 32 | # covariate index must be an iterable of ints 33 | with pytest.raises(RuntimeError, match="Not all covariate_index"): 34 | est.fit(iris_X[:n_samples], iris_y[:n_samples], covariate_index=[0, 1.0]) 35 | 36 | # covariate index must not be longer than the number of features 37 | with pytest.raises(ValueError, match="The length of the covariate index"): 38 | est.fit( 39 | iris_X[:n_samples], 40 | iris_y[:n_samples], 41 | covariate_index=np.arange(iris_X.shape[1] + 1, dtype=np.intp), 42 | ) 43 | 44 | 45 | @pytest.mark.parametrize("permute_per_tree", [True, False]) 46 | def test_inbag_samples_different_across_forest(permute_per_tree): 47 | """Test that inbag samples are different across trees.""" 48 | n_estimators = 10 49 | est = PermutationHonestForestClassifier( 50 | n_estimators=n_estimators, random_state=0, permute_per_tree=permute_per_tree 51 | ) 52 | 53 | X = iris_X 54 | y = iris_y 55 | est.fit(X, y) 56 | 57 | # covariate index when None is all the features 58 | assert_array_equal(est.covariate_index_, np.arange(X.shape[1], dtype=np.intp)) 59 | 60 | # permuted (in-bag) sample indices should be different across trees 61 | permutation_samples_ = est.permutation_indices_ 62 | permutation_samples_ground = permutation_samples_[0] 63 | assert not all( 64 | np.array_equal(permutation_samples_ground, permutation_samples_[idx]) 65 | for idx in range(1, n_estimators) 66 | ) 67 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 24.8.0 4 | hooks: 5 | - id: black 6 | args: [--quiet] 7 | 8 | - repo: https://github.com/pycqa/isort 9 | rev: 5.13.2 10 | hooks: 11 | - id: isort 12 | name: isort (python) 13 | - id: isort 14 | name: isort (cython) 15 | types: [cython] 16 | 17 | - repo: https://github.com/MarcoGorelli/cython-lint 18 | rev: v0.16.2 19 | hooks: 20 | - id: cython-lint 21 | - id: double-quote-cython-strings 22 | 23 | # Ruff treeple 24 | - repo: https://github.com/astral-sh/ruff-pre-commit 25 | rev: v0.6.9 26 | hooks: 27 | - id: ruff 28 | name: ruff treeple 29 | args: ["--fix"] 30 | files: ^treeple/ 31 | 32 | # Ruff tutorials and examples 33 | - repo:
https://github.com/astral-sh/ruff-pre-commit 34 | rev: v0.6.9 35 | hooks: 36 | - id: ruff 37 | name: ruff tutorials and examples 38 | # D103: missing docstring in public function 39 | # D400: docstring first line must end with period 40 | args: ["--ignore=D103,D400", "--fix"] 41 | files: ^tutorials/|^examples/ 42 | 43 | # Codespell 44 | - repo: https://github.com/codespell-project/codespell 45 | rev: v2.3.0 46 | hooks: 47 | - id: codespell 48 | additional_dependencies: 49 | - tomli 50 | files: ^treeple/|^doc/|^examples/|^tutorials/ 51 | types_or: [python, bib, rst, inc] 52 | 53 | # yamllint 54 | - repo: https://github.com/adrienverge/yamllint.git 55 | rev: v1.35.1 56 | hooks: 57 | - id: yamllint 58 | args: [--strict, -c, .yamllint.yml] 59 | 60 | # toml-sort 61 | - repo: https://github.com/pappasam/toml-sort 62 | rev: v0.23.1 63 | hooks: 64 | - id: toml-sort 65 | files: ^pyproject\.toml$ 66 | args: ['-i'] 67 | 68 | # mypy 69 | - repo: https://github.com/pre-commit/mirrors-mypy 70 | rev: v1.11.2 71 | hooks: 72 | - id: mypy 73 | # Avoid the conflict between mne/__init__.py and mne/__init__.pyi by ignoring the former 74 | exclude: ^(benchmarks_nonasv|examples|benchmarks|.spin)/.*$ 75 | additional_dependencies: ["numpy==1.26.2"] 76 | 77 | # rstcheck 78 | - repo: https://github.com/rstcheck/rstcheck.git 79 | rev: v6.2.4 80 | hooks: 81 | - id: rstcheck 82 | additional_dependencies: 83 | - tomli 84 | files: ^(?!doc/use\.rst$).*\.(rst|inc)$ 85 | 86 | ci: 87 | autofix_prs: true 88 | -------------------------------------------------------------------------------- /doc/whats_new/v0.6.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | .. include:: _contributors.rst 4 | .. currentmodule:: treeple 5 | 6 | .. _v0_6: 7 | 8 | Version 0.6 9 | =========== 10 | 11 | This release includes an enhancement mainly in the MultiViewDecisionTreeClassifier 12 | and HonestForestClassifier, and a new generative model for the make_trunk_classification. 13 | 14 | Changelog 15 | --------- 16 | 17 | - |Enhancement| :class:`treeple.tree.MultiViewDecisionTreeClassifier` now 18 | rounds up the number of features to split on to the nearest integer when 19 | applying ``max_features`` to each feature view, by `Adam Li`_ (:pr:`#183`). 20 | - |Feature| :class:`treeple.tree.MultiViewDecisionTreeClassifier` now 21 | supports an array passed in for ``max_features``, which applies a different 22 | max_features argument per view, by `Adam Li`_ (:pr:`#183`). 23 | - |Fix| :class:`treeple.tree.MultiViewDecisionTreeClassifier` now correctly 24 | handles the case where there is one feature view that is exhausted, and 25 | another that is not for ``apply_max_features_per_feature_set = False``, 26 | by `Adam Li`_ (:pr:`#183`). 27 | - |Fix| ``treeple.stats.FeatureImportanceForestClassifier`` now correctly passes 28 | metric kwargs to the null distribution function, by `Adam Li`_ (:pr:`#183`). 29 | - |Enhancement| :func:`treeple.datasets.make_trunk_classification` now 30 | has a generative model based on Trunk and banded covariance, :func:`treeple.datasets.approximate_clf_mutual_information` and 31 | :func:`treeple.datasets.approximate_clf_mutual_information_with_monte_carlo` to 32 | approximate mutual information either numerically or via Monte-Carlo, by `Adam Li`_ and `Haoyin Xu`_ (:pr:`#199`). 
33 | - |Enhancement| :class:`treeple.HonestForestClassifier` now has a fitted 34 | property ``oob_samples_``, which reproduces the sample indices per tree that is out 35 | of bag, by `Adam Li`_ (:pr:`#200`). 36 | - |Enhancement| :class:`treeple.HonestForestClassifier` will allow one to bootstrap sample higher 37 | than the number of samples, controlled by the ``max_samples`` keyword argument by `Adam Li`_ (:pr:`#206`). 38 | - |Feature| :class:`treeple.HonestForestClassifier` now allows one to specify 39 | the number of sub-samples to use for the honest trees without having 40 | to bootstrap sample. This is specified by the ``max_samples`` parameter. 41 | By `Adam Li`_ (:pr:`#210`) 42 | 43 | Code and Documentation Contributors 44 | ----------------------------------- 45 | 46 | Thanks to everyone who has contributed to the maintenance and improvement of 47 | the project since version inception, including: 48 | 49 | * `Adam Li`_ 50 | * `Haoyin Xu`_ 51 | -------------------------------------------------------------------------------- /meson.build: -------------------------------------------------------------------------------- 1 | project( 2 | 'treeple', 3 | 'c', 'cpp', 'cython', 4 | # Note that the git commit hash cannot be added dynamically here 5 | # That only happens when importing from a git repository. 6 | # See `treeple/__init__.py` 7 | version: '0.10.3', 8 | license: 'PolyForm Noncommercial 1.0.0', 9 | meson_version: '>= 1.1.0', 10 | default_options: [ 11 | 'c_std=c11', 12 | 'cpp_std=c++14', 13 | ], 14 | ) 15 | 16 | cc = meson.get_compiler('c') 17 | cpp = meson.get_compiler('cpp') 18 | 19 | # Check compiler is recent enough (see "Toolchain Roadmap" for details) 20 | if cc.get_id() == 'gcc' 21 | if not cc.version().version_compare('>=8.0') 22 | error('treeple requires GCC >= 8.0') 23 | endif 24 | elif cc.get_id() == 'msvc' 25 | if not cc.version().version_compare('>=19.20') 26 | error('treeple requires at least vc142 (default with Visual Studio 2019) ' + \ 27 | 'when building with MSVC') 28 | endif 29 | endif 30 | 31 | # Suppress warning for deprecated Numpy API. 32 | # Replace with numpy_nodepr_api after Cython 3.0 is out 33 | # '-Wno-maybe-uninitialized' 34 | # numpy_nodepr_api = '-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION' 35 | 36 | # (Suppress warning messages emitted by #warning directives). 37 | _global_c_args = cc.get_supported_arguments( 38 | '-Wno-unused-but-set-variable', 39 | '-Wno-unused-function', 40 | '-Wno-conversion', 41 | '-Wno-misleading-indentation', 42 | '-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION', 43 | ) 44 | add_project_arguments(_global_c_args, language : 'c') 45 | 46 | # We need -lm for all C code (assuming it uses math functions, which is safe to 47 | # assume for treeple). For C++ it isn't needed, because libstdc++/libc++ is 48 | # guaranteed to depend on it. For Fortran code, Meson already adds `-lm`. 49 | m_dep = cc.find_library('m', required : false) 50 | if m_dep.found() 51 | add_project_link_arguments('-lm', language : 'c') 52 | endif 53 | 54 | cython = find_program( 55 | 'cython', 56 | required: true 57 | ) 58 | if not cython.found() 59 | error('MESON_BUILD_FAILED: Cython3 not found. 
Please install it.') 60 | endif 61 | 62 | # r = run_command('git', 'submodule', 'update', '--init', check: false) 63 | r = run_command('mv', 'treeple/_lib/sklearn_fork/sklearn', 'treeple/_lib/sklearn', check: false) 64 | 65 | # Setup Python: 66 | # https://mesonbuild.com/Python-module.html 67 | py = import('python').find_installation(pure: false) 68 | 69 | # print some debugging output 70 | message(py.full_path()) 71 | message(py.get_install_dir()) 72 | if py.language_version().version_compare('<3.9') 73 | error('At least Python 3.9 is required.') 74 | endif 75 | 76 | subdir('treeple') 77 | -------------------------------------------------------------------------------- /treeple/tree/unsupervised/_unsup_criterion.pxd: -------------------------------------------------------------------------------- 1 | # cython: boundscheck=False 2 | # cython: wraparound=False 3 | # cython: language_level=3 4 | 5 | from ..._lib.sklearn.tree._criterion cimport BaseCriterion 6 | from ..._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int32_t, intp_t 7 | 8 | # Note: This class is an exact copy of scikit-learn's Criterion 9 | # class, with the exception of the type of the internal structure. 10 | # In scikit-learn, they store a buffer for the y-labels, whereas here 11 | # we store a buffer for the X dataset. 12 | # 13 | # In our criterions, we do not store the 'y-labels' because there are none 14 | # in unsupervised learning. We instead store a memview of the dataset 'X'. 15 | 16 | 17 | cdef class UnsupervisedCriterion(BaseCriterion): 18 | """Abstract unsupervised criterion. 19 | 20 | Notable Changes 21 | --------------- 22 | 1. weighted_n_* : This parameter keeps track of the total "weight" of the samples 23 | in the node, left and right 24 | """ 25 | 26 | # The criterion computes the impurity of a node and the reduction of 27 | # impurity of a split on that node. It also computes the output statistics. 28 | 29 | # Internal structures 30 | cdef const float32_t[:] feature_values # 1D memview for the feature vector to compute criterion on 31 | 32 | # Keep running total of Xf[samples[start:end]] and the corresponding sum in 33 | # the left and right node. For example, this can then efficiently compute the 34 | # mean of the node, and left/right child by subtracting relevant Xf elements 35 | # and then dividing by the total number of samples in the node and left/right child. 36 | cdef float64_t sum_total # The sum of the weighted count of each feature. 37 | cdef float64_t sum_left # Same as above, but for the left side of the split 38 | cdef float64_t sum_right # Same as above, but for the right side of the split 39 | 40 | cdef float64_t sumsq_total # The weighted sum of squares of each feature. 41 | cdef float64_t sumsq_left # Same as above, but for the left side of the split 42 | cdef float64_t sumsq_right # Same as above, but for the right side of the split 43 | 44 | # Methods 45 | # ------- 46 | # The 'init' method is copied here with almost the exact same signature 47 | # as that of the supervised learning criterion in scikit-learn to ensure that 48 | # Unsupervised criterion can be used with splitter and tree methods.
49 | cdef intp_t init( 50 | self, 51 | const float32_t[:] feature_values, 52 | const float64_t[:] sample_weight, 53 | float64_t weighted_n_samples, 54 | const intp_t[:] samples, 55 | ) except -1 nogil 56 | 57 | cdef void init_feature_vec( 58 | self 59 | ) noexcept nogil 60 | 61 | cdef void set_sample_pointers( 62 | self, 63 | intp_t start, 64 | intp_t end 65 | ) noexcept nogil 66 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: 'bug' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 14 | 15 | ## Checklist 16 | 17 | 18 | 19 | - [ ] I have verified that the issue exists against the `main` branch. 20 | - [ ] I have read the relevant section in the [contribution guide](https://github.com/py-why/pywhy-graphs/blob/main/CONTRIBUTING.md#bug-reports-and-feature-requests) on reporting bugs. 21 | - [ ] I have checked the [issues list](https://github.com/py-why/pywhy-graphs/issues) for similar or identical bug reports. 22 | - [ ] I have checked the [pull requests list](https://github.com/py-why/pywhy-graphs/pulls) for existing proposed fixes. 23 | - [ ] I have checked the [CHANGELOG](https://github.com/py-why/pywhy-graphs/blob/main/CHANGELOG.md) and the [commit log](https://github.com/py-why/pywhy-graphs/commits/main) to find out if the bug was already fixed in the main branch. 24 | - [ ] I have included in the "Description" section below a traceback from any exceptions related to this bug. 25 | - [ ] I have included in the "Related issues or possible duplicates" section below all related issues and possible duplicate issues (if there are none, check this box anyway). 26 | - [ ] I have included in the "Environment" section below the name of the operating system and Python version that I was using when I discovered this bug. 27 | - [ ] I have included in the "Environment" section below the output of `pip freeze`. 28 | - [ ] I have included in the "Steps to reproduce" section below a minimally reproducible example. 29 | 30 | 31 | ## Description 32 | 33 | 34 | 35 | <details>
36 | <summary>Python traceback:</summary> 37 | <p> 38 | 39 | 40 | ``` 41 | ``` 42 | 43 | </p> 44 | </details> 45 | 46 | 47 | ## Related issues or possible duplicates 48 | 49 | - None 50 | 51 | 52 | ## Environment 53 | 54 | 55 | OS: 56 | 57 | 58 | Python version: 59 | 60 | <details> 61 | <summary>Output of pip freeze:</summary> 62 | <p> 63 | 64 | 65 | ``` 66 | ``` 67 | 68 | </p> 69 | </details> 70 | 71 | 72 | ## Steps to reproduce 73 | 74 | 75 | <details> 76 | <summary>Example source:</summary> 77 | <p> 78 | 79 | 80 | ``` 81 | ``` 82 | 83 | </p> 84 | </details>
85 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # simple makefile to simplify repetitive build env management tasks under posix 2 | 3 | # caution: testing won't work on windows, see README 4 | 5 | PYTHON ?= python 6 | PYTESTS ?= pytest 7 | CTAGS ?= ctags 8 | CODESPELL_SKIPS ?= "*.fif,*.eve,*.gz,*.tgz,*.zip,*.mat,*.stc,*.label,*.w,*.bz2,*.annot,*.sulc,*.log,*.local-copy,*.orig_avg,*.inflated_avg,*.gii,*.pyc,*.doctree,*.pickle,*.inv,*.png,*.edf,*.touch,*.thickness,*.nofix,*.volume,*.defect_borders,*.mgh,lh.*,rh.*,COR-*,FreeSurferColorLUT.txt,*.examples,.xdebug_mris_calc,bad.segments,BadChannels,*.hist,empty_file,*.orig,*.js,*.map,*.ipynb,searchindex.dat,plot_*.rst,*.rst.txt,*.html,gdf_encodes.txt,treeple/_lib/*,doc/auto_examples/*" 9 | CODESPELL_DIRS ?= treeple/ doc/ examples/ benchmarks/ 10 | all: clean inplace test test-doc 11 | 12 | clean-pyc: 13 | find . -name "*.pyc" | xargs rm -f 14 | 15 | clean-build: 16 | rm -rf build 17 | rm -rf dist 18 | 19 | clean-cache: 20 | find . -name "__pycache__" | xargs rm -rf 21 | 22 | clean: clean-build clean-pyc clean-cache 23 | 24 | pytest: test 25 | 26 | test: in 27 | rm -f .coverage 28 | $(PYTESTS) treeple 29 | 30 | test-doc: sample_data testing_data 31 | $(PYTESTS) --doctest-modules --doctest-ignore-import-errors --doctest-glob='*.rst' ./doc/ 32 | 33 | flake: 34 | @if command -v flake8 > /dev/null; then \ 35 | echo "Running flake8"; \ 36 | flake8 --count treeple examples; \ 37 | else \ 38 | echo "flake8 not found, please install it!"; \ 39 | exit 1; \ 40 | fi; 41 | @echo "flake8 passed" 42 | 43 | black: 44 | @if command -v black > /dev/null; then \ 45 | echo "Running black"; \ 46 | black treeple examples; \ 47 | else \ 48 | echo "black not found, please install it!"; \ 49 | exit 1; \ 50 | fi; 51 | @echo "black passed" 52 | 53 | isort: 54 | @if command -v isort > /dev/null; then \ 55 | echo "Running isort"; \ 56 | isort treeple examples doc; \ 57 | else \ 58 | echo "isort not found, please install it!"; \ 59 | exit 1; \ 60 | fi; 61 | @echo "isort passed" 62 | 63 | codespell: # running manually 64 | @codespell -w -i 3 -q 3 -S $(CODESPELL_SKIPS) --ignore-words=.codespellignore $(CODESPELL_DIRS) 65 | 66 | codespell-error: # running on travis 67 | @codespell -i 0 -q 7 -S $(CODESPELL_SKIPS) --ignore-words=.codespellignore $(CODESPELL_DIRS) 68 | 69 | pydocstyle: 70 | @echo "Running pydocstyle" 71 | @pydocstyle mne 72 | 73 | docstyle: pydocstyle 74 | 75 | build-doc: 76 | @echo "Building documentation" 77 | make -C doc/ clean 78 | make -C doc/ html 79 | cd doc/ && make view 80 | 81 | build-doc-noplot: 82 | @echo "Building documentation" 83 | make -C doc/ clean 84 | make -C doc/ html-noplot 85 | cd doc/ && make view 86 | 87 | run-checks: 88 | isort --check . 89 | black --check treeple examples 90 | flake8 . 91 | mypy ./treeple 92 | @$(MAKE) pydocstyle 93 | @$(MAKE) codespell-error 94 | ruff . 95 | toml-sort ./pyproject.toml --check 96 | yamllint . 
-c .yamllint.yml --strict 97 | 98 | pre-commit: 99 | @pre-commit run -a -------------------------------------------------------------------------------- /doc/sphinxext/github_link.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import os 3 | import subprocess 4 | import sys 5 | from functools import partial 6 | from operator import attrgetter 7 | 8 | REVISION_CMD = "git rev-parse --short HEAD" 9 | 10 | 11 | def _get_git_revision(): 12 | try: 13 | revision = subprocess.check_output(REVISION_CMD.split()).strip() 14 | except (subprocess.CalledProcessError, OSError): 15 | print("Failed to execute git to get revision") 16 | return None 17 | return revision.decode("utf-8") 18 | 19 | 20 | def _linkcode_resolve(domain, info, package, url_fmt, revision): 21 | """Determine a link to online source for a class/method/function 22 | 23 | This is called by sphinx.ext.linkcode 24 | 25 | An example with a long-untouched module that everyone has 26 | >>> _linkcode_resolve('py', {'module': 'tty', 27 | ... 'fullname': 'setraw'}, 28 | ... package='tty', 29 | ... url_fmt='http://hg.python.org/cpython/file/' 30 | ... '{revision}/Lib/{package}/{path}#L{lineno}', 31 | ... revision='xxxx') 32 | 'http://hg.python.org/cpython/file/xxxx/Lib/tty/tty.py#L18' 33 | """ 34 | 35 | if revision is None: 36 | return 37 | if domain not in ("py", "pyx"): 38 | return 39 | if not info.get("module") or not info.get("fullname"): 40 | return 41 | 42 | class_name = info["fullname"].split(".")[0] 43 | module = __import__(info["module"], fromlist=[class_name]) 44 | obj = attrgetter(info["fullname"])(module) 45 | 46 | # Unwrap the object to get the correct source 47 | # file in case that is wrapped by a decorator 48 | obj = inspect.unwrap(obj) 49 | 50 | try: 51 | fn = inspect.getsourcefile(obj) 52 | except Exception: 53 | fn = None 54 | if not fn: 55 | try: 56 | fn = inspect.getsourcefile(sys.modules[obj.__module__]) 57 | except Exception: 58 | fn = None 59 | if not fn: 60 | return 61 | 62 | fn = os.path.relpath(fn, start=os.path.dirname(__import__(package).__file__)) 63 | try: 64 | lineno = inspect.getsourcelines(obj)[1] 65 | except Exception: 66 | lineno = "" 67 | return url_fmt.format(revision=revision, package=package, path=fn, lineno=lineno) 68 | 69 | 70 | def make_linkcode_resolve(package, url_fmt): 71 | """Returns a linkcode_resolve function for the given URL format 72 | 73 | revision is a git commit reference (hash or name) 74 | 75 | package is the name of the root module of the package 76 | 77 | url_fmt is along the lines of ('https://github.com/USER/PROJECT/' 78 | 'blob/{revision}/{package}/' 79 | '{path}#L{lineno}') 80 | """ 81 | revision = _get_git_revision() 82 | return partial(_linkcode_resolve, revision=revision, package=package, url_fmt=url_fmt) 83 | -------------------------------------------------------------------------------- /doc/install.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | Installation 4 | ============ 5 | 6 | Dependencies 7 | ------------ 8 | 9 | * ``numpy`` (>=1.23) 10 | * ``scipy`` (>=1.5.0) 11 | * ``scikit-learn`` (>=1.3) 12 | * ``joblib`` (>=1.0.0) 13 | * ``matplotlib`` (optional) 14 | 15 | **treeple** supports Python >= 3.9. 16 | 17 | Installing with ``pip`` 18 | ----------------------- 19 | 20 | **treeple** is available on `PyPI `_. Just run 21 | 22 | .. 
code-block:: bash 23 | 24 | pip install treeple 25 | 26 | Installing from source with Meson 27 | --------------------------------- 28 | 29 | To install **treeple** from source, first clone the `repository <https://github.com/neurodata/treeple>`_: 30 | 31 | .. code-block:: bash 32 | 33 | git clone https://github.com/neurodata/treeple.git 34 | cd treeple 35 | 36 | # ideally, you should always start within a virtual environment 37 | conda create -n sklearn-dev python=3.9 38 | conda activate sklearn-dev 39 | 40 | Then run the installation of build packages: 41 | 42 | .. code-block:: bash 43 | 44 | pip install -r build_requirements.txt 45 | pip install spin 46 | 47 | # use spin CLI to run Meson build locally 48 | ./spin build -j 2 49 | 50 | # you can now run tests 51 | ./spin test 52 | 53 | Via pip, you will be able to install in editable mode (pending Meson-Python support). 54 | 55 | .. code-block:: bash 56 | 57 | pip install -e . 58 | 59 | # if editing Cython files 60 | pip install --verbose --no-build-isolation --editable . 61 | 62 | .. code-block:: bash 63 | 64 | pip install --user -U https://api.github.com/repos/neurodata/treeple/zipball/master 65 | 66 | Conda (Recommended) 67 | ------------------- 68 | First, create a virtual environment using Conda. 69 | 70 | conda create -n sklearn-dev python=3.9 71 | 72 | # activate the virtual environment and install necessary packages to build from source 73 | 74 | conda activate sklearn-dev 75 | conda install -c conda-forge numpy scipy cython joblib threadpoolctl pytest compilers llvm-openmp 76 | 77 | Next, install `treeple` from source: 78 | 79 | pip install .[build] 80 | 81 | # if editing Cython files 82 | pip install --verbose --no-build-isolation --editable . 83 | 84 | To install the package from GitHub, clone the repository and then `cd` into the directory: 85 | 86 | ./spin build 87 | 88 | # if you would like an editable install of treeple for dev purposes 89 | pip install --verbose --no-build-isolation --editable . 90 | 91 | pip install https://api.github.com/repos/neurodata/treeple/zipball/main 92 | 93 | 94 | pip install https://api.github.com/repos/neurodata/scikit-learn/zipball/obliquepr 95 | 96 | Note that currently, we need to build the development version of scikit-learn with oblique trees within this `PR `_. 97 | 98 | Check out this PR code, and build from source, using scikit-learn's build from source page instructions. 99 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | **treeple** 2 | =================== 3 | treeple is a package for modern tree-based algorithms for supervised and unsupervised 4 | learning problems. It extends the robust API of `scikit-learn <https://scikit-learn.org/>`_ 5 | for tree algorithms that achieve strong performance in benchmark tasks. 6 | 7 | Our package has implemented unsupervised forests (Geodesic Forests 8 | [Madhyastha2020]_), oblique random forests (SPORF [Tomita2020]_, manifold random forests, 9 | MORF [Li2023]_), honest forests [Perry2021]_, extended isolation forests [Hariri2019]_, and more. 10 | 11 | For all forests, we also support incremental building of the forests, using the 12 | ``partial_fit`` API from scikit-learn [Xu2022]_, and quantile regression by storing 13 | the training samples in the leaves of the trees [Meinshausen2006]_ (Warning: high memory usage 14 | will occur in this setting since predicting quantiles stores the training data within the 15 | leaves of the tree).
16 | 17 | We encourage you to use the package for your research and also build on top 18 | with relevant Pull Requests. See our examples for walk-throughs of how to use the package. 19 | Also, see our `contributing guide `_. 20 | 21 | We are licensed under the PolyForm Noncommercial License (see `License `_). 22 | 23 | .. topic:: References 24 | 25 | .. [Hariri2019] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner. 26 | "Extended isolation forest." IEEE transactions on knowledge and data 27 | engineering 33.4 (2019): 1479-1489. 28 | 29 | .. [Meinshausen2006] Meinshausen, Nicolai, and Greg Ridgeway. "Quantile regression forests." 30 | Journal of machine learning research 7.6 (2006). 31 | 32 | .. [Madhyastha2020] Madhyastha, Meghana, et al. :doi:`"Geodesic Forests" 33 | <10.1145/3394486.3403094>`, KDD 2020, 513-523, 2020. 34 | 35 | .. [Tomita2020] Tomita, Tyler M., et al. "Sparse Projection Oblique 36 | Randomer Forests", The Journal of Machine Learning Research, 21(104), 37 | 1-39, 2020. 38 | 39 | .. [Li2023] Li, Adam, et al. :doi:`"Manifold Oblique Random Forests: Towards 40 | Closing the Gap on Convolutional Deep Networks" <10.1137/21M1449117>`, 41 | SIAM Journal on Mathematics of Data Science, 5(1), 77-96, 2023. 42 | 43 | .. [Perry2021] Perry, Ronan, et al. :arxiv:`"Random Forests for Adaptive 44 | Nearest Neighbor Estimation of Information-Theoretic Quantities" 45 | <1907.00325>`, arXiv preprint arXiv:1907.00325, 2021. 46 | 47 | .. [Xu2022] Xu, Haoyin, et al. :arxiv:`"Simplest Streaming Trees" 48 | <2110.08483>`, arXiv preprint arXiv:2110.08483, 2022. 49 | 50 | Contents 51 | -------- 52 | 53 | .. toctree:: 54 | :maxdepth: 2 55 | :caption: Getting started: 56 | 57 | api 58 | User Guide 59 | whats_new 60 | install 61 | use 62 | 63 | Indices and tables 64 | ------------------ 65 | 66 | * :ref:`genindex` 67 | * :ref:`modindex` 68 | -------------------------------------------------------------------------------- /doc/modules/unsupervised_tree.rst: -------------------------------------------------------------------------------- 1 | .. _unsupervised_tree: 2 | 3 | =========================== 4 | Unsupervised Decision Trees 5 | =========================== 6 | 7 | .. currentmodule:: sklearn.tree 8 | 9 | In unsupervised learning, the goal is to identify patterns 10 | or structure in data without using labeled examples. Clustering is a common 11 | unsupervised learning technique that groups similar examples together 12 | based on their features. Unsupervised tree models are an adaptive way of generating 13 | clusters of samples. For information on supervised tree models, see :ref:`supervised_tree`. 14 | 15 | In this guide, we overview the :ref:`unsup_criterion` used for splitting unsupervised trees, 16 | and methods for evaluating the quality of the tree model in :ref:`unsup_evaluation`. 17 | 18 | .. _unsup_criterion: 19 | 20 | Unsupervised Criterion 21 | ---------------------- 22 | 23 | Unsupervised tree models use a variety of criteria to split nodes. 24 | 25 | Two-Means 26 | ~~~~~~~~~ 27 | 28 | The two-means split finds the cutpoint that minimizes the one-dimensional 29 | 2-means objective, i.e. the cutoff point for which the total within-cluster 30 | variance of the two resulting clusters is minimal. 31 | 32 | .. math:: 33 | \min_s \sum_{i=1}^s (x_i - \hat{\mu}_1)^2 + \sum_{i=s+1}^N (x_i - \hat{\mu}_2)^2 34 | 35 | where :math:`x` is the :math:`N`-dimensional vector of feature values (one per sample), :math:`N` is the number of samples, and 36 | the :math:`\hat{\mu}` terms are the estimated means of clusters 1 and 2.
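The following is a brute-force NumPy sketch of this objective (illustrative only; the actual splitter is implemented in Cython and evaluates candidate splits far more efficiently):

.. code-block:: python

    import numpy as np

    def best_two_means_split(x):
        """Scan all cutpoints of a sorted 1-D feature and return the best one."""
        x = np.sort(np.asarray(x, dtype=float))
        best_s, best_obj = None, np.inf
        for s in range(1, len(x)):
            left, right = x[:s], x[s:]
            # total within-cluster variance of the two sides of the cutpoint
            obj = ((left - left.mean()) ** 2).sum() + ((right - right.mean()) ** 2).sum()
            if obj < best_obj:
                best_s, best_obj = s, obj
        return best_s, best_obj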
37 | 38 | Fast-BIC 39 | ~~~~~~~~ 40 | 41 | The Bayesian Information Criterion (BIC) is a popular model selection 42 | criterion based on the log-likelihood of the model given the data. 43 | Fast-BIC :footcite:`Meghana2019_geodesicrf` is a method that combines the speed of the 44 | :class:`sklearn.cluster.KMeans` clustering method with the model flexibility 45 | of Mclust-BIC. It sorts data for each feature and tries all possible splits to 46 | assign data points to one of two Gaussian distributions based on their position 47 | relative to the split. 48 | The parameters for each cluster are estimated using maximum likelihood 49 | estimation (MLE). The method performs hard clustering rather than soft 50 | clustering as in a GMM, resulting in a simpler calculation of the likelihood. 51 | 52 | .. math:: 53 | 54 | \hat{L} = \sum_{n=1}^s[\log\hat{\pi}_1+\log{\mathcal{N}(x_n;\hat{\mu}_1,\hat{\sigma}_1^2)}] 55 | + \sum_{n=s+1}^N[\log\hat{\pi}_2+\log{\mathcal{N}(x_n;\hat{\mu}_2,\hat{\sigma}_2^2)}] 56 | 57 | where the prior, mean, and variance are defined as follows, respectively: 58 | 59 | .. math:: 60 | 61 | \hat{\pi} = \frac{s}{N},\quad\quad 62 | \hat{\mu} = \frac{1}{s}\sum_{n\le s}{x_n},\quad\quad 63 | \hat{\sigma}^2 = \frac{1}{s}\sum_{n\le s}{||x_n-\hat{\mu}||^2} 64 | 65 | .. _unsup_evaluation: 66 | 67 | Evaluating Unsupervised Trees 68 | ----------------------------- 69 | 70 | In clustering settings, there may be no natural 71 | notion of “true” class labels; thus, the efficacy of the clustering scheme is 72 | often measured with metrics such as :func:`sklearn.metrics.adjusted_rand_score`. 73 | 74 | .. topic:: References 75 | 76 | .. footbibliography:: 77 | -------------------------------------------------------------------------------- /doc/whats_new/v0.7.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | .. include:: _contributors.rst 4 | .. currentmodule:: treeple 5 | 6 | .. _v0_7: 7 | 8 | Version 0.7 9 | =========== 10 | 11 | This release adds the ability to separate in-bag and out-of-bag samples for 12 | any forest model. We also introduce a new class for fitting honest forests while 13 | permuting the covariate index, and a new set of simulations based on Marron and Wand 1992. 14 | 15 | In addition, various patches were made to how scikit-tree is used for hypothesis 16 | testing of feature sets. 17 | 18 | Changelog 19 | --------- 20 | 21 | - |Feature| Introduce a new lightweight class for fitting honest forests while 22 | permuting the covariate index :class:`treeple.stats.PermutationHonestForestClassifier`, 23 | by `Adam Li`_ (:pr:`#211`) 24 | - |Feature| Introduce a new class method ``predict_proba_per_tree`` for all 25 | Forest classifiers, which will predict the probability per tree and keep the 26 | output as a ``(n_estimators, n_samples, n_classes)`` output, 27 | by `Adam Li`_ (:pr:`#211`) 28 | - |Feature| Introduce a new class fitted attribute ``oob_samples_`` for all 29 | Forest models, which will keep track of the samples used, 30 | by `Adam Li`_ (:pr:`#211`) 31 | - |Feature| Introduce a new set of simulations based on Marron and Wand 1992.
32 | by `Sambit Panda`_ (:pr:`#203`) 33 | - |Feature| :func:`treeple.stats.build_coleman_forest` and :func:`treeple.stats.build_permutation_forest` 34 | are added to compute p-values given an estimator and permutation-estimator, by `Adam Li`_ (:pr:`#222`) 35 | - |API| :func:`treeple.datasets.make_trunk_classification` for generating trunk mixture and Marron-Wand 36 | simulations are separated out into :func:`treeple.datasets.make_marron_wand_classification` and 37 | :func:`treeple.datasets.make_trunk_mixture_classification`, by `Adam Li`_ (:pr:`#227`) 38 | - |API| :class:`treeple.HonestForestClassifier` and :class:`treeple.tree.HonestTreeClassifier` 39 | now overwrite all parameters set by the underlying ``tree_estimator`` and allow you to directly 40 | pass any extra parameters that ``tree_estimator`` has compared to the original 41 | :class:`~sklearn.tree.DecisionTreeClassifier`, by `Adam Li`_ (:pr:`#228`) 42 | - |Fix| Trunk simulators now correctly generate random values with a fixed seed, 43 | by `Sambit Panda`_ (:pr:`#236`) 44 | - |Efficiency| All scikit-tree estimators are now at least 2X faster than they were 45 | in previous versions. This was due to adding compiler directives to turn on 46 | '-O3' optimizations when compiling the C++ code generated from Cython. In addition, 47 | we explicitly turned off bounds-checking and related runtime checks in the Cython code, 48 | which would lead to performance degradation during runtime, by `Adam Li`_ (:pr:`#242`) 49 | 50 | Code and Documentation Contributors 51 | ----------------------------------- 52 | 53 | Thanks to everyone who has contributed to the maintenance and improvement of 54 | the project since version inception, including: 55 | 56 | * `Adam Li`_ 57 | * `Sambit Panda`_ 58 | -------------------------------------------------------------------------------- /treeple/tree/unsupervised/_unsup_tree.pxd: -------------------------------------------------------------------------------- 1 | # Authors: Adam Li 2 | # Jong Shin 3 | # 4 | 5 | # License: BSD 3 clause 6 | 7 | # See _unsup_tree.pyx for details. 8 | 9 | import numpy as np 10 | 11 | cimport numpy as cnp 12 | 13 | from ..._lib.sklearn.tree._splitter cimport SplitRecord 14 | from ..._lib.sklearn.tree._tree cimport BaseTree, Node, ParentInfo 15 | from ..._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t 16 | from ._unsup_splitter cimport UnsupervisedSplitter 17 | 18 | 19 | # TODO: copy changes from https://github.com/scikit-learn/scikit-learn/pull/25540/files 20 | cdef class UnsupervisedTree(BaseTree): 21 | # The Tree object is a binary tree structure constructed by the 22 | # TreeBuilder. The tree structure is used for predictions and 23 | # feature importances. 24 | # 25 | # Inner structures: values are stored separately from node structure, 26 | # since size is determined at runtime.
27 | # cdef float64_t* value # (capacity) array of values 28 | # cdef intp_t value_stride # = 1 29 | 30 | # Input/Output layout 31 | cdef public intp_t n_features # Number of features in X 32 | 33 | # Methods 34 | cdef cnp.ndarray _get_value_ndarray(self) 35 | cdef cnp.ndarray _get_node_ndarray(self) 36 | 37 | # Overridden Methods 38 | cdef int _set_split_node( 39 | self, 40 | SplitRecord* split_node, 41 | Node* node, 42 | intp_t node_id 43 | ) except -1 nogil 44 | cdef float32_t _compute_feature( 45 | self, 46 | const float32_t[:, :] X_ndarray, 47 | intp_t sample_index, 48 | Node *node 49 | ) noexcept nogil 50 | cdef void _compute_feature_importances( 51 | self, 52 | cnp.float64_t[:] importances, 53 | Node* node 54 | ) noexcept nogil 55 | 56 | # ============================================================================= 57 | # Tree builder 58 | # ============================================================================= 59 | 60 | cdef class UnsupervisedTreeBuilder: 61 | # The TreeBuilder recursively builds a Tree object from training samples, 62 | # using a Splitter object for splitting internal nodes and assigning 63 | # values to leaves. 64 | # 65 | # This class controls the various stopping criteria and the node splitting 66 | # evaluation order, e.g. depth-first or best-first. 67 | 68 | cdef UnsupervisedSplitter splitter # Splitting algorithm 69 | 70 | cdef intp_t min_samples_split # Minimum number of samples in an internal node 71 | cdef intp_t min_samples_leaf # Minimum number of samples in a leaf 72 | cdef float64_t min_weight_leaf # Minimum weight in a leaf 73 | cdef intp_t max_depth # Maximal tree depth 74 | cdef float64_t min_impurity_decrease # Impurity threshold for early stopping 75 | 76 | cpdef build( 77 | self, 78 | UnsupervisedTree tree, 79 | object X, 80 | const float64_t[:] sample_weight=* 81 | ) 82 | cdef _check_input( 83 | self, 84 | object X, 85 | const float64_t[:] sample_weight 86 | ) 87 | -------------------------------------------------------------------------------- /treeple/tree/tests/test_honest_prune.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from treeple.tree import HonestTreeClassifier 4 | 5 | 6 | def test_honest_tree_pruning(): 7 | """Test honest tree with pruning to ensure no empty leaves.""" 8 | rng = np.random.default_rng(1234) 9 | 10 | n_samples = 1000 11 | X = rng.standard_normal(size=(n_samples, 100)) 12 | X[n_samples // 2 :] *= -1 13 | y = [0] * (n_samples // 2) + [1] * (n_samples // 2) 14 | 15 | clf = HonestTreeClassifier(honest_method="prune", max_features="sqrt", random_state=0) 16 | clf = clf.fit(X, y) 17 | 18 | nonprune_clf = HonestTreeClassifier( 19 | honest_method="apply", max_features="sqrt", random_state=0, honest_prior="ignore" 20 | ) 21 | nonprune_clf = nonprune_clf.fit(X, y) 22 | 23 | assert ( 24 | nonprune_clf.tree_.max_depth >= clf.tree_.max_depth 25 | ), f"{nonprune_clf.tree_.max_depth} <= {clf.tree_.max_depth}" 26 | # assert np.all(clf.tree_.children_left != -1) 27 | 28 | # Access the original and pruned trees' attributes 29 | original_tree = nonprune_clf.tree_ 30 | pruned_tree = clf.tree_ 31 | 32 | # Ensure the pruned tree has fewer or equal nodes 33 | assert ( 34 | pruned_tree.node_count < original_tree.node_count 35 | ), "Pruned tree has more nodes than the original tree" 36 | 37 | # Ensure the pruned tree has no empty leaves 38 | assert np.all(pruned_tree.value.sum(axis=(1, 2)) > 0), pruned_tree.value.sum(axis=(1, 2)) 39 | # assert 
np.all(original_tree.value.sum(axis=(1,2)) > 0), original_tree.value.sum(axis=(1,2)) 40 | # unlike the unpruned tree, the pruned tree must have no empty leaves 41 | assert np.all(pruned_tree.value.sum(axis=(1, 2)) > 0) 42 | assert not np.all(original_tree.value.sum(axis=(1, 2)) > 0) 43 | 44 | # test that the first three nodes are the same, since these are unlikely to be 45 | # pruned, and should remain invariant. 46 | # 47 | # Note: pruning the tree will have the node_ids change since the tree is 48 | # ordered via DFS. 49 | for pruned_node_id in range(3): 50 | pruned_left_child = pruned_tree.children_left[pruned_node_id] 51 | pruned_right_child = pruned_tree.children_right[pruned_node_id] 52 | 53 | # Check if the pruned node exists in the original tree 54 | assert ( 55 | pruned_left_child in original_tree.children_left 56 | ), "Left child node of pruned tree not found in original tree" 57 | assert ( 58 | pruned_right_child in original_tree.children_right 59 | ), "Right child node of pruned tree not found in original tree" 60 | 61 | # Check if the node's parameters match for non-leaf nodes 62 | if pruned_left_child != -1: 63 | assert ( 64 | pruned_tree.feature[pruned_node_id] == original_tree.feature[pruned_node_id] 65 | ), "Feature does not match for node {}".format(pruned_node_id) 66 | assert ( 67 | pruned_tree.threshold[pruned_node_id] == original_tree.threshold[pruned_node_id] 68 | ), "Threshold does not match for node {}".format(pruned_node_id) 69 | assert ( 70 | pruned_tree.weighted_n_node_samples[pruned_node_id] 71 | == original_tree.weighted_n_node_samples[pruned_node_id] 72 | ), "Weighted n_node samples does not match for node {}".format(pruned_node_id) 73 | -------------------------------------------------------------------------------- /treeple/tests/test_neighbors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from sklearn.datasets import make_blobs, make_classification 4 | from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier 5 | from sklearn.neighbors import NearestNeighbors 6 | from sklearn.tree import ( 7 | DecisionTreeClassifier, 8 | DecisionTreeRegressor, 9 | ExtraTreeClassifier, 10 | ExtraTreeRegressor, 11 | ) 12 | from sklearn.utils.estimator_checks import parametrize_with_checks 13 | 14 | from treeple.ensemble import ( 15 | ObliqueRandomForestClassifier, 16 | PatchObliqueRandomForestClassifier, 17 | UnsupervisedObliqueRandomForest, 18 | UnsupervisedRandomForest, 19 | ) 20 | from treeple.neighbors import NearestNeighborsMetaEstimator 21 | 22 | FORESTS = [ 23 | ObliqueRandomForestClassifier, 24 | PatchObliqueRandomForestClassifier, 25 | UnsupervisedRandomForest, 26 | UnsupervisedObliqueRandomForest, 27 | ] 28 | 29 | 30 | @pytest.mark.parametrize("forest", FORESTS) 31 | def test_similarity_matrix(forest): 32 | n_samples = 200 33 | n_classes = 2 34 | n_features = 5 35 | 36 | X, y = make_blobs( 37 | n_samples=n_samples, centers=n_classes, n_features=n_features, random_state=12345 38 | ) 39 | 40 | clf = forest(random_state=12345) 41 | clf.fit(X, y) 42 | sim_mat = clf.compute_similarity_matrix(X) 43 | 44 | assert sim_mat.shape == (n_samples, n_samples) 45 | assert np.allclose(sim_mat, sim_mat.T) 46 | assert np.all((sim_mat.diagonal() == 1)) 47 | 48 | 49 | @pytest.fixture 50 | def sample_data(): 51 | # Generate sample data for testing 52 | X, y = make_classification(n_samples=100, n_features=10, random_state=42) 53 | return X, y 54 | 55 | 56 | @pytest.mark.parametrize( 57 | "estimator", 58 | [ 59 | DecisionTreeClassifier(random_state=0), 60 | 
DecisionTreeRegressor(random_state=0), 61 | ExtraTreeClassifier(random_state=0), 62 | ExtraTreeRegressor(random_state=0), 63 | RandomForestClassifier(random_state=0, n_estimators=10), 64 | ExtraTreesClassifier(random_state=0, n_estimators=10), 65 | ], 66 | ) 67 | def test_nearest_neighbors_meta_estimator(sample_data, estimator): 68 | X, y = sample_data 69 | estimator.fit(X, y) 70 | 71 | meta_estimator = NearestNeighborsMetaEstimator(estimator) 72 | 73 | # Fit the meta-estimator 74 | meta_estimator.fit(X, y) 75 | 76 | # Test the fitted estimator attribute 77 | assert hasattr(meta_estimator, "estimator_") 78 | 79 | # Test the nearest neighbors estimator 80 | assert isinstance(meta_estimator.neigh_est_, NearestNeighbors) 81 | 82 | # Test the kneighbors method 83 | neigh_dist, neigh_ind = meta_estimator.kneighbors() 84 | assert neigh_dist.shape == (X.shape[0], meta_estimator.n_neighbors) 85 | assert neigh_ind.shape == (X.shape[0], meta_estimator.n_neighbors) 86 | 87 | # Test the radius_neighbors method 88 | neigh_dist, neigh_ind = meta_estimator.radius_neighbors(radius=0.5) 89 | assert neigh_dist.shape == (X.shape[0],) 90 | assert neigh_ind.shape == (X.shape[0],) 91 | 92 | 93 | @parametrize_with_checks( 94 | [ 95 | NearestNeighborsMetaEstimator(DecisionTreeClassifier(random_state=0)), 96 | ] 97 | ) 98 | def test_sklearn_compatible_transformer(estimator, check): 99 | check(estimator) 100 | -------------------------------------------------------------------------------- /examples/treeple/treeple_tutorial_1_1d_HD.py: -------------------------------------------------------------------------------- 1 | """ 2 | ============================== 3 | Calculating Hellinger Distance 4 | ============================== 5 | """ 6 | 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | import seaborn as sns 10 | 11 | from treeple.datasets import make_trunk_classification 12 | from treeple.ensemble import HonestForestClassifier 13 | from treeple.stats import build_oob_forest 14 | 15 | sns.set(color_codes=True, style="white", context="talk", font_scale=1.5) 16 | PALETTE = sns.color_palette("Set1") 17 | sns.set_palette(PALETTE[1:5] + PALETTE[6:], n_colors=9) 18 | sns.set_style("white", {"axes.edgecolor": "#dddddd"}) 19 | 20 | # %% 21 | # Hellinger Distance 22 | # ------------------ 23 | # 24 | # Hellinger distance quantifies the similarity between the two posterior 25 | # probability distributions (class zero and class one). 26 | # 27 | # .. math:: H(\eta(X), 1-\eta(X)) = \frac{1}{\sqrt{2}} \; \bigl\|\sqrt{\eta(X)} - \sqrt{1-\eta(X)} \bigr\|_2 28 | # 29 | # With a binary class simulation as an example, this tutorial will show 30 | # how to use ``treeple`` to calculate the statistic. 
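# As a quick sanity check of the formula on hypothetical posteriors (these
# values are illustrative and not part of the simulation below): chance-level
# posteriors give a distance of zero, while perfectly separated posteriors
# give the maximal value for two samples.

chance_proba = np.array([[0.5, 0.5], [0.5, 0.5]])
perfect_proba = np.array([[1.0, 0.0], [0.0, 1.0]])
for proba in (chance_proba, perfect_proba):
    hd = np.sqrt(np.sum((np.sqrt(proba[:, 1]) - np.sqrt(proba[:, 0])) ** 2)) / np.sqrt(2)
    print(hd)  # prints 0.0 for chance posteriors, then 1.0 for perfect separation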
31 | 32 | # %% 33 | # Create a simulation with two gaussians 34 | # -------------------------------------- 35 | 36 | 37 | # create a binary class simulation with two gaussians 38 | # 500 samples for each class, class zero is standard 39 | # gaussian, and class one has a mean at one 40 | X, y = make_trunk_classification( 41 | n_samples=1000, 42 | n_dim=1, 43 | mu_0=0, 44 | mu_1=1, 45 | n_informative=1, 46 | seed=1, 47 | ) 48 | 49 | 50 | fig, ax = plt.subplots(figsize=(6, 6)) 51 | fig.tight_layout() 52 | ax.tick_params(labelsize=15) 53 | 54 | # histogram plot the samples 55 | ax.hist(X[:500], bins=50, alpha=0.6, color=PALETTE[1], label="negative") 56 | ax.hist(X[500:], bins=50, alpha=0.3, color=PALETTE[0], label="positive") 57 | ax.set_xlabel("Variable One", fontsize=15) 58 | ax.set_ylabel("Likelihood", fontsize=15) 59 | plt.legend(frameon=False, fontsize=15) 60 | plt.show() 61 | 62 | # %% 63 | # Fit the model 64 | # ------------- 65 | 66 | 67 | # initialize the forest with 100 trees 68 | est = HonestForestClassifier( 69 | n_estimators=100, 70 | max_samples=1.6, 71 | max_features=0.3, 72 | bootstrap=True, 73 | stratify=True, 74 | random_state=1, 75 | ) 76 | 77 | # fit the model and obtain the tree posteriors 78 | _, observe_proba = build_oob_forest(est, X, y) 79 | 80 | # generate forest posteriors for the two classes 81 | observe_proba = np.nanmean(observe_proba, axis=0) 82 | 83 | 84 | fig, ax = plt.subplots(figsize=(6, 6)) 85 | fig.tight_layout() 86 | ax.tick_params(labelsize=15) 87 | 88 | # histogram plot the posterior probabilities for class one 89 | ax.hist(observe_proba[:500][:, 1], bins=50, alpha=0.6, color=PALETTE[1], label="negative") 90 | ax.hist(observe_proba[500:][:, 1], bins=50, alpha=0.3, color=PALETTE[0], label="positive") 91 | ax.set_ylabel("# of Samples", fontsize=15) 92 | ax.set_xlabel("Class One Posterior", fontsize=15) 93 | plt.legend(frameon=False, fontsize=15) 94 | plt.show() 95 | 96 | # %% 97 | # Calculate the statistic 98 | # ----------------------- 99 | 100 | 101 | def Calculate_hd(y_pred_proba) -> float: 102 | return np.sqrt( 103 | np.sum((np.sqrt(y_pred_proba[:, 1]) - np.sqrt(y_pred_proba[:, 0])) ** 2) 104 | ) / np.sqrt(2) 105 | 106 | 107 | hd = Calculate_hd(observe_proba) 108 | print("Hellinger distance =", round(hd, 2)) 109 | -------------------------------------------------------------------------------- /examples/sklearn_vs_treeple/plot_iris_dtc.py: -------------------------------------------------------------------------------- 1 | """ 2 | ======================================================================= 3 | Plot the decision surface of decision trees trained on the iris dataset 4 | ======================================================================= 5 | 6 | Plot the decision surface of a decision tree and oblique decision tree 7 | trained on pairs of features of the iris dataset. 8 | 9 | See :ref:`decision tree ` for more information on the estimators. 10 | 11 | For each pair of iris features, the decision tree learns axis-aligned decision 12 | boundaries made of combinations of simple thresholding rules inferred from 13 | the training samples. The oblique decision tree learns oblique decision boundaries 14 | made from linear combinations of the features in the training samples and then 15 | the same thresholding rule as regular decision trees. 16 | 17 | We also show the tree structure of a model built on all of the features. 
18 | """ 19 | 20 | import matplotlib.pyplot as plt 21 | import numpy as np 22 | from sklearn.datasets import load_iris 23 | from sklearn.inspection import DecisionBoundaryDisplay 24 | 25 | from treeple._lib.sklearn.tree import DecisionTreeClassifier, plot_tree 26 | from treeple.tree import ObliqueDecisionTreeClassifier 27 | 28 | # %% 29 | # First load the copy of the Iris dataset shipped with scikit-learn: 30 | iris = load_iris() 31 | 32 | # Parameters 33 | n_classes = 3 34 | plot_colors = "ryb" 35 | plot_step = 0.02 36 | 37 | clf_labels = ["Axis-aligned", "Oblique"] 38 | random_state = 123456 39 | 40 | clfs = [ 41 | DecisionTreeClassifier(random_state=random_state), 42 | ObliqueDecisionTreeClassifier(random_state=random_state), 43 | ] 44 | 45 | for clf, clf_label in zip(clfs, clf_labels): 46 | fig, axes = plt.subplots(2, 3) 47 | axes = axes.flatten() 48 | 49 | for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]): 50 | # We only take the two corresponding features 51 | X = iris.data[:, pair] 52 | y = iris.target 53 | 54 | # Train 55 | clf.fit(X, y) 56 | 57 | # Plot the decision boundary 58 | ax = axes[pairidx] 59 | plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5) 60 | DecisionBoundaryDisplay.from_estimator( 61 | clf, 62 | X, 63 | cmap=plt.cm.RdYlBu, 64 | response_method="predict", 65 | ax=ax, 66 | xlabel=iris.feature_names[pair[0]], 67 | ylabel=iris.feature_names[pair[1]], 68 | ) 69 | 70 | # Plot the training points 71 | for i, color in zip(range(n_classes), plot_colors): 72 | idx = np.where(y == i) 73 | ax.scatter( 74 | X[idx, 0], 75 | X[idx, 1], 76 | c=color, 77 | label=iris.target_names[i], 78 | cmap=plt.cm.RdYlBu, 79 | edgecolor="black", 80 | s=15, 81 | ) 82 | 83 | fig.suptitle(f"Decision surface of {clf_label} decision trees trained on pairs of features") 84 | plt.legend(loc="lower right", borderpad=0, handletextpad=0) 85 | _ = plt.axis("tight") 86 | plt.show() 87 | 88 | # %% 89 | # Display the structure of a single decision tree trained on all the features 90 | # together. 91 | 92 | for clf, clf_label in zip(clfs, clf_labels): 93 | plt.figure() 94 | clf.fit(iris.data, iris.target) 95 | plot_tree(clf, filled=True) 96 | plt.title(f"{clf_label} decision tree trained on all the iris features") 97 | plt.show() 98 | -------------------------------------------------------------------------------- /.github/workflows/pr_checks.yml: -------------------------------------------------------------------------------- 1 | name: "PR Checks" 2 | 3 | concurrency: 4 | group: ${{ github.workflow }}-${{ github.ref }} 5 | cancel-in-progress: true 6 | 7 | on: 8 | pull_request: 9 | branches: 10 | - main 11 | paths: 12 | - "treeple/**" 13 | 14 | jobs: 15 | changelog: 16 | name: CHANGELOG 17 | runs-on: ubuntu-latest 18 | # if: github.event_name == 'pull_request' 19 | if: ${{ contains(github.event.pull_request.labels.*.name, 'No Changelog Needed') == 0 }} 20 | steps: 21 | - name: Get PR number and milestone 22 | run: | 23 | echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> $GITHUB_ENV 24 | echo "TAGGED_MILESTONE=${{ github.event.pull_request.milestone.title }}" >> $GITHUB_ENV 25 | - uses: actions/checkout@v4 26 | with: 27 | fetch-depth: "0" 28 | - name: Check that CHANGELOG has been updated 29 | run: | 30 | # If this step fails, this means you haven't updated the CHANGELOG.md 31 | # file with notes on your contribution. 32 | # git diff --name-only $(git merge-base origin/main HEAD) | grep '^CHANGELOG.md$' && echo "Thanks for helping keep our CHANGELOG up-to-date!" 
33 | set -xe 34 | changed_files=$(git diff --name-only origin/main) 35 | # Changelog should be updated only if tests have been modified 36 | if [[ ! "$changed_files" =~ tests ]] 37 | then 38 | exit 0 39 | fi 40 | all_changelogs=$(cat ./doc/whats_new/v*.rst) 41 | if [[ "$all_changelogs" =~ :pr:\`#$PR_NUMBER\` ]] 42 | then 43 | echo "Changelog has been updated." 44 | # If the pull request is milestoned, check the corresponding changelog 45 | if [ -f ./doc/whats_new/v${TAGGED_MILESTONE:0:4}.rst ] 46 | then 47 | expected_changelog=$(cat ./doc/whats_new/v${TAGGED_MILESTONE:0:4}.rst) 48 | if [[ "$expected_changelog" =~ :pr:\`#$PR_NUMBER\` ]] 49 | then 50 | echo "Changelog and milestone correspond." 51 | else 52 | echo "Changelog and milestone do not correspond." 53 | echo "If you see this error, make sure that the tagged milestone for the PR" 54 | echo "and the edited changelog filename properly match." 55 | exit 1 56 | fi 57 | fi 58 | else 59 | echo "A Changelog entry is missing for :pr:\`#$PR_NUMBER\`" 60 | echo "" 61 | echo "Please add an entry to the changelog at 'doc/whats_new/v*.rst'" 62 | echo "to document your change assuming that the PR will be merged" 63 | echo "in time for the next release of treeple." 64 | echo "" 65 | echo "Look at other entries in that file for inspiration and please" 66 | echo "reference this pull request using the ':pr:' directive and" 67 | echo "credit yourself (and other contributors if applicable) with" 68 | echo "the ':user:' directive." 69 | echo "" 70 | echo "If you see this error and there is already a changelog entry," 71 | echo "check that the PR number is correct." 72 | echo "" 73 | echo "If you believe that this PR does not warrant a changelog" 74 | echo "entry, say so in a comment so that a maintainer will label" 75 | echo "the PR with 'No Changelog Needed' to bypass this check."
76 | exit 1 77 | fi 78 | -------------------------------------------------------------------------------- /benchmarks_nonasv/bench_plot_urf.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from time import time 3 | 4 | import numpy as np 5 | from numpy import random as nr 6 | 7 | from treeple import UnsupervisedObliqueRandomForest, UnsupervisedRandomForest 8 | 9 | 10 | def compute_bench(samples_range, features_range): 11 | it = 0 12 | results = defaultdict(lambda: []) 13 | 14 | est_params = {"min_samples_split": 5, "criterion": "fastbic", "n_jobs": None} 15 | 16 | max_it = len(samples_range) * len(features_range) 17 | for n_samples in samples_range: 18 | for n_features in features_range: 19 | it += 1 20 | 21 | print("==============================") 22 | print("Iteration %03d of %03d" % (it, max_it)) 23 | print("==============================") 24 | print() 25 | print(f"n_samples: {n_samples} and n_features: {n_features}") 26 | data = nr.randint(-50, 51, (n_samples, n_features)) 27 | 28 | print("Unsupervised RF") 29 | tstart = time() 30 | est = UnsupervisedRandomForest(**est_params).fit(data) 31 | 32 | delta = time() - tstart 33 | max_depth = max(tree.get_depth() for tree in est.estimators_) 34 | print("Speed: %0.3fs" % delta) 35 | print("Max depth: %d" % max_depth) 36 | print() 37 | 38 | results["unsup_rf_speed"].append(delta) 39 | results["unsup_rf_depth"].append(max_depth) 40 | 41 | print("Unsupervised Oblique RF") 42 | # now benchmark the oblique variant on the same data 43 | est = UnsupervisedObliqueRandomForest(**est_params) 44 | tstart = time() 45 | est.fit(data) 46 | delta = time() - tstart 47 | max_depth = max(tree.get_depth() for tree in est.estimators_) 48 | print("Speed: %0.3fs" % delta) 49 | print("Max depth: %d" % max_depth) 50 | print() 51 | print() 52 | 53 | results["unsup_obliquerf_speed"].append(delta) 54 | results["unsup_obliquerf_depth"].append(max_depth) 55 | 56 | return results 57 | 58 | 59 | if __name__ == "__main__": 60 | import matplotlib.pyplot as plt 61 | from mpl_toolkits.mplot3d import axes3d  # noqa: F401 -- registers the 3d projection 62 | 63 | samples_range = np.linspace(50, 150, 5).astype(int) 64 | features_range = np.linspace(150, 50000, 5).astype(int) 65 | 66 | results = compute_bench(samples_range, features_range) 67 | 68 | max_time = max([max(i) for i in [t for (label, t) in results.items() if "speed" in label]]) 69 | max_depth_lim = max( 70 | [max(i) for i in [t for (label, t) in results.items() if "speed" not in label]] 71 | ) 72 | 73 | fig = plt.figure("treeple Unsupervised (Oblique and Axis) RF benchmark results") 74 | for c, (label, timings) in zip("brcy", sorted(results.items())): 75 | if "speed" in label: 76 | ax = fig.add_subplot(2, 1, 1, projection="3d") 77 | ax.set_zlim3d(0.0, max_time * 1.1) 78 | else: 79 | ax = fig.add_subplot(2, 1, 2, projection="3d") 80 | ax.set_zlim3d(0.0, max_depth_lim * 1.1) 81 | 82 | X, Y = np.meshgrid(samples_range, features_range) 83 | Z = np.asarray(timings).reshape(samples_range.shape[0], features_range.shape[0]) 84 | ax.plot_surface(X, Y, Z.T, cstride=1, rstride=1, color=c, alpha=0.5) 85 | ax.set_title(f"{label}") 86 | ax.set_xlabel("n_samples") 87 | ax.set_ylabel("n_features") 88 | 89 | plt.show() 90 | -------------------------------------------------------------------------------- /examples/treeple/treeple_tutorial_1_1b_MI.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | ============== 3 | Calculating MI 4 | ============== 5 | """ 6 | 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | import seaborn as sns 10 | from scipy.stats import entropy 11 | 12 | from treeple.datasets import make_trunk_classification 13 | from treeple.ensemble import HonestForestClassifier 14 | from treeple.stats import build_oob_forest 15 | 16 | sns.set(color_codes=True, style="white", context="talk", font_scale=1.5) 17 | PALETTE = sns.color_palette("Set1") 18 | sns.set_palette(PALETTE[1:5] + PALETTE[6:], n_colors=9) 19 | sns.set_style("white", {"axes.edgecolor": "#dddddd"}) 20 | # %% 21 | # MI 22 | # -- 23 | # 24 | # Mutual Information (*MI*) measures the mutual dependence between *X* and 25 | # *Y*. It can be calculated by the difference between the class entropy 26 | # (``H(Y)``) and the conditional entropy (``H(Y | X)``): 27 | # 28 | # .. math:: I(X; Y) = H(Y) - H(Y\mid X) 29 | # 30 | # With a binary class simulation as an example, this tutorial will show 31 | # how to use ``treeple`` to calculate the statistic. 32 | 33 | # %% 34 | # Create a simulation with two gaussians 35 | # -------------------------------------- 36 | 37 | 38 | # create a binary class simulation with two gaussians 39 | # 500 samples for each class, class zero is standard 40 | # gaussian, and class one has a mean at one 41 | X, y = make_trunk_classification( 42 | n_samples=1000, 43 | n_dim=1, 44 | mu_0=0, 45 | mu_1=1, 46 | n_informative=1, 47 | seed=1, 48 | ) 49 | 50 | 51 | fig, ax = plt.subplots(figsize=(6, 6)) 52 | fig.tight_layout() 53 | ax.tick_params(labelsize=15) 54 | 55 | # histogram plot the samples 56 | ax.hist(X[:500], bins=50, alpha=0.6, color=PALETTE[1], label="negative") 57 | ax.hist(X[500:], bins=50, alpha=0.3, color=PALETTE[0], label="positive") 58 | ax.set_xlabel("Variable One", fontsize=15) 59 | ax.set_ylabel("Likelihood", fontsize=15) 60 | plt.legend(frameon=False, fontsize=15) 61 | plt.show() 62 | 63 | 64 | # %% 65 | # Fit the model 66 | # ------------- 67 | 68 | 69 | # initialize the forest with 100 trees 70 | est = HonestForestClassifier( 71 | n_estimators=100, 72 | max_samples=1.6, 73 | max_features=0.3, 74 | bootstrap=True, 75 | stratify=True, 76 | random_state=1, 77 | ) 78 | 79 | # fit the model and obtain the tree posteriors 80 | _, observe_proba = build_oob_forest(est, X, y) 81 | 82 | # generate forest posteriors for the two classes 83 | observe_proba = np.nanmean(observe_proba, axis=0) 84 | 85 | 86 | fig, ax = plt.subplots(figsize=(6, 6)) 87 | fig.tight_layout() 88 | ax.tick_params(labelsize=15) 89 | 90 | # histogram plot the posterior probabilities for class one 91 | ax.hist(observe_proba[:500][:, 1], bins=50, alpha=0.6, color=PALETTE[1], label="negative") 92 | ax.hist(observe_proba[500:][:, 1], bins=50, alpha=0.3, color=PALETTE[0], label="positive") 93 | ax.set_ylabel("# of Samples", fontsize=15) 94 | ax.set_xlabel("Class One Posterior", fontsize=15) 95 | plt.legend(frameon=False, fontsize=15) 96 | plt.show() 97 | 98 | 99 | # %% 100 | # Calculate the statistic 101 | # ----------------------- 102 | def Calculate_MI(y_true, y_pred_proba): 103 | # calculate the conditional entropy 104 | H_YX = np.mean(entropy(y_pred_proba, base=np.exp(1), axis=1)) 105 | 106 | # empirical count of each class (n_classes) 107 | _, counts = np.unique(y_true, return_counts=True) 108 | # calculate the entropy of labels 109 | H_Y = entropy(counts, base=np.exp(1)) 110 | return H_Y - H_YX 111 | 112 | 113 | 
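# Note: ``entropy`` above is computed with the natural logarithm
# (``base=np.exp(1)``), so the mutual information reported below is measured
# in nats rather than bits.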
mi = Calculate_MI(y, observe_proba) 114 | print("MI =", round(mi, 2)) 115 | -------------------------------------------------------------------------------- /treeple/_lib/meson.build: -------------------------------------------------------------------------------- 1 | fs = import('fs') 2 | if not fs.exists('sklearn') 3 | error('Missing the `sklearn` fork submodule! Run `git submodule update --init` to fix this.') 4 | endif 5 | 6 | # install tree/ submodule 7 | tree_extension_metadata = { 8 | '_tree': 9 | {'sources': ['./sklearn/tree/' + '_tree.pyx'], 10 | 'override_options': ['cython_language=cpp', 'optimization=3']}, 11 | '_partitioner': 12 | {'sources': ['./sklearn/tree/' + '_partitioner.pyx'], 13 | 'override_options': ['cython_language=cpp', 'optimization=3']}, 14 | '_splitter': 15 | {'sources': ['./sklearn/tree/' + '_splitter.pyx'], 16 | 'override_options': ['cython_language=cpp', 'optimization=3']}, 17 | '_criterion': 18 | {'sources': ['./sklearn/tree/' + '_criterion.pyx'], 19 | 'override_options': ['cython_language=cpp', 'optimization=3']}, 20 | '_utils': 21 | {'sources': ['./sklearn/tree/' + '_utils.pyx'], 22 | 'override_options': ['cython_language=cpp', 'optimization=3']}, 23 | } 24 | 25 | 26 | foreach ext_name, ext_dict : tree_extension_metadata 27 | py.extension_module( 28 | ext_name, 29 | ext_dict.get('sources'), 30 | dependencies: [np_dep], 31 | override_options : ext_dict.get('override_options', []), 32 | cython_args: cython_c_args, 33 | subdir: 'treeple/_lib/sklearn/tree/', 34 | install: true 35 | ) 36 | endforeach 37 | 38 | python_sources = [ 39 | './sklearn/tree/__init__.py', 40 | './sklearn/tree/_classes.py', 41 | './sklearn/tree/_export.py', 42 | './sklearn/tree/_reingold_tilford.py', 43 | ] 44 | 45 | py.install_sources( 46 | python_sources, 47 | subdir: 'treeple/_lib/sklearn/tree' # Folder relative to site-packages to install to 48 | ) 49 | 50 | # install ensemble/ submodule 51 | python_sources = [ 52 | '_forest.py', 53 | ] 54 | foreach py_source: python_sources 55 | py.install_sources( 56 | './sklearn/ensemble/' + py_source, 57 | subdir: 'treeple/_lib/sklearn/ensemble' 58 | ) 59 | endforeach 60 | 61 | # TODO: Can remove if included in scikit-learn eventually 62 | # install neighbors/ submodule 63 | extensions = [ 64 | '_quad_tree', 65 | ] 66 | 67 | foreach ext: extensions 68 | py.extension_module( 69 | ext, 70 | ['./sklearn/neighbors/' + ext + '.pyx'], 71 | c_args: c_args, 72 | dependencies: [np_dep], 73 | cython_args: cython_c_args, 74 | override_options : ['optimization=3', 'cython_language=cpp'], 75 | install: true, 76 | subdir: 'treeple/_lib/sklearn/neighbors/', 77 | ) 78 | endforeach 79 | 80 | # install utils/ submodule 81 | extensions = [ 82 | '_typedefs', 83 | '_random', 84 | ] 85 | 86 | foreach ext: extensions 87 | py.extension_module(ext, 88 | ['./sklearn/utils/' + ext + '.pyx'], 89 | c_args: c_args, 90 | dependencies: [np_dep], 91 | cython_args: cython_c_args, 92 | override_options : ['optimization=3', 'cython_language=cpp'], 93 | install: true, 94 | subdir: 'treeple/_lib/sklearn/utils/', 95 | ) 96 | endforeach 97 | 98 | 99 | # python_sources = [ 100 | # '__init__.py', 101 | # ] 102 | 103 | # py.install_sources( 104 | # python_sources, 105 | # subdir: 'treeple/_lib' # Folder relative to site-packages to install to 106 | # ) 107 | 108 | # tempita = files('./sklearn/_build_utils/tempita.py') 109 | 110 | # # Copy all the .py files to the install dir, rather than using 111 | # # py.install_sources and needing to list them explicitly one by one 112 | # # 
install_subdir('sklearn', install_dir: py.get_install_dir()) 113 | # install_subdir('sklearn', install_dir: join_paths(py.get_install_dir(), 'treeple/_lib')) 114 | 115 | # subdir('sklearn') 116 | -------------------------------------------------------------------------------- /treeple/tree/unsupervised/_unsup_oblique_splitter.pxd: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from libcpp.vector cimport vector 4 | 5 | from ..._lib.sklearn.tree._splitter cimport SplitRecord 6 | from ..._lib.sklearn.tree._tree cimport ParentInfo 7 | from ..._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t, uint32_t 8 | from .._oblique_splitter cimport ObliqueSplitRecord 9 | from ._unsup_splitter cimport UnsupervisedSplitter 10 | 11 | # cdef struct ObliqueSplitRecord: 12 | # # Data to track sample split 13 | # intp_t feature # Which feature to split on. 14 | # intp_t pos # Split samples array at the given position, 15 | # # # i.e. count of samples below threshold for feature. 16 | # # # pos is >= end if the node is a leaf. 17 | # float64_t threshold # Threshold to split at. 18 | # float64_t improvement # Impurity improvement given parent node. 19 | # float64_t impurity_left # Impurity of the left split. 20 | # float64_t impurity_right # Impurity of the right split. 21 | # intp_t n_constant_features # Number of constant features in the split. 22 | 23 | # vector[float32_t]* proj_vec_weights # weights of the vector (max_features,) 24 | # vector[intp_t]* proj_vec_indices # indices of the features (max_features,) 25 | 26 | 27 | cdef class UnsupervisedObliqueSplitter(UnsupervisedSplitter): 28 | """ 29 | Notable changes wrt scikit-learn: 30 | 1. `weighted_n_node_samples` is used as a stopping criterion and just used to 31 | keep count of the "number of samples (weighted)". All samples have a default weight 32 | of '1'. 33 | 2. `X` array instead of `y` array is stored as the criterions are computed over the X 34 | array. 35 | """ 36 | 37 | # Oblique Splitting extra parameters 38 | cdef public float64_t feature_combinations # Number of features to combine 39 | cdef intp_t n_non_zeros # Number of non-zero features 40 | cdef vector[vector[float32_t]] proj_mat_weights # nonzero weights of sparse proj_mat matrix 41 | cdef vector[vector[intp_t]] proj_mat_indices # nonzero indices of sparse proj_mat matrix 42 | cdef intp_t[::1] indices_to_sample # an array of indices to sample of size mtry X n_features 43 | 44 | # All oblique splitters (i.e. non-axis aligned splitters) require a 45 | # function to sample a projection matrix that is applied to the feature matrix 46 | # to quickly obtain the sampled projections for candidate splits. 
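    # As a conceptual illustration (not part of this interface): for each sample
    # ``i`` in a node, the projected feature value that candidate splits are
    # evaluated on is the sparse dot product
    #     x_proj[i] = sum_k proj_vec_weights[k] * X[samples[i], proj_vec_indices[k]]
    # i.e. the dense equivalent of applying one sparse column of the sampled
    # projection matrix to the node's samples.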
47 | cdef void sample_proj_mat(self, 48 | vector[vector[float32_t]]& proj_mat_weights, 49 | vector[vector[intp_t]]& proj_mat_indices) noexcept nogil 50 | 51 | # Redefined here since the new logic requires calling sample_proj_mat 52 | cdef int node_reset(self, intp_t start, intp_t end, 53 | float64_t* weighted_n_node_samples) except -1 nogil 54 | 55 | cdef int node_split( 56 | self, 57 | ParentInfo* parent, 58 | SplitRecord* split, 59 | ) except -1 nogil 60 | cdef int init( 61 | self, 62 | const float32_t[:, :] X, 63 | const float64_t[:] sample_weight 64 | ) except -1 65 | cdef intp_t pointer_size(self) noexcept nogil 66 | 67 | cdef void compute_features_over_samples( 68 | self, 69 | intp_t start, 70 | intp_t end, 71 | const intp_t[:] samples, 72 | float32_t[:] feature_values, 73 | vector[float32_t]* proj_vec_weights, # weights of the vector (max_features,) 74 | vector[intp_t]* proj_vec_indices # indices of the features (max_features,) 75 | ) noexcept nogil 76 | -------------------------------------------------------------------------------- /examples/sparse_oblique_trees/plot_oblique_axis_aligned_forests_sparse_parity.py: -------------------------------------------------------------------------------- 1 | """ 2 | ========================================================================================== 3 | Plot oblique forest and axis-aligned random forest predictions on sparse parity simulation 4 | ========================================================================================== 5 | A performance comparison between oblique forest and standard axis- 6 | aligned random forest using sparse parity simulation dataset. 7 | Sparse parity is a variation of the noisy parity problem, 8 | which itself is a multivariate generalization of the noisy XOR problem. 9 | This is a binary classification task in high dimensions. The simulation 10 | will generate uniformly distributed `n_samples` number of sample points 11 | in the range of -1 and +1 with `p` number of features. `p*` is a 12 | parameter used to limit features that carry information about the class. 13 | The informative binary label is then defined as 1 if there are odd number 14 | of the sum of data `X` across first `p*` features that are greater than 0, 15 | otherwise the label is defined as 0. The simulation is further detailed 16 | in this [publication](https://epubs.siam.org/doi/epdf/10.1137/1.9781611974973.56). 
17 | """ 18 | 19 | from datetime import datetime 20 | 21 | import matplotlib.pyplot as plt 22 | import numpy as np 23 | import pandas as pd 24 | import seaborn as sns 25 | from sklearn.ensemble import RandomForestClassifier 26 | from sklearn.model_selection import RepeatedKFold, cross_validate 27 | 28 | from treeple import ObliqueRandomForestClassifier 29 | 30 | random_state = 123456 31 | t0 = datetime.now() 32 | 33 | 34 | def sparse_parity(n_samples, p=20, p_star=3, random_seed=None, **kwargs): 35 | if random_seed: 36 | np.random.seed(random_seed) 37 | 38 | X = np.random.uniform(-1, 1, (n_samples, p)) 39 | y = np.zeros(n_samples) 40 | 41 | for i in range(0, n_samples): 42 | y[i] = sum(X[i, :p_star] > 0) % 2 43 | 44 | return X, y 45 | 46 | 47 | def get_scores(X, y, n_cv=5, n_repeats=1, random_state=1, kwargs=None): 48 | clfs = [ 49 | RandomForestClassifier(**kwargs[0], random_state=random_state), 50 | ObliqueRandomForestClassifier(**kwargs[1], random_state=random_state), 51 | ] 52 | 53 | tmp = [] 54 | 55 | for i, clf in enumerate(clfs): 56 | cv = RepeatedKFold(n_splits=n_cv, n_repeats=n_repeats, random_state=random_state) 57 | test_score = cross_validate(estimator=clf, X=X, y=y, cv=cv, scoring="accuracy") 58 | 59 | tmp.append([["RF", "OF"][i], test_score["test_score"], test_score["test_score"].mean()]) 60 | 61 | df = pd.DataFrame(tmp, columns=["model", "score", "mean"]) 62 | df = df.explode("score") 63 | df["score"] = df["score"].astype(float) 64 | df.reset_index(inplace=True, drop=True) 65 | 66 | return df 67 | 68 | 69 | # Grid searched hyper-parameters 70 | params = [ 71 | {"max_features": None, "n_estimators": 100, "max_depth": None}, 72 | {"max_features": 40, "n_estimators": 100, "max_depth": 20}, 73 | ] 74 | 75 | X, y = sparse_parity(n_samples=1000, random_seed=random_state) 76 | 77 | df = get_scores(X=X, y=y, n_cv=3, n_repeats=1, random_state=random_state, kwargs=params) 78 | t_d = (datetime.now() - t0).seconds 79 | print(f"It took {t_d} seconds to run the script") 80 | 81 | # Draw a comparison plot 82 | fig, ax = plt.subplots(1, 1, figsize=(6, 6)) 83 | 84 | sns.stripplot(data=df, x="model", y="score", ax=ax, dodge=True) 85 | sns.boxplot(data=df, x="model", y="score", ax=ax, color="white") 86 | ax.set_title("Sparse Parity") 87 | 88 | rf = df.query('model=="RF"')["mean"].iloc[0] 89 | rff = f"RF (Mean Test Score: {round(rf,3)})" 90 | 91 | of = df.query('model=="OF"')["mean"].iloc[0] 92 | off = f"OF (Mean Test Score: {round(of,3)})" 93 | 94 | ax.legend([rff, off], loc=4) 95 | 96 | plt.savefig(f"plot_sim_{t_d}s.jpg") 97 | plt.show() 98 | -------------------------------------------------------------------------------- /treeple/__init__.py: -------------------------------------------------------------------------------- 1 | """Scikit manifold oblique random forests.""" 2 | 3 | import logging 4 | import os 5 | import sys 6 | 7 | __version__ = "0.10.3" 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | # On OSX, we can get a runtime error due to multiple OpenMP libraries loaded 12 | # simultaneously. This can happen for instance when calling BLAS inside a 13 | # prange. Setting the following environment variable allows multiple OpenMP 14 | # libraries to be loaded. 
It should not degrade performance since we manually 15 | # take care of potential over-subscription performance issues, in sections of 16 | # the code where nested OpenMP loops can happen, by dynamically reconfiguring 17 | # the inner OpenMP runtime to temporarily disable it while under the scope of 18 | # the outer OpenMP parallel section. 19 | os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "True") 20 | 21 | # Workaround issue discovered in intel-openmp 2019.5: 22 | # https://github.com/ContinuumIO/anaconda-issues/issues/11294 23 | os.environ.setdefault("KMP_INIT_AT_FORK", "FALSE") 24 | 25 | 26 | try: 27 | # This variable is injected in the __builtins__ by the build 28 | # process. It is used to enable importing subpackages of treeple when 29 | # the binaries are not built 30 | __treeple_SETUP__ # type: ignore 31 | except NameError: 32 | __treeple_SETUP__ = False 33 | 34 | if __treeple_SETUP__: 35 | sys.stderr.write("Running from treeple source directory.\n") 36 | sys.stderr.write("Partial import of treeple during the build process.\n") 37 | # We are not importing the rest of treeple during the build 38 | # process, as it may not be compiled yet 39 | else: 40 | try: 41 | from . import _lib, tree, ensemble, experimental, stats 42 | from ._lib.sklearn.ensemble._forest import ( 43 | RandomForestClassifier, 44 | RandomForestRegressor, 45 | ExtraTreesClassifier, 46 | ExtraTreesRegressor, 47 | ) 48 | from .neighbors import NearestNeighborsMetaEstimator 49 | from .ensemble import ExtendedIsolationForest, MultiViewRandomForestClassifier 50 | from .ensemble._unsupervised_forest import ( 51 | UnsupervisedRandomForest, 52 | UnsupervisedObliqueRandomForest, 53 | ) 54 | from .ensemble._supervised_forest import ( 55 | ExtraObliqueRandomForestClassifier, 56 | ExtraObliqueRandomForestRegressor, 57 | ObliqueRandomForestClassifier, 58 | ObliqueRandomForestRegressor, 59 | PatchObliqueRandomForestClassifier, 60 | PatchObliqueRandomForestRegressor, 61 | ) 62 | from .ensemble._honest_forest import HonestForestClassifier 63 | except ImportError as e: 64 | print(e.msg) 65 | msg = """Error importing treeple: you cannot import treeple while 66 | being in treeple source directory; please exit the treeple source 67 | tree first and relaunch your Python interpreter.""" 68 | raise Exception(e) 69 | # raise ImportError(msg) from e 70 | 71 | __all__ = [ 72 | "_lib", 73 | "tree", 74 | "experimental", 75 | "ensemble", 76 | "stats", 77 | "ExtraObliqueRandomForestClassifier", 78 | "ExtraObliqueRandomForestRegressor", 79 | "NearestNeighborsMetaEstimator", 80 | "ObliqueRandomForestClassifier", 81 | "ObliqueRandomForestRegressor", 82 | "PatchObliqueRandomForestClassifier", 83 | "PatchObliqueRandomForestRegressor", 84 | "UnsupervisedRandomForest", 85 | "UnsupervisedObliqueRandomForest", 86 | "HonestForestClassifier", 87 | "RandomForestClassifier", 88 | "RandomForestRegressor", 89 | "ExtraTreesClassifier", 90 | "ExtraTreesRegressor", 91 | "ExtendedIsolationForest", 92 | "MultiViewRandomForestClassifier", 93 | ] 94 | -------------------------------------------------------------------------------- /examples/quantile_predictions/plot_quantile_vs_standard_oblique_forest.py: -------------------------------------------------------------------------------- 1 | """ 2 | ============================================================== 3 | Quantile regression with oblique regression forest 4 | ============================================================== 5 | 6 | An example to generate quantile predictions using an oblique random forest 7 | 
instance on a synthetic, right-skewed dataset. 8 | 9 | This example was heavily inspired by `quantile-forest `_ 10 | package. See their package `here `_. 11 | """ 12 | 13 | from collections import defaultdict 14 | 15 | import matplotlib.pyplot as plt 16 | import numpy as np 17 | import scipy as sp 18 | from sklearn.model_selection import train_test_split 19 | from sklearn.utils.validation import check_random_state 20 | 21 | from treeple.ensemble import ObliqueRandomForestRegressor 22 | 23 | rng = check_random_state(0) 24 | 25 | # %% 26 | # Generate the data 27 | # ----------------- 28 | # We use a synthetic dataset with 2 features and 5000 samples. The target is 29 | # generated from a skewed normal distribution. (The mean of the distribution 30 | # is to the right of the median.) 31 | 32 | n_samples = 5000 33 | a, loc, scale = 5, -1, 1 34 | skewnorm_rv = sp.stats.skewnorm(a, loc, scale) 35 | skewnorm_rv.random_state = rng 36 | y = skewnorm_rv.rvs(n_samples) 37 | X = rng.randn(n_samples, 2) * y.reshape(-1, 1) 38 | 39 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0) 40 | 41 | regr_orf = ObliqueRandomForestRegressor(n_estimators=10, random_state=0) 42 | 43 | regr_orf.fit(X_train, y_train) 44 | 45 | y_pred_orf = regr_orf.predict(X_test) 46 | # %% 47 | # Generate Quantile Predictions 48 | # ----------------------------- 49 | # The idea is that, for each prediction, the training samples that fell into the same leaf nodes 50 | # are collected and then used to generate the quantile statistics for the desired prediction. 51 | 52 | # Get the leaf-nodes the training samples fall into 53 | leaf_ids = regr_orf.apply(X_train) 54 | # create a list of dictionaries (one per tree) that map each leaf node 55 | # to the samples that fell into it 56 | node_to_indices = [] 57 | for tree in range(leaf_ids.shape[1]): 58 | d = defaultdict(list) 59 | for id, leaf in enumerate(leaf_ids[:, tree]): 60 | d[leaf].append(id) 61 | node_to_indices.append(d) 62 | # apply the trained trees to X_test and 63 | # get the leaf node that each test sample falls into 64 | leaf_ids_test = regr_orf.apply(X_test) 65 | # for each sample, collect the indices of the samples that fell into 66 | # the same leaf node for each tree 67 | y_pred_quantile = [] 68 | for sample in range(leaf_ids_test.shape[0]): 69 | li = [ 70 | node_to_indices[tree][leaf_ids_test[sample][tree]] for tree in range(leaf_ids_test.shape[1]) 71 | ] 72 | # merge the list of indices into one 73 | idx = [item for sublist in li for item in sublist] 74 | # get the y_train for each corresponding id 75 | y_pred_quantile.append(y_train[idx]) 76 | # get the quantile predictions for each predicted sample 77 | y_pred_quantile = [np.quantile(y_pred_quantile[i], 0.5) for i in range(len(y_pred_quantile))] 78 | 79 | # %% 80 | # Plot the results 81 | # ---------------- 82 | # The plot shows the distribution of the actual target values and the predicted median 83 | # (i.e. 0.5 quantile), and the mean prediction by the oblique random forest regressor. 84 | # In this skewed dataset, the median prediction using the quantile method works better at 85 | # predicting the off-centered target distribution than the regular mean prediction.
86 | 87 | colors = ["#c0c0c0", "#a6e5ff", "#e7a4f5"] 88 | names = ["Actual", "QRF (Median)", "ORF (Mean)"] 89 | plt.hist([y_test, y_pred_quantile, y_pred_orf], bins=50, color=colors, label=names) 90 | plt.xlabel("Actual and Predicted Target Values") 91 | plt.ylabel("Counts") 92 | plt.legend() 93 | plt.show() 94 | -------------------------------------------------------------------------------- /examples/quantile_predictions/plot_quantile_toy_example_with_RF.py: -------------------------------------------------------------------------------- 1 | """ 2 | ====================================================== 3 | Quantile prediction with Random Forest Regressor class 4 | ====================================================== 5 | 6 | An example that demonstrates how to use the Random Forest to generate 7 | quantile predictions such as conditional median and prediction intervals. 8 | The example compares the predictions to a ground truth function used 9 | to generate noisy samples. 10 | 11 | This example was heavily inspired by `quantile-forest `_ 12 | package. See their package `here `_. 13 | """ 14 | 15 | from collections import defaultdict 16 | 17 | import matplotlib.pyplot as plt 18 | import numpy as np 19 | from sklearn.ensemble import RandomForestRegressor 20 | from sklearn.model_selection import train_test_split 21 | 22 | # %% 23 | # Generate the data 24 | 25 | 26 | def make_toy_dataset(n_samples, seed=0): 27 | rng = np.random.RandomState(seed) 28 | 29 | x = rng.uniform(0, 10, size=n_samples) 30 | f = x * np.sin(x) 31 | 32 | sigma = 0.25 + x / 10 33 | noise = rng.lognormal(sigma=sigma) - np.exp(sigma**2 / 2) 34 | y = f + noise 35 | 36 | return np.atleast_2d(x).T, y 37 | 38 | 39 | n_samples = 1000 40 | X, y = make_toy_dataset(n_samples) 41 | 42 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 43 | 44 | xx = np.atleast_2d(np.linspace(0, 10, n_samples)).T 45 | 46 | 47 | # %% 48 | # Fit the model to the training samples 49 | # ------------------------------------- 50 | 51 | rf = RandomForestRegressor(max_depth=3, random_state=0) 52 | rf.fit(X_train, y_train) 53 | 54 | y_pred = rf.predict(xx) 55 | 56 | # get the leaf nodes that each sample fell into 57 | leaf_ids = rf.apply(X_train) 58 | # create a list of dictionaries (one per tree) that map each leaf node 59 | # to the samples that fell into it 60 | node_to_indices = [] 61 | for tree in range(leaf_ids.shape[1]): 62 | d = defaultdict(list) 63 | for id, leaf in enumerate(leaf_ids[:, tree]): 64 | d[leaf].append(id) 65 | node_to_indices.append(d) 66 | # apply the trained trees to the evaluation grid ``xx`` and 67 | # get the leaf node that each grid sample falls into 68 | leaf_ids_test = rf.apply(xx) 69 | # for each sample, collect the indices of the samples that fell into 70 | # the same leaf node for each tree 71 | y_pred_quantile = [] 72 | for sample in range(leaf_ids_test.shape[0]): 73 | li = [ 74 | node_to_indices[tree][leaf_ids_test[sample][tree]] for tree in range(leaf_ids_test.shape[1]) 75 | ] 76 | # merge the list of indices into one 77 | idx = [item for sublist in li for item in sublist] 78 | # get the y_train for each corresponding id 79 | y_pred_quantile.append(y_train[idx]) 80 | # get the quantile predictions for each predicted sample 81 | y_pred_low = [np.quantile(y_pred_quantile[i], 0.025) for i in range(len(y_pred_quantile))] 82 | y_pred_med = [np.quantile(y_pred_quantile[i], 0.5) for i in range(len(y_pred_quantile))] 83 | y_pred_upp = [np.quantile(y_pred_quantile[i], 0.975) for i in range(len(y_pred_quantile))] 84 | 85 | # %% 86 | # Plot the 
results 87 | # ---------------- 88 | # Plot the conditional median and prediction intervals. 89 | # The blue line is the predicted median and the shaded area indicates the 95% confidence interval 90 | # of the prediction. The dots are the held-out test observations and the black line indicates the 91 | # function that was used to generate the noisy samples. 92 | 93 | plt.plot(X_test, y_test, ".", c="#f2a619", label="Test Observations", ms=5) 94 | plt.plot(xx, (xx * np.sin(xx)), c="black", label=r"$f(x) = x\,\sin(x)$", lw=2) 95 | plt.plot(xx, y_pred_med, c="#006aff", label="Predicted Median", lw=3, ms=5) 96 | plt.fill_between( 97 | xx.ravel(), 98 | y_pred_low, 99 | y_pred_upp, 100 | color="#e0f2ff", 101 | label="Predicted 95% Interval", 102 | ) 103 | plt.xlabel("$x$") 104 | plt.ylabel("$f(x)$") 105 | plt.legend(loc="upper left") 106 | plt.show() 107 | -------------------------------------------------------------------------------- /doc/modules/ensemble.rst: -------------------------------------------------------------------------------- 1 | .. _oblique_forests: 2 | 3 | Oblique Random Forests 4 | ---------------------- 5 | 6 | In oblique random forests (see the :class:`~treeple.ObliqueRandomForestClassifier` and 7 | :class:`~treeple.ObliqueRandomForestRegressor` classes), each tree in the ensemble is built 8 | from a sample drawn with replacement (i.e., a bootstrap sample) from the 9 | training set. An oblique random forest is the same as a random forest, 10 | except in how the splits are computed in each tree. 11 | 12 | Similar to how random forests achieve a reduced variance by combining diverse trees, 13 | sometimes at the cost of a slight increase in bias, oblique random forests aim to do the same. 14 | They are motivated to construct even more diverse trees, thereby improving model generalization. 15 | In practice, the variance reduction is often significant, hence yielding an overall better model. 16 | 17 | In contrast to the original publication :footcite:`breiman2001random`, the treeple 18 | implementation allows the user to control the number of features to combine in computing 19 | candidate splits. This is done via the ``feature_combinations`` parameter. For 20 | more information and intuition, see 21 | :ref:`documentation on oblique decision trees `. 22 | 23 | .. topic:: Examples: 24 | 25 | * :ref:`sphx_glr_auto_examples_sparse_oblique_trees_plot_oblique_random_forest.py` 26 | * :ref:`sphx_glr_auto_examples_sparse_oblique_trees_plot_oblique_axis_aligned_forests_sparse_parity.py` 27 | 28 | .. topic:: References 29 | 30 | .. footbibliography:: 31 | 32 | .. _oblique_forest_feature_importance: 33 | 34 | Feature importance evaluation 35 | ----------------------------- 36 | 37 | The relative rank (i.e. depth) of a feature used as a decision node in a 38 | tree can be used to assess the relative importance of that feature with 39 | respect to the predictability of the target variable. Features used at 40 | the top of the tree contribute to the final prediction decision of a 41 | larger fraction of the input samples. The **expected fraction of the 42 | samples** they contribute to can thus be used as an estimate of the 43 | **relative importance of the features**. In treeple, the fraction of 44 | samples a feature contributes to is combined with the decrease in impurity 45 | from splitting them to create a normalized estimate of the predictive power 46 | of that feature. This is essentially the same as how it is done in scikit-learn.
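As a concrete illustration, here is a minimal sketch of reading these
estimates off a fitted oblique forest (the synthetic dataset and the
hyperparameter values below are arbitrary placeholders, not a recommended
configuration)::

    from sklearn.datasets import make_classification

    from treeple import ObliqueRandomForestClassifier

    X, y = make_classification(n_samples=200, n_features=10, random_state=0)
    clf = ObliqueRandomForestClassifier(n_estimators=50, random_state=0)
    clf.fit(X, y)
    # ``feature_importances_`` has shape (n_features,) and sums to 1.0
    print(clf.feature_importances_)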
47 | 48 | By **averaging** the estimates of predictive ability over several randomized 49 | trees, one can **reduce the variance** of such an estimate and use it 50 | for feature selection. This is known as the mean decrease in impurity, or MDI. 51 | Refer to [L2014]_ for more information on MDI and feature importance 52 | evaluation with Random Forests. We implement the approach taken in :footcite:`Li2023manifold` 53 | and :footcite:`TomitaSPORF2020`. 54 | 55 | .. warning:: 56 | 57 | The impurity-based feature importances computed on tree-based models suffer 58 | from two flaws that can lead to misleading conclusions. First, they are 59 | computed on statistics derived from the training dataset and therefore **do 60 | not necessarily inform us on which features are most important to make good 61 | predictions on a held-out dataset**. Secondly, **they favor high cardinality 62 | features**, that is, features with many unique values. 63 | :ref:`sklearn:permutation_importance` is an alternative to impurity-based feature 64 | importance that does not suffer from these flaws. These two methods of 65 | obtaining feature importance are explored in: 66 | :ref:`sklearn:sphx_glr_auto_examples_inspection_plot_permutation_importance.py`. 67 | 68 | In practice those estimates are stored as an attribute named 69 | ``feature_importances_`` on the fitted model. This is an array with shape 70 | ``(n_features,)`` whose values are positive and sum to 1.0. The higher 71 | the value, the more important is the contribution of the matching feature 72 | to the prediction function. 73 | 74 | .. topic:: References 75 | 76 | .. footbibliography:: 77 | 78 | .. [L2014] Louppe, G. :arxiv:`"Understanding Random Forests: From Theory to 79 | Practice" <1407.7502>`, 80 | PhD Thesis, U. of Liege, 2014. 81 | -------------------------------------------------------------------------------- /treeple/tree/manifold/_morf_splitter.pxd: -------------------------------------------------------------------------------- 1 | # distutils: language = c++ 2 | 3 | # Authors: Adam Li 4 | # Chester Huynh 5 | # Parth Vora 6 | # 7 | # License: BSD 3 clause 8 | 9 | # See _oblique_splitter.pyx for details. 10 | 11 | import numpy as np 12 | 13 | from libcpp.vector cimport vector 14 | 15 | from ..._lib.sklearn.tree._splitter cimport SplitRecord 16 | from ..._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int8_t, intp_t, uint8_t, uint32_t 17 | from .._oblique_splitter cimport BestObliqueSplitter, ObliqueSplitRecord 18 | 19 | # https://github.com/cython/cython/blob/master/Cython/Includes/libcpp/algorithm.pxd 20 | # shows how to include standard library functions in Cython 21 | # This includes the discrete_distribution C++ class, which can be used 22 | # to generate samples from a discrete distribution with non-uniform probabilities. 23 | # cdef extern from "<random>" namespace "std" nogil: 24 | # cdef cppclass discrete_distribution[T] 25 | # ctypedef T int_type 26 | # ctypedef G generator_type 27 | # discrete_distribution(T first, T last) except + 28 | # operator()(&G) except + 29 | 30 | cdef class PatchSplitter(BestObliqueSplitter): 31 | # The PatchSplitter creates candidate feature values by sampling 2D patches from 32 | # an input data vector. The input data is vectorized, so ``data_dims`` is 33 | # used to determine the vectorized indices corresponding to 34 | # (x, y) coordinates in the original un-vectorized data.
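    # Illustrative note (editorial addition, not part of the build): for
    # row-major (C-ordered) vectorized data with ``data_dims = (H, W)``, the
    # column of X corresponding to pixel (r, c) is ``r * W + c``, which is how
    # a sampled patch's (row, col) offsets map back to feature indices.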
35 | cdef public intp_t ndim # The number of dimensions of the input data 36 | 37 | cdef const intp_t[:] data_dims # The dimensions of the input data 38 | cdef const intp_t[:] min_patch_dims # The minimum size of the patch to sample in each dimension 39 | cdef const intp_t[:] max_patch_dims # The maximum size of the patch to sample in each dimension 40 | cdef const uint8_t[:] dim_contiguous # A boolean array indicating whether each dimension is contiguous 41 | 42 | # TODO: check if this works and is necessary for discontiguous data 43 | # cdef intp_t[:] stride_offsets # The stride offsets for each dimension 44 | cdef bint _discontiguous 45 | 46 | cdef bytes boundary # how to sample the patch with boundary in mind 47 | cdef const float32_t[:, :] feature_weight # Whether or not to normalize each column of X when adding in a patch 48 | 49 | cdef intp_t[::1] _index_data_buffer 50 | cdef intp_t[::1] _index_patch_buffer 51 | cdef intp_t[:] patch_sampled_size # A buffer to store the dimensions of the sampled patch 52 | cdef intp_t[:] unraveled_patch_point # A buffer to store the unraveled patch point 53 | 54 | # All oblique splitters (i.e. non-axis aligned splitters) require a 55 | # function to sample a projection matrix that is applied to the feature matrix 56 | # to quickly obtain the sampled projections for candidate splits. 57 | cdef (intp_t, intp_t) sample_top_left_seed( 58 | self 59 | ) noexcept nogil 60 | 61 | cdef void sample_proj_mat( 62 | self, 63 | vector[vector[float32_t]]& proj_mat_weights, 64 | vector[vector[intp_t]]& proj_mat_indices 65 | ) noexcept nogil 66 | 67 | 68 | # cdef class UserKernelSplitter(PatchSplitter): 69 | # """A class to hold user-specified kernels.""" 70 | # cdef vector[float32_t[:, ::1]] kernel_dictionary # A list of C-contiguous 2D kernels 71 | 72 | 73 | cdef class GaussianKernelSplitter(PatchSplitter): 74 | """A class to hold Gaussian kernels. 75 | 76 | Overrides the weights that are generated to be sampled from a Gaussian distribution. 
77 | See: https://www.tutorialspoint.com/gaussian-filter-generation-in-cplusplus 78 | See: https://gist.github.com/thomasaarholt/267ec4fff40ca9dff1106490ea3b7567 79 | """ 80 | 81 | cdef void sample_proj_mat( 82 | self, 83 | vector[vector[float32_t]]& proj_mat_weights, 84 | vector[vector[intp_t]]& proj_mat_indices 85 | ) noexcept nogil 86 | -------------------------------------------------------------------------------- /treeple/stats/tests/test_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from numpy.testing import assert_array_equal 4 | 5 | from treeple import HonestForestClassifier 6 | from treeple.stats import ( 7 | PermutationHonestForestClassifier, 8 | build_cv_forest, 9 | build_permutation_forest, 10 | ) 11 | 12 | seed = 12345 13 | rng = np.random.default_rng(seed) 14 | 15 | 16 | @pytest.mark.parametrize("bootstrap, max_samples", [(True, 1.6), (False, None)]) 17 | def test_build_cv_honest_forest(bootstrap, max_samples): 18 | n_estimators = 100 19 | est = HonestForestClassifier( 20 | n_estimators=n_estimators, 21 | random_state=0, 22 | bootstrap=bootstrap, 23 | max_samples=max_samples, 24 | honest_fraction=0.5, 25 | stratify=True, 26 | ) 27 | X = rng.normal(0, 1, (100, 2)) 28 | X[:50] *= -1 29 | y = np.array([0, 1] * 50) 30 | samples = np.arange(len(y)) 31 | 32 | est_list, proba_list, train_idx_list, test_idx_list = build_cv_forest( 33 | est, 34 | X, 35 | y, 36 | return_indices=True, 37 | seed=seed, 38 | cv=3, 39 | ) 40 | 41 | assert isinstance(est_list, list) 42 | assert isinstance(proba_list, list) 43 | 44 | for est, proba, train_idx, test_idx in zip(est_list, proba_list, train_idx_list, test_idx_list): 45 | assert len(train_idx) + len(test_idx) == len(samples) 46 | structure_samples = est.structure_indices_ 47 | leaf_samples = est.honest_indices_ 48 | 49 | if not bootstrap: 50 | oob_samples = [[] for _ in range(est.n_estimators)] 51 | else: 52 | oob_samples = est.oob_samples_ 53 | 54 | # compared to oob samples, now the train samples are composed of the entire dataset 55 | # seen over the entire forest.
The test dataset is completely disjoint 56 | for tree_idx in range(est.n_estimators): 57 | n_samples_in_tree = len(structure_samples[tree_idx]) + len(leaf_samples[tree_idx]) 58 | assert n_samples_in_tree + len(oob_samples[tree_idx]) == len(train_idx), ( 59 | f"For tree: " 60 | f"{tree_idx} {len(structure_samples[tree_idx])} + " 61 | f"{len(leaf_samples[tree_idx])} + {len(oob_samples[tree_idx])} " 62 | f"!= {len(train_idx)} {len(test_idx)}" 63 | ) 64 | 65 | 66 | def test_build_permutation_forest(): 67 | """Simple test for building a permutation forest.""" 68 | n_estimators = 30 69 | n_samples = 100 70 | n_features = 3 71 | rng = np.random.default_rng(seed) 72 | 73 | # two clusters of uniform samples, offset by 10 74 | _X = rng.uniform(size=(n_samples // 2, n_features)) 75 | X2 = _X + 10 76 | X = np.vstack([_X, X2]) 77 | y = np.vstack( 78 | [np.zeros((n_samples // 2, 1)), np.ones((n_samples // 2, 1))] 79 | ) # Binary classification 80 | 81 | clf = HonestForestClassifier( 82 | n_estimators=n_estimators, random_state=seed, n_jobs=-1, honest_fraction=0.5, bootstrap=True 83 | ) 84 | perm_clf = PermutationHonestForestClassifier( 85 | n_estimators=n_estimators, random_state=seed, n_jobs=-1, honest_fraction=0.5, bootstrap=True 86 | ) 87 | with pytest.raises( 88 | RuntimeError, match="Permutation forest must be a PermutationHonestForestClassifier" 89 | ): 90 | build_permutation_forest(clf, clf, X, y, seed=seed) 91 | 92 | forest_result, orig_forest_proba, perm_forest_proba = build_permutation_forest( 93 | clf, perm_clf, X, y, metric="s@98", n_repeats=20, seed=seed 94 | ) 95 | assert forest_result.observe_test_stat > 0.1, f"{forest_result.observe_stat}" 96 | assert forest_result.pvalue <= 0.05, f"{forest_result.pvalue}" 97 | assert_array_equal(orig_forest_proba.shape, perm_forest_proba.shape) 98 | 99 | X = np.vstack([_X, _X]) 100 | forest_result, _, _ = build_permutation_forest( 101 | clf, perm_clf, X, y, metric="s@98", n_repeats=10, seed=seed 102 | ) 103 | assert forest_result.pvalue > 0.05, f"{forest_result.pvalue}" 104 | assert forest_result.observe_test_stat < 0.05, f"{forest_result.observe_test_stat}" 105 | -------------------------------------------------------------------------------- /examples/quantile_predictions/plot_quantile_interpolation_with_RF.py: -------------------------------------------------------------------------------- 1 | """ 2 | ======================================================== 3 | Predicting with different quantile interpolation methods 4 | ======================================================== 5 | 6 | An example comparison of interpolation methods that can be applied during 7 | prediction when the desired quantile lies between two data points. 8 | 9 | This example was heavily inspired by the `quantile-forest <https://github.com/zillow/quantile-forest>`_ 10 | package. See their package `here <https://zillow.github.io/quantile-forest/>`_. 11 | """ 12 | 13 | from collections import defaultdict 14 | 15 | import matplotlib.pyplot as plt 16 | import numpy as np 17 | from sklearn.ensemble import RandomForestRegressor 18 | 19 | # %% 20 | # Generate the data 21 | # ----------------- 22 | # We use five simple data points to illustrate the difference between the intervals that are 23 | # generated using different interpolation methods.
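# (Editorial aside, not part of the original example: a quick self-contained
# look at how the five ``numpy.quantile`` options can disagree whenever the
# requested quantile falls between two data points.)
for method in ["linear", "lower", "higher", "midpoint", "nearest"]:
    # the 0.4 quantile of [1, 2, 3, 4] is 2.2 (linear), 2 (lower), 3 (higher),
    # 2.5 (midpoint), or 2 (nearest)
    print(method, np.quantile([1, 2, 3, 4], 0.4, method=method))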
24 | 25 | X = np.array([[-1, -1], [-1, -1], [-1, -1], [1, 1], [1, 1]]) 26 | y = np.array([-2, -1, 0, 1, 2]) 27 | 28 | # %% 29 | # The interpolation methods 30 | # ------------------------- 31 | # The interpolation methods demonstrated here are ``linear``, ``lower``, 32 | # ``higher``, ``midpoint``, and ``nearest``. They determine how the quantile is computed 33 | # when it lies between two data points ``i`` and ``j`` (``i <= j``). For more details, see :func:`numpy.quantile`. 34 | # The difference between the methods can be illustrated with the following example: 35 | 36 | interpolations = ["linear", "lower", "higher", "midpoint", "nearest"] 37 | colors = ["#006aff", "#ffd237", "#0d4599", "#f2a619", "#a6e5ff"] 38 | quantiles = [0.025, 0.5, 0.975] 39 | 40 | y_medians = [] 41 | y_errs = [] 42 | est = RandomForestRegressor( 43 | n_estimators=1, 44 | random_state=0, 45 | ) 46 | # fit the model 47 | est.fit(X, y) 48 | # get the leaf nodes that each sample fell into 49 | leaf_ids = est.apply(X) 50 | # create a list of dictionaries that map each leaf node to the samples that fell into it, 51 | # one for each tree 52 | node_to_indices = [] 53 | for tree in range(leaf_ids.shape[1]): 54 | d = defaultdict(list) 55 | for sample_id, leaf in enumerate(leaf_ids[:, tree]): 56 | d[leaf].append(sample_id) 57 | node_to_indices.append(d) 58 | # pass the data through the trained tree and 59 | # get the ids of the leaf nodes it falls into 60 | leaf_ids_test = est.apply(X) 61 | # for each sample, collect the indices of the samples that fell into 62 | # the same leaf node for each tree 63 | y_pred_quantile = [] 64 | for sample in range(leaf_ids_test.shape[0]): 65 | li = [ 66 | node_to_indices[tree][leaf_ids_test[sample][tree]] for tree in range(leaf_ids_test.shape[1]) 67 | ] 68 | # merge the list of indices into one 69 | idx = [item for sublist in li for item in sublist] 70 | # get the y value for each corresponding id 71 | y_pred_quantile.append(y[idx]) 72 | 73 | for interpolation in interpolations: 74 | # get the quantile predictions for each predicted sample 75 | y_pred = [ 76 | np.array( 77 | [ 78 | np.quantile(y_pred_quantile[i], quantile, method=interpolation) 79 | for i in range(len(y_pred_quantile)) 80 | ] 81 | ) 82 | for quantile in quantiles 83 | ] 84 | y_medians.append(y_pred[1]) 85 | y_errs.append( 86 | np.concatenate( 87 | ( 88 | [y_pred[1] - y_pred[0]], 89 | [y_pred[2] - y_pred[1]], 90 | ), 91 | axis=0, 92 | ) 93 | ) 94 | 95 | sc = plt.scatter(np.arange(len(y)) - 0.35, y, color="k", zorder=10) 96 | ebs = [] 97 | for i, (median, y_err) in enumerate(zip(y_medians, y_errs)): 98 | ebs.append( 99 | plt.errorbar( 100 | np.arange(len(y)) + (0.15 * (i + 1)) - 0.35, 101 | median, 102 | yerr=y_err, 103 | color=colors[i], 104 | ecolor=colors[i], 105 | fmt="o", 106 | ) 107 | ) 108 | plt.xlim([-0.75, len(y) - 0.25]) 109 | plt.xticks(np.arange(len(y)), X.tolist()) 110 | plt.xlabel("Samples (Feature Values)") 111 | plt.ylabel("Actual and Predicted Values") 112 | plt.legend([sc] + ebs, ["actual"] + interpolations, loc=2) 113 | plt.show() 114 | -------------------------------------------------------------------------------- /treeple/tests/test_extensions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from numpy.testing import assert_array_equal 4 | from sklearn.datasets import make_classification 5 | 6 | from treeple import ( 7 | ExtraObliqueRandomForestClassifier, 8 | ExtraObliqueRandomForestRegressor, 9 | HonestForestClassifier, 10 | ObliqueRandomForestClassifier, 11 | ObliqueRandomForestRegressor, 12 |
PatchObliqueRandomForestClassifier, 13 | PatchObliqueRandomForestRegressor, 14 | ) 15 | 16 | 17 | @pytest.mark.parametrize("n_classes", [2, 3]) 18 | @pytest.mark.parametrize( 19 | "Forest", 20 | [ 21 | HonestForestClassifier, 22 | ExtraObliqueRandomForestClassifier, 23 | ObliqueRandomForestClassifier, 24 | PatchObliqueRandomForestClassifier, 25 | ], 26 | ) 27 | def test_predict_proba_per_tree(Forest, n_classes): 28 | # Build a classification dataset and check that ``predict_proba_per_tree`` 29 | # returns per-tree posteriors of the expected shape for each forest variant 30 | X, y = make_classification( 31 | n_samples=100, n_features=50, n_informative=20, n_classes=n_classes, random_state=0 32 | ) 33 | 34 | # Call the method being tested 35 | if Forest == HonestForestClassifier: 36 | est = Forest(n_estimators=10, bootstrap=True, random_state=0, honest_prior="empirical") 37 | else: 38 | est = Forest(n_estimators=10, bootstrap=True, random_state=0) 39 | est.fit(X, y) 40 | proba_per_tree = est.predict_proba_per_tree(X) 41 | 42 | # Perform assertions to check the correctness of the output 43 | assert proba_per_tree.shape[0] == est.n_estimators 44 | assert proba_per_tree.shape[1] == X.shape[0] 45 | assert proba_per_tree.shape[2] == est.n_classes_ 46 | assert not np.isnan(proba_per_tree).any() 47 | 48 | proba_per_tree = est.predict_proba_per_tree(X, est.oob_samples_) 49 | # Perform assertions to check the correctness of the output 50 | assert proba_per_tree.shape[0] == est.n_estimators 51 | assert proba_per_tree.shape[1] == X.shape[0] 52 | assert proba_per_tree.shape[2] == est.n_classes_ 53 | assert np.isnan(proba_per_tree).any() 54 | 55 | 56 | @pytest.mark.parametrize( 57 | "Forest", 58 | [ 59 | HonestForestClassifier, 60 | ExtraObliqueRandomForestClassifier, 61 | ObliqueRandomForestClassifier, 62 | PatchObliqueRandomForestClassifier, 63 | ObliqueRandomForestRegressor, 64 | PatchObliqueRandomForestRegressor, 65 | ExtraObliqueRandomForestRegressor, 66 | ], 67 | ) 68 | @pytest.mark.parametrize("bootstrap", [True, False]) 69 | @pytest.mark.parametrize("random_state", [None, 0]) 70 | def test_forest_has_deterministic_sampling_for_oob_structure_and_leaves( 71 | Forest, bootstrap, random_state 72 | ): 73 | """Test that forest models can produce the oob and inbag samples deterministically. 74 | 75 | When bootstrap is True, the OOB samples should be disjoint from the in-bag samples. 76 | When bootstrap is False, there are no OOB samples.
77 | """ 78 | rng = np.random.default_rng(0) 79 | 80 | n_estimators = 5 81 | est = Forest( 82 | n_estimators=n_estimators, 83 | random_state=random_state, 84 | bootstrap=bootstrap, 85 | ) 86 | X = rng.normal(0, 1, (100, 2)) 87 | X[:50] *= -1 88 | y = [0, 1] * 50 89 | samples = np.arange(len(y)) 90 | 91 | est.fit(X, y) 92 | 93 | inbag_samples = est.estimators_samples_ 94 | oob_samples = [ 95 | [idx for idx in samples if idx not in inbag_samples[jdx]] for jdx in range(n_estimators) 96 | ] 97 | if not bootstrap: 98 | assert all(oob_list_ == [] for oob_list_ in oob_samples) 99 | 100 | with pytest.raises(RuntimeError, match="Cannot extract out-of-bag samples"): 101 | est.oob_samples_ 102 | else: 103 | oob_samples_ = est.oob_samples_ 104 | for itree in range(n_estimators): 105 | assert len(oob_samples[itree]) > 1, oob_samples[itree] 106 | assert set(inbag_samples[itree]).intersection(set(oob_samples_[itree])) == set() 107 | assert set(inbag_samples[itree]).union(set(oob_samples_[itree])) == set(samples) 108 | assert_array_equal(oob_samples_[itree], oob_samples[itree]) 109 | -------------------------------------------------------------------------------- /treeple/stats/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | 4 | import numpy as np 5 | import pytest 6 | import scipy.sparse as sp 7 | from numpy.testing import assert_array_equal 8 | 9 | import treeple.stats.utils as utils 10 | from treeple import HonestForestClassifier 11 | from treeple.stats.utils import get_per_tree_oob_samples 12 | 13 | seed = 1234 14 | rng = np.random.default_rng(seed) 15 | 16 | 17 | @pytest.mark.parametrize("bootstrap", [True, False]) 18 | def test_get_per_tree_oob_samples(bootstrap): 19 | n_estimators = 5 20 | est = HonestForestClassifier(n_estimators=n_estimators, random_state=0, bootstrap=bootstrap) 21 | 22 | X = rng.normal(0, 1, (100, 2)) 23 | X[:50] *= -1 24 | y = [0, 1] * 50 25 | samples = np.arange(len(y)) 26 | est.fit(X, y) 27 | 28 | if bootstrap: 29 | inbag_samples = est.estimators_samples_ 30 | oob_samples = [ 31 | [idx for idx in samples if idx not in inbag_samples[jdx]] for jdx in range(n_estimators) 32 | ] 33 | oob_samples_ = get_per_tree_oob_samples(est) 34 | for itree in range(n_estimators): 35 | assert len(oob_samples[itree]) > 1 36 | assert_array_equal(oob_samples_[itree], oob_samples[itree]) 37 | else: 38 | with pytest.raises(RuntimeError, match="Cannot extract out-of-bag samples"): 39 | get_per_tree_oob_samples(est) 40 | 41 | 42 | @pytest.mark.parametrize("use_bottleneck", [True, False]) 43 | def test_non_nan_samples(use_bottleneck: bool): 44 | if use_bottleneck and utils.DISABLE_BN_ENV_VAR in os.environ: 45 | del os.environ[utils.DISABLE_BN_ENV_VAR] 46 | importlib.reload(utils) 47 | else: 48 | os.environ[utils.DISABLE_BN_ENV_VAR] = "1" 49 | importlib.reload(utils) 50 | 51 | posterior_array = np.array( 52 | [ 53 | # tree 1 54 | [ 55 | [0, 1], 56 | [np.nan, np.nan], 57 | [np.nan, np.nan], 58 | ], 59 | # tree 2 60 | [ 61 | [0, 1], 62 | [np.nan, np.nan], 63 | [1, 0], 64 | ], 65 | ] 66 | ) # [2, 3, 2] 67 | 68 | expected = np.array([0, 2]) 69 | actual = utils._non_nan_samples(posterior_array) 70 | np.testing.assert_array_equal(expected, actual) 71 | 72 | 73 | @pytest.mark.parametrize("use_bottleneck", [True, False]) 74 | def test_nanmean_f(use_bottleneck: bool): 75 | if use_bottleneck and utils.DISABLE_BN_ENV_VAR in os.environ: 76 | del os.environ[utils.DISABLE_BN_ENV_VAR] 77 | importlib.reload(utils) 78 | else: 79 
| os.environ[utils.DISABLE_BN_ENV_VAR] = "1" 80 | importlib.reload(utils) 81 | 82 | posterior_array = np.array( 83 | [ 84 | [1, 2, np.nan], 85 | [3, 4, np.nan], 86 | ] 87 | ) 88 | 89 | expected = np.array([1.5, 3.5]) 90 | actual = utils.nanmean_f(posterior_array, axis=1) 91 | np.testing.assert_array_equal(expected, actual) 92 | 93 | 94 | @pytest.mark.parametrize( 95 | ("forest_indices", "expected"), 96 | [ 97 | (np.arange(3), np.array([0.375, 0.75, 0.25])), 98 | (np.arange(3) + 2, np.array([0.10, 0.05, 0.25])), 99 | (np.arange(3) + 3, np.array([0.10, 0.45, np.nan])), 100 | ], 101 | ) 102 | def test_get_forest_preds_sparse( 103 | forest_indices, 104 | expected, 105 | ): 106 | 107 | all_y_pred = sp.csc_matrix( 108 | np.array( 109 | [ 110 | [0.50, 0.00, 0.00], 111 | [0.25, 0.75, 0.00], 112 | [0.00, 0.00, 0.25], 113 | [0.10, 0.00, 0.00], 114 | [0.00, 0.05, 0.00], 115 | [0.00, 0.85, 0.00], 116 | ] 117 | ) 118 | ) 119 | 120 | all_y_indicator = sp.csc_matrix( 121 | np.array( 122 | [ 123 | [1, 0, 0], 124 | [1, 1, 0], 125 | [0, 0, 1], 126 | [1, 0, 0], 127 | [0, 1, 0], 128 | [0, 1, 0], 129 | ] 130 | ) 131 | ) 132 | 133 | np.testing.assert_array_equal( 134 | utils._get_forest_preds_sparse(all_y_pred, all_y_indicator, forest_indices), 135 | expected, 136 | ) 137 | -------------------------------------------------------------------------------- /treeple/tests/test_unsupervised_forest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from sklearn import datasets 4 | from sklearn.cluster import AgglomerativeClustering 5 | from sklearn.datasets import make_blobs 6 | from sklearn.metrics import adjusted_rand_score 7 | from sklearn.utils.estimator_checks import parametrize_with_checks 8 | 9 | from treeple.ensemble import UnsupervisedObliqueRandomForest, UnsupervisedRandomForest 10 | 11 | CLUSTER_CRITERIONS = ("twomeans", "fastbic") 12 | 13 | FOREST_CLUSTERS = { 14 | "UnsupervisedRandomForest": UnsupervisedRandomForest, 15 | "UnsupervisedObliqueRandomForest": UnsupervisedObliqueRandomForest, 16 | } 17 | 18 | # load iris dataset 19 | iris = datasets.load_iris() 20 | rng = np.random.RandomState(1) 21 | perm = rng.permutation(iris.target.size) 22 | iris.data = iris.data[perm] 23 | iris.target = iris.target[perm] 24 | 25 | 26 | @parametrize_with_checks( 27 | [ 28 | UnsupervisedRandomForest(random_state=12345, n_estimators=50), 29 | UnsupervisedObliqueRandomForest(random_state=12345, n_estimators=50), 30 | ] 31 | ) 32 | def test_sklearn_compatible_estimator(estimator, check): 33 | if check.func.__name__ in [ 34 | # Cannot apply agglomerative clustering on < 2 samples 35 | "check_methods_subset_invariance", 36 | # sample weights do not necessarily imply a sample is not used in clustering 37 | "check_sample_weight_equivalence", 38 | "check_sample_weight_equivalence_on_dense_data", 39 | "check_sample_weight_equivalence_on_sparse_data", 40 | # sample order is not preserved in predict 41 | "check_methods_sample_order_invariance", 42 | ]: 43 | pytest.skip() 44 | check(estimator) 45 | 46 | 47 | @pytest.mark.parametrize("name, forest", FOREST_CLUSTERS.items()) 48 | @pytest.mark.parametrize("criterion", CLUSTER_CRITERIONS) 49 | def test_check_simulation(name, forest, criterion): 50 | n_samples = 200 51 | n_classes = 2 52 | 53 | # set the expected clustering score for each forest type and criterion 54 | if name == "UnsupervisedRandomForest": 55 | n_features = 5 56 | if criterion == "twomeans": 57 | expected_score = 0.05 58 | elif criterion == "fastbic": 59 | expected_score = 0.35 60 | else: 61 | n_features = 20
 62 | 63 | # in the forest setting, we can overfit the training dataset perfectly 64 | expected_score = 1.0 65 | X, y = make_blobs( 66 | n_samples=n_samples, centers=n_classes, n_features=n_features, random_state=12345 67 | ) 68 | 69 | clf = forest(criterion=criterion, random_state=12345) 70 | clf.fit(X) 71 | sim_mat = clf.compute_similarity_matrix(X) 72 | 73 | # all ones along the diagonal 74 | assert np.array_equal(sim_mat.diagonal(), np.ones(n_samples)) 75 | 76 | cluster = AgglomerativeClustering(n_clusters=n_classes).fit(sim_mat) 77 | predict_labels = cluster.fit_predict(sim_mat) 78 | score = adjusted_rand_score(y, predict_labels) 79 | 80 | # XXX: This should be > 0.9 according to the UReRF. However, that could be because they used 81 | # the oblique projections by default 82 | assert ( 83 | score >= expected_score 84 | ), f"{name}-blobs failed with criterion {criterion} and score = {score}" 85 | 86 | 87 | @pytest.mark.parametrize("name, forest", FOREST_CLUSTERS.items()) 88 | @pytest.mark.parametrize("criterion", CLUSTER_CRITERIONS) 89 | def test_check_iris(name, forest, criterion): 90 | # Check consistency on dataset iris. 91 | n_classes = 3 92 | est = forest(criterion=criterion, random_state=12345) 93 | est.fit(iris.data, iris.target) 94 | sim_mat = est.compute_similarity_matrix(iris.data) 95 | 96 | if criterion == "twomeans": 97 | if "oblique" in name.lower(): 98 | expected_score = 0.21 99 | else: 100 | expected_score = 0.2 101 | elif criterion == "fastbic": 102 | if "oblique" in name.lower(): 103 | expected_score = 0.55 104 | else: 105 | expected_score = 0.3 106 | 107 | cluster = AgglomerativeClustering(n_clusters=n_classes).fit(sim_mat) 108 | predict_labels = cluster.fit_predict(sim_mat) 109 | score = adjusted_rand_score(iris.target, predict_labels) 110 | 111 | # Two-means and fastBIC criteria perform similarly here 112 | assert ( 113 | score > expected_score 114 | ), f"{name}-iris failed with criterion {criterion} and score = {score}" 115 | -------------------------------------------------------------------------------- /treeple/meson.build: -------------------------------------------------------------------------------- 1 | # Platform detection 2 | is_windows = host_machine.system() == 'windows' 3 | is_mingw = is_windows and cc.get_id() == 'gcc' 4 | 5 | c_args = [] 6 | cython_c_args = [] 7 | if is_windows 8 | # For mingw-w64, link statically against the UCRT. 9 | gcc_link_args = ['-lucrt', '-static'] 10 | if is_mingw 11 | add_project_link_arguments(gcc_link_args, language: ['c', 'cpp']) 12 | # Force gcc to float64 long doubles for compatibility with MSVC 13 | # builds, for C only. 14 | add_project_arguments('-mlong-double-64', language: 'c') 15 | # Make fprintf("%zd") work (see https://github.com/rgommers/scipy/issues/118) 16 | add_project_arguments('-D__USE_MINGW_ANSI_STDIO=1', language: ['c', 'cpp']) 17 | # Manually add the MS_WIN64 macro when not using MSVC. 18 | # https://bugs.python.org/issue28267 19 | bitness = run_command( 20 | '_build_utils/gcc_build_bitness.py', 21 | check: true 22 | ).stdout().strip() 23 | if bitness == '64' 24 | add_project_arguments('-DMS_WIN64', language: ['c', 'cpp']) 25 | endif 26 | # Silence warnings emitted by PyOS_snprintf for (%zd), see 27 | # https://github.com/rgommers/scipy/issues/118.
28 | # Use as c_args for extensions containing Cython code 29 | c_args += ['-Wno-format-extra-args', '-Wno-format'] 30 | endif 31 | endif 32 | 33 | openmp_dep = dependency('OpenMP', language: 'c', required: false) 34 | 35 | if not openmp_dep.found() 36 | warning( 37 | ''' 38 | *********** 39 | * WARNING * 40 | *********** 41 | 42 | It seems that treeple cannot be built with OpenMP. 43 | 44 | - Make sure you have followed the installation instructions: 45 | 46 | https://scikit-learn.org/dev/developers/advanced_installation.html 47 | 48 | - If your compiler supports OpenMP but you still see this 49 | message, please submit a bug report at: 50 | 51 | https://github.com/treeple/treeple/issues 52 | 53 | - The build will continue with OpenMP-based parallelism 54 | disabled. Note however that some estimators will run in 55 | sequential mode instead of leveraging thread-based 56 | parallelism. 57 | 58 | *** 59 | ''') 60 | endif 61 | 62 | # NumPy include directory - needed in all submodules 63 | incdir_numpy = meson.get_external_property('numpy-include-dir', 'not-given') 64 | if incdir_numpy == 'not-given' 65 | incdir_numpy = run_command(py, 66 | [ 67 | '-c', 68 | ''' 69 | import os 70 | import numpy as np 71 | try: 72 | incdir = os.path.relpath(np.get_include()) 73 | except Exception: 74 | incdir = np.get_include() 75 | print(incdir) 76 | ''' 77 | ], 78 | check: true 79 | ).stdout().strip() 80 | endif 81 | 82 | inc_np = include_directories(incdir_numpy) 83 | # Don't use the deprecated NumPy C API. Define this to a fixed version instead of 84 | # NPY_API_VERSION in order not to break compilation for released SciPy versions 85 | # when NumPy introduces a new deprecation. 86 | numpy_no_deprecated_api = ['-DNPY_NO_DEPRECATED_API=NPY_1_9_API_VERSION'] 87 | np_dep = declare_dependency(include_directories: inc_np, compile_args: numpy_no_deprecated_api) 88 | 89 | cc = meson.get_compiler('c') 90 | 91 | # Don't use the deprecated NumPy C API. Define this to a fixed version instead of 92 | # NPY_API_VERSION in order not to break compilation for released versions 93 | # when NumPy introduces a new deprecation. Use in a meson.build file:: 94 | # 95 | # py.extension_module('_name', 96 | # 'source_fname', 97 | # numpy_nodepr_api) 98 | 99 | # TODO XXX: ENABLE WHEN DEBUGGING 100 | boundscheck = 'False' 101 | 102 | scikit_learn_cython_args = [ 103 | '-X language_level=3', '-X boundscheck=' + boundscheck, '-X wraparound=False', 104 | '-X initializedcheck=False', '-X nonecheck=False', '-X cdivision=True', 105 | '-X profile=False', 106 | '-X embedsignature=True', 107 | # Needed for cython imports across subpackages, e.g. 
cluster pyx that 108 | # cimports metrics pxd 109 | '--include-dir', meson.global_build_root(), 110 | ] 111 | cython_c_args += scikit_learn_cython_args 112 | 113 | python_sources = [ 114 | '__init__.py', 115 | 'neighbors.py', 116 | 'conftest.py', 117 | ] 118 | 119 | py.install_sources( 120 | python_sources, 121 | subdir: 'treeple' 122 | ) 123 | 124 | subdir('_lib') 125 | subdir('ensemble') 126 | subdir('experimental') 127 | subdir('stats') 128 | subdir('tests') 129 | subdir('tree') 130 | subdir('datasets') 131 | -------------------------------------------------------------------------------- /treeple/experimental/tests/test_sdf.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from sklearn import datasets 4 | from sklearn.metrics import accuracy_score, r2_score 5 | from sklearn.utils.estimator_checks import parametrize_with_checks 6 | 7 | from treeple.experimental import StreamDecisionForest 8 | 9 | CLF_CRITERIONS = ("gini", "entropy") 10 | 11 | # also load the iris dataset 12 | # and randomly permute it 13 | iris = datasets.load_iris() 14 | rng = np.random.RandomState(1) 15 | perm = rng.permutation(iris.target.size) 16 | iris.data = iris.data[perm] 17 | iris.target = iris.target[perm] 18 | 19 | 20 | def test_toy_accuracy(): 21 | clf = StreamDecisionForest(n_estimators=10) 22 | X = np.ones((20, 4)) 23 | X[10:] *= -1 24 | y = [0] * 10 + [1] * 10 25 | clf = clf.fit(X, y) 26 | np.testing.assert_array_equal(clf.predict(X), y) 27 | 28 | 29 | def test_first_fit(): 30 | clf = StreamDecisionForest(n_estimators=10) 31 | with pytest.raises( 32 | ValueError, match="classes must be passed on the first call to partial_fit." 33 | ): 34 | clf.partial_fit(iris.data, iris.target) 35 | 36 | 37 | @pytest.mark.parametrize("criterion", ["gini", "entropy"]) 38 | @pytest.mark.parametrize("max_features", [None, 2]) 39 | def test_iris(criterion, max_features): 40 | # Check consistency on dataset iris. 41 | clf = StreamDecisionForest( 42 | criterion=criterion, 43 | random_state=0, 44 | max_features=max_features, 45 | n_estimators=10, 46 | ) 47 | 48 | clf.partial_fit(iris.data, iris.target, classes=np.unique(iris.target)) 49 | score = accuracy_score(clf.predict(iris.data), iris.target) 50 | 51 | assert score > 0.5 and score <= 1.0, "Failed with {0}, criterion = {1} and score = {2}".format( 52 | "SDF", criterion, score 53 | ) 54 | 55 | score = accuracy_score(clf.predict(iris.data), clf.predict_proba(iris.data).argmax(1)) 56 | assert score == 1.0, "Failed with {0}, criterion = {1} and score = {2}".format( 57 | "SDF", criterion, score 58 | ) 59 | 60 | clf.partial_fit(iris.data, iris.target) 61 | score = accuracy_score(clf.predict(iris.data), iris.target) 62 | 63 | assert ( 64 | score > 0.5 and score <= 1.0 65 | ), "Failed partial_fit with {0}, criterion = {1} and score = {2}".format( 66 | "SDF", criterion, score 67 | ) 68 | 69 | score = accuracy_score(clf.predict(iris.data), clf.predict_proba(iris.data).argmax(1)) 70 | assert score == 1.0, "Failed partial_fit with {0}, criterion = {1} and score = {2}".format( 71 | "SDF", criterion, score 72 | ) 73 | 74 | 75 | @pytest.mark.parametrize("criterion", ["gini", "entropy"]) 76 | @pytest.mark.parametrize("max_features", [None, 2]) 77 | def test_iris_multi(criterion, max_features): 78 | # Check consistency on dataset iris. 
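    # (editorial note) build a second, disjoint set of labels so that ``y``
    # below becomes a two-column, multi-output target of shape (n_samples, 2)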
79 | clf = StreamDecisionForest( 80 | criterion=criterion, 81 | random_state=0, 82 | max_features=max_features, 83 | n_estimators=10, 84 | ) 85 | 86 | second_y = np.concatenate([(np.ones(50) * 3), (np.ones(50) * 4), (np.ones(50) * 5)]) 87 | 88 | X = iris.data 89 | y = np.stack((iris.target, second_y[perm])).T 90 | 91 | clf.fit(X, y) 92 | score = r2_score(clf.predict(X), y) 93 | assert score > 0.9 and score <= 1.0, "Failed with {0}, criterion = {1} and score = {2}".format( 94 | "SDF", criterion, score 95 | ) 96 | 97 | 98 | def test_max_samples(): 99 | max_samples_list = [8, 0.5, None] 100 | depths = [] 101 | X = rng.normal(0, 1, (100, 2)) 102 | X[:50] *= -1 103 | y = [0, 1] * 50 104 | for ms in max_samples_list: 105 | uf = StreamDecisionForest(n_estimators=2, random_state=0, max_samples=ms, bootstrap=True) 106 | uf = uf.fit(X, y) 107 | depths.append(uf.estimators_[0].get_depth()) 108 | 109 | assert all(np.diff(depths) > 0) 110 | 111 | 112 | @parametrize_with_checks([StreamDecisionForest(n_estimators=10, random_state=0)]) 113 | def test_sklearn_compatible_estimator(estimator, check): 114 | # 1. check_class_weight_classifiers is not supported since it requires sample weight 115 | # XXX: can include this "generalization" in the future if it's useful 116 | if check.func.__name__ in [ 117 | "check_class_weight_classifiers", 118 | "check_sample_weight_equivalence", 119 | "check_sample_weight_equivalence_on_dense_data", 120 | "check_sample_weight_equivalence_on_sparse_data", 121 | ]: 122 | pytest.skip() 123 | check(estimator) 124 | -------------------------------------------------------------------------------- /examples/sparse_oblique_trees/plot_oblique_random_forest.py: -------------------------------------------------------------------------------- 1 | """ 2 | =============================================================================== 3 | Plot oblique forest and axis-aligned random forest predictions on cc18 datasets 4 | =============================================================================== 5 | 6 | A performance comparison between oblique forest and standard axis- 7 | aligned random forest using three datasets from OpenML benchmarking suites. 8 | 9 | Two of these datasets, namely the 10 | `WDBC <https://www.openml.org/search?type=data&sort=runs&id=1510>`_ 11 | and `Phishing Website <https://www.openml.org/search?type=data&sort=runs&id=4534>`_ 12 | datasets, consist of 31 features, where the former dataset is entirely numeric 13 | and the latter dataset is entirely nominal. The third dataset, dubbed 14 | `cnae-9 <https://www.openml.org/search?type=data&status=active&id=1468>`_, is a 15 | numeric dataset that has a notably large feature space of 857 features. As you 16 | will notice, of these three datasets, the oblique forest outperforms the axis-aligned 17 | random forest on cnae-9 by utilizing its sparse random projection mechanism. All datasets 18 | are subsampled due to computational constraints. 19 | 20 | For an example of using extra-oblique trees/forests in practice on data, see the following 21 | example :ref:`sphx_glr_auto_examples_sparse_oblique_trees_plot_extra_oblique_random_forest.py`.
22 | """ 23 | 24 | from datetime import datetime 25 | 26 | import matplotlib.pyplot as plt 27 | import pandas as pd 28 | import seaborn as sns 29 | from sklearn.datasets import fetch_openml 30 | from sklearn.ensemble import RandomForestClassifier 31 | from sklearn.model_selection import RepeatedKFold, cross_validate 32 | 33 | from treeple import ObliqueRandomForestClassifier 34 | 35 | random_state = 123456 36 | t0 = datetime.now() 37 | data_ids = [4534, 1510, 1468] # openml dataset id 38 | df = pd.DataFrame() 39 | 40 | 41 | def load_cc18(data_id): 42 | df = fetch_openml(data_id=data_id, as_frame=True, parser="pandas") 43 | 44 | # extract the dataset name 45 | d_name = df.details["name"] 46 | 47 | # Subsampling large datasets 48 | if data_id == 1468: 49 | n = 100 50 | else: 51 | n = int(df.frame.shape[0] * 0.8) 52 | 53 | df = df.frame.sample(n, random_state=random_state) 54 | X, y = df.iloc[:, :-1], df.iloc[:, -1] 55 | 56 | return X, y, d_name 57 | 58 | 59 | def get_scores(X, y, d_name, n_cv=5, n_repeats=1, **kwargs): 60 | clfs = [RandomForestClassifier(**kwargs), ObliqueRandomForestClassifier(**kwargs)] 61 | 62 | tmp = [] 63 | 64 | for i, clf in enumerate(clfs): 65 | cv = RepeatedKFold(n_splits=n_cv, n_repeats=n_repeats, random_state=kwargs["random_state"]) 66 | test_score = cross_validate(estimator=clf, X=X, y=y, cv=cv, scoring="accuracy") 67 | 68 | tmp.append( 69 | [ 70 | d_name, 71 | ["RF", "OF"][i], 72 | test_score["test_score"], 73 | test_score["test_score"].mean(), 74 | ] 75 | ) 76 | 77 | df = pd.DataFrame( 78 | tmp, columns=["dataset", "model", "score", "mean"] 79 | ) # dtype=[('model',object), ('score',float), ('mean',float)]) 80 | df = df.explode("score") 81 | df["score"] = df["score"].astype(float) 82 | df.reset_index(inplace=True, drop=True) 83 | 84 | return df 85 | 86 | 87 | params = { 88 | "max_features": None, 89 | "n_estimators": 50, 90 | "max_depth": None, 91 | "random_state": random_state, 92 | "n_cv": 2, 93 | "n_repeats": 1, 94 | } 95 | 96 | for data_id in data_ids: 97 | X, y, d_name = load_cc18(data_id=data_id) 98 | print(f"Loading [{d_name}] dataset..") 99 | tmp = get_scores(X=X, y=y, d_name=d_name, **params) 100 | df = pd.concat([df, tmp]) 101 | 102 | print(f"It took {(datetime.now()-t0).seconds} seconds to run the script") 103 | 104 | # Draw a comparison plot 105 | d_names = df.dataset.unique() 106 | N = d_names.shape[0] 107 | 108 | fig, ax = plt.subplots(1, N) 109 | fig.set_size_inches(6 * N, 6) 110 | 111 | for i, name in enumerate(d_names): 112 | sns.stripplot( 113 | data=df.query(f'dataset == "{name}"'), 114 | x="model", 115 | y="score", 116 | ax=ax[i], 117 | dodge=True, 118 | ) 119 | sns.boxplot( 120 | data=df.query(f'dataset == "{name}"'), 121 | x="model", 122 | y="score", 123 | ax=ax[i], 124 | color="white", 125 | ) 126 | ax[i].set_title(name) 127 | if i != 0: 128 | ax[i].set_ylabel("") 129 | ax[i].set_xlabel("") 130 | -------------------------------------------------------------------------------- /examples/treeple/treeple_tutorial_1_1a_SA98.py: -------------------------------------------------------------------------------- 1 | """ 2 | ================ 3 | Calculating S@98 4 | ================ 5 | """ 6 | 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | import seaborn as sns 10 | from sklearn.metrics import roc_curve 11 | 12 | from treeple.datasets import make_trunk_classification 13 | from treeple.ensemble import HonestForestClassifier 14 | from treeple.stats import build_oob_forest 15 | 16 | sns.set(color_codes=True, style="white", 
context="talk", font_scale=1.5) 17 | PALETTE = sns.color_palette("Set1") 18 | sns.set_palette(PALETTE[1:5] + PALETTE[6:], n_colors=9) 19 | sns.set_style("white", {"axes.edgecolor": "#dddddd"}) 20 | 21 | # %% 22 | # S@98 23 | # ---- 24 | # 25 | # Sensitivity at 98% specificity (*S@98*) measures, namely, the true 26 | # positive rate (*TPR*) when the false positive rate (*FPR*) is at 98%. 27 | # 28 | # .. math:: S@r = \mathbb{P}[\eta(X) > T_r \mid Y=1] 29 | # 30 | # With a binary class simulation as an example, this tutorial will show 31 | # how to use ``treeple`` to calculate the statistic. 32 | 33 | # %% 34 | # Create a simulation with two gaussians 35 | # -------------------------------------- 36 | 37 | 38 | # create a binary class simulation with two gaussians 39 | # 500 samples for each class, class zero is standard 40 | # gaussian, and class one has a mean at one 41 | X, y = make_trunk_classification( 42 | n_samples=1000, 43 | n_dim=1, 44 | mu_0=0, 45 | mu_1=1, 46 | n_informative=1, 47 | seed=1, 48 | ) 49 | 50 | 51 | fig, ax = plt.subplots(figsize=(6, 6)) 52 | fig.tight_layout() 53 | ax.tick_params(labelsize=15) 54 | 55 | # histogram plot the samples 56 | ax.hist(X[:500], bins=50, alpha=0.6, color=PALETTE[1], label="negative") 57 | ax.hist(X[500:], bins=50, alpha=0.3, color=PALETTE[0], label="positive") 58 | ax.set_xlabel("Variable One", fontsize=15) 59 | ax.set_ylabel("Likelihood", fontsize=15) 60 | plt.legend(frameon=False, fontsize=15) 61 | plt.show() 62 | 63 | # %% 64 | # Fit the model 65 | # ------------- 66 | 67 | 68 | # initialize the forest with 100 trees 69 | est = HonestForestClassifier( 70 | n_estimators=100, 71 | max_samples=1.6, 72 | max_features=0.3, 73 | bootstrap=True, 74 | stratify=True, 75 | random_state=1, 76 | ) 77 | 78 | # fit the model and obtain the tree posteriors 79 | _, observe_proba = build_oob_forest(est, X, y) 80 | 81 | # generate forest posteriors for the two classes 82 | observe_proba = np.nanmean(observe_proba, axis=0) 83 | 84 | 85 | fig, ax = plt.subplots(figsize=(6, 6)) 86 | fig.tight_layout() 87 | ax.tick_params(labelsize=15) 88 | 89 | # histogram plot the posterior probabilities for class one 90 | ax.hist(observe_proba[:500][:, 1], bins=50, alpha=0.6, color=PALETTE[1], label="negative") 91 | ax.hist(observe_proba[500:][:, 1], bins=50, alpha=0.3, color=PALETTE[0], label="positive") 92 | ax.set_ylabel("# of Samples", fontsize=15) 93 | ax.set_xlabel("Class One Posterior", fontsize=15) 94 | plt.legend(frameon=False, fontsize=15) 95 | plt.show() 96 | 97 | # %% 98 | # Calculate the statistic 99 | # ----------------------- 100 | 101 | 102 | def Calculate_SA(y_true, y_pred_proba, max_fpr=0.02) -> float: 103 | """Calculate the sensitivity at a specific specificity""" 104 | # check the shape of true labels 105 | if y_true.squeeze().ndim != 1: 106 | raise ValueError(f"y_true must be 1d, not {y_true.shape}") 107 | 108 | # find the positive class and calculate fpr and tpr 109 | if 0 in y_true or -1 in y_true: 110 | fpr, tpr, thresholds = roc_curve( 111 | y_true, y_pred_proba[:, 1], pos_label=1, drop_intermediate=False 112 | ) 113 | else: 114 | fpr, tpr, thresholds = roc_curve( 115 | y_true, y_pred_proba[:, 1], pos_label=2, drop_intermediate=False 116 | ) 117 | sa98 = max([tpr for (fpr, tpr) in zip(fpr, tpr) if fpr <= max_fpr]) 118 | 119 | fig, ax = plt.subplots(figsize=(6, 6)) 120 | fig.tight_layout() 121 | ax.tick_params(labelsize=15) 122 | ax.set_xlim([-0.005, 1.005]) 123 | ax.set_ylim([-0.005, 1.005]) 124 | ax.set_xlabel("False Positive Rate", fontsize=15) 125 
| ax.set_ylabel("True Positive Rate", fontsize=15) 126 | 127 | ax.plot(fpr, tpr, label="ROC curve", color=PALETTE[1]) 128 | 129 | spec = int((1 - max_fpr) * 100) 130 | ax.axvline( 131 | x=max_fpr, 132 | color=PALETTE[0], 133 | ymin=0, 134 | ymax=sa98, 135 | label="S@" + str(spec) + " = " + str(round(sa98, 2)), 136 | linestyle="--", 137 | ) 138 | ax.axhline(y=sa98, xmin=0, xmax=max_fpr, color="r", linestyle="--") 139 | ax.legend(frameon=False, fontsize=15) 140 | 141 | return sa98 142 | 143 | 144 | sa98 = Calculate_SA(y, observe_proba, max_fpr=0.02) 145 | print("S@98 =", round(sa98, 2)) 146 | # sphinx_gallery_thumbnail_number = -1 147 | -------------------------------------------------------------------------------- /examples/calibration/plot_honest_tree.py: -------------------------------------------------------------------------------- 1 | """ 2 | =========================================== 3 | Comparison of Decision Tree and Honest Tree 4 | =========================================== 5 | 6 | This example compares the :class:`treeple.tree.HonestTreeClassifier` from the 7 | ``treeple`` library with the :class:`sklearn.tree.DecisionTreeClassifier` 8 | from scikit-learn on the Iris dataset. 9 | 10 | Both classifiers are fitted on the same dataset and their decision trees 11 | are plotted side by side. 12 | """ 13 | 14 | import matplotlib.pyplot as plt 15 | from sklearn import config_context 16 | from sklearn.datasets import load_iris 17 | from sklearn.model_selection import train_test_split 18 | from sklearn.tree import DecisionTreeClassifier, plot_tree 19 | 20 | from treeple.tree import HonestTreeClassifier 21 | 22 | # Load the iris dataset 23 | iris = load_iris() 24 | X, y = iris.data, iris.target 25 | X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=0) 26 | 27 | # Initialize classifiers 28 | max_features = 0.3 29 | 30 | dishonest_clf = HonestTreeClassifier( 31 | honest_method=None, 32 | max_features=max_features, 33 | random_state=0, 34 | honest_prior="ignore", 35 | ) 36 | honest_noprune_clf = HonestTreeClassifier( 37 | honest_method="apply", 38 | max_features=max_features, 39 | random_state=0, 40 | honest_prior="ignore", 41 | ) 42 | honest_clf = HonestTreeClassifier(honest_method="prune", max_features=max_features, random_state=0) 43 | sklearn_clf = DecisionTreeClassifier(max_features=max_features, random_state=0) 44 | 45 | # Fit classifiers 46 | dishonest_clf.fit(X_train, y_train) 47 | honest_noprune_clf.fit(X_train, y_train) 48 | honest_clf.fit(X_train, y_train) 49 | sklearn_clf.fit(X_train, y_train) 50 | 51 | # Plotting the trees 52 | fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(15, 5)) 53 | 54 | # .. note:: We skip parameter validation because internally the `plot_tree` 55 | # function checks if the estimator is a DecisionTreeClassifier 56 | # instance from scikit-learn, but the ``HonestTreeClassifier`` is 57 | # a subclass of a forked version of the DecisionTreeClassifier. 
 58 | 59 | # Plot the pruned HonestTreeClassifier 60 | ax = axes[2] 61 | with config_context(skip_parameter_validation=True): 62 | plot_tree(honest_clf, filled=True, ax=ax) 63 | ax.set_title("HonestTreeClassifier") 64 | 65 | # Plot the unpruned HonestTreeClassifier 66 | ax = axes[1] 67 | with config_context(skip_parameter_validation=True): 68 | plot_tree(honest_noprune_clf, filled=False, ax=ax) 69 | ax.set_title("HonestTreeClassifier (No pruning)") 70 | 71 | # Plot the dishonest HonestTreeClassifier 72 | ax = axes[0] 73 | with config_context(skip_parameter_validation=True): 74 | plot_tree(dishonest_clf, filled=False, ax=ax) 75 | ax.set_title("HonestTreeClassifier (Dishonest)") 76 | 77 | 78 | # Plot scikit-learn DecisionTreeClassifier tree 79 | plot_tree(sklearn_clf, filled=True, ax=axes[3]) 80 | axes[3].set_title("DecisionTreeClassifier") 81 | 82 | plt.show() 83 | 84 | # %% 85 | # Discussion 86 | # ---------- 87 | # The HonestTreeClassifier is a variant of the DecisionTreeClassifier that 88 | # provides honest inference. The honest inference is achieved by splitting the 89 | # dataset into two parts: the training set and the validation set. The training 90 | # set is used to build the tree, while the validation set is used to fit the 91 | # leaf nodes for posterior prediction. This results in calibrated posteriors 92 | # (see :ref:`sphx_glr_auto_examples_calibration_plot_overlapping_gaussians.py`). 93 | # 94 | # Compared to the ``honest_method='apply'`` setting, the ``honest_method='prune'`` 95 | # setting builds a tree that will not contain empty leaves, and also leverages 96 | # the validation set to check split conditions. Thus we see that the pruned 97 | # honest tree is significantly smaller than the regular decision tree. 98 | 99 | # %% 100 | # Evaluate predictions of the trees 101 | # --------------------------------- 102 | # When we do not prune, note that the honest tree will have empty leaves 103 | # that predict the prior. In this case, ``honest_prior='ignore'`` is used 104 | # to ignore these leaves when computing the posteriors, which will result 105 | # in a posterior that is ``np.nan``. 106 | 107 | # this is the same as a decision tree classifier that is trained on less data 108 | print("\nDishonest posteriors: ", dishonest_clf.predict_proba(X_val)) 109 | 110 | # this is the honest tree without pruning, whose empty leaves yield ``np.nan`` posteriors 111 | print("\nHonest tree without pruning: ", honest_noprune_clf.predict_proba(X_val)) 112 | 113 | # this is the honest tree that is pruned 114 | print("\nHonest tree with pruning: ", honest_clf.predict_proba(X_val)) 115 | 116 | # this is a regular decision tree classifier from sklearn 117 | print("\nDTC: ", sklearn_clf.predict_proba(X_val)) 118 | --------------------------------------------------------------------------------