├── treeple
│   ├── _lib
│   │   ├── __init__.py
│   │   └── meson.build
│   ├── tests
│   │   ├── __init__.py
│   │   ├── meson.build
│   │   ├── test_neighbors.py
│   │   ├── test_extensions.py
│   │   └── test_unsupervised_forest.py
│   ├── stats
│   │   ├── tests
│   │   │   ├── __init__.py
│   │   │   ├── meson.build
│   │   │   ├── test_permuteforest.py
│   │   │   ├── test_baseline.py
│   │   │   └── test_utils.py
│   │   ├── meson.build
│   │   └── __init__.py
│   ├── tree
│   │   ├── honesty
│   │   │   ├── __init__.py
│   │   │   ├── meson.build
│   │   │   └── _honest_prune.pxd
│   │   ├── manifold
│   │   │   ├── __init__.py
│   │   │   ├── meson.build
│   │   │   └── _morf_splitter.pxd
│   │   ├── tests
│   │   │   ├── __init__.py
│   │   │   ├── meson.build
│   │   │   └── test_honest_prune.py
│   │   ├── unsupervised
│   │   │   ├── __init__.py
│   │   │   ├── meson.build
│   │   │   ├── _unsup_oblique_tree.pxd
│   │   │   ├── _unsup_splitter.pxd
│   │   │   ├── _unsup_criterion.pxd
│   │   │   ├── _unsup_tree.pxd
│   │   │   └── _unsup_oblique_splitter.pxd
│   │   ├── _sklearn_splitter.pxd
│   │   ├── _marginal.pxd
│   │   ├── kernels.py
│   │   ├── _utils.pxd
│   │   ├── __init__.py
│   │   ├── meson.build
│   │   ├── _oblique_tree.pxd
│   │   └── _neighbors.py
│   ├── datasets
│   │   ├── tests
│   │   │   ├── __init__.py
│   │   │   └── meson.build
│   │   ├── meson.build
│   │   └── __init__.py
│   ├── experimental
│   │   ├── distributions.py
│   │   ├── tests
│   │   │   ├── __init__.py
│   │   │   ├── meson.build
│   │   │   ├── test_simulate.py
│   │   │   ├── test_mutual_info.py
│   │   │   └── test_sdf.py
│   │   ├── meson.build
│   │   └── __init__.py
│   ├── conftest.py
│   ├── ensemble
│   │   ├── meson.build
│   │   └── __init__.py
│   ├── _build_utils
│   │   └── gcc_build_bitness.py
│   ├── __init__.py
│   └── meson.build
├── .codespellignore
├── benchmarks
│   ├── __init__.py
│   ├── utils.py
│   ├── config.json
│   └── ensemble_supervised.py
├── doc
│   ├── sphinxext
│   │   ├── MANIFEST.in
│   │   ├── doi_role.py
│   │   ├── allow_nan_estimators.py
│   │   └── github_link.py
│   ├── _templates
│   │   ├── autosummary
│   │   │   ├── function.rst
│   │   │   └── class.rst
│   │   └── layout.html
│   ├── use.rst
│   ├── user_guide.rst
│   ├── whats_new
│   │   ├── changelog_legend.inc
│   │   ├── v0.10.rst
│   │   ├── v0.9.rst
│   │   ├── v0.5.rst
│   │   ├── _contributors.rst
│   │   ├── v0.3.rst
│   │   ├── v0.8.rst
│   │   ├── v0.4.rst
│   │   ├── v0.2.rst
│   │   ├── v0.1.rst
│   │   ├── v0.6.rst
│   │   └── v0.7.rst
│   ├── whats_new.rst
│   ├── make.bat
│   ├── _static
│   │   ├── style.css
│   │   └── versions.json
│   ├── install.rst
│   ├── index.rst
│   └── modules
│       ├── unsupervised_tree.rst
│       └── ensemble.rst
├── examples
│   ├── README.txt
│   ├── outlier_detection
│   │   └── README.txt
│   ├── splitters
│   │   └── README.txt
│   ├── multiview
│   │   └── README.txt
│   ├── calibration
│   │   ├── README.txt
│   │   └── plot_honest_tree.py
│   ├── quantile_predictions
│   │   ├── README.txt
│   │   ├── plot_quantile_vs_standard_oblique_forest.py
│   │   ├── plot_quantile_toy_example_with_RF.py
│   │   └── plot_quantile_interpolation_with_RF.py
│   ├── sklearn_vs_treeple
│   │   ├── README.txt
│   │   └── plot_iris_dtc.py
│   ├── sparse_oblique_trees
│   │   ├── README.txt
│   │   ├── plot_oblique_axis_aligned_forests_sparse_parity.py
│   │   └── plot_oblique_random_forest.py
│   └── treeple
│       ├── README.txt
│       ├── treeple_tutorial_1_1d_HD.py
│       ├── treeple_tutorial_1_1b_MI.py
│       └── treeple_tutorial_1_1a_SA98.py
├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── blank.md
│   │   ├── feature_request.md
│   │   └── bug_report.md
│   ├── dependabot.yml
│   ├── label-globs.yml
│   ├── workflows
│   │   ├── pull_request_labeler.yml
│   │   ├── cffconvert.yml
│   │   ├── circle_artifacts.yml
│   │   ├── style.yml
│   │   ├── release.yml
│   │   └── pr_checks.yml
│   ├── FUNDING.yml
│   └── PULL_REQUEST_TEMPLATE.md
├── test_requirements.txt
├── benchmarks_nonasv
│   ├── README.md
│   └── bench_plot_urf.py
├── .gitmodules
├── style_requirements.txt
├── .yamllint.yml
├── spin
├── .flake8
├── .gitignore
├── CITATION.cff
├── .pre-commit-config.yaml
├── meson.build
└── Makefile

--------------------------------------------------------------------------------
/treeple/_lib/__init__.py:
--------------------------------------------------------------------------------

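# NOTE (descriptive comment, based on .gitmodules and the imports under
# ``treeple._lib.sklearn``): this package hosts the vendored scikit-learn fork,
# which is pulled in as a git submodule and presumably built by the adjacent
# meson.build.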
--------------------------------------------------------------------------------
/treeple/tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/treeple/stats/tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/treeple/tree/honesty/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/treeple/tree/manifold/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/treeple/tree/tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/treeple/datasets/tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/treeple/experimental/distributions.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/treeple/experimental/tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/treeple/tree/unsupervised/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.codespellignore:
--------------------------------------------------------------------------------
raison
nd
parth
ot
fpr
master

--------------------------------------------------------------------------------
/benchmarks/__init__.py:
--------------------------------------------------------------------------------
"""Benchmark suite for treeple using ASV"""

--------------------------------------------------------------------------------
/doc/sphinxext/MANIFEST.in:
--------------------------------------------------------------------------------
recursive-include tests *.py
include *.txt

--------------------------------------------------------------------------------
/examples/README.txt:
--------------------------------------------------------------------------------
Examples
========

Examples demonstrating how to use treeple algorithms.

--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/blank.md:
--------------------------------------------------------------------------------
---
name: Blank issue
about: Create an issue without a template.

---

--------------------------------------------------------------------------------
/test_requirements.txt:
--------------------------------------------------------------------------------
joblib
pandas
pytest
pytest-cov
memory_profiler
flaky
tqdm
bottleneck

--------------------------------------------------------------------------------
/benchmarks_nonasv/README.md:
--------------------------------------------------------------------------------
A set of scripts that can be run to analyze runtime and performance of treeple
estimators.

--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
version: 2
updates:
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "weekly"

--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
[submodule "treeple/_lib/sklearn"]
    path = treeple/_lib/sklearn_fork
    url = https://github.com/neurodata/scikit-learn
    branch = submodulev3

--------------------------------------------------------------------------------
/examples/outlier_detection/README.txt:
--------------------------------------------------------------------------------
.. _outlier_examples:

Outlier-detection
-----------------

Examples concerning how to do outlier detection with decision trees.

--------------------------------------------------------------------------------
/examples/splitters/README.txt:
--------------------------------------------------------------------------------
.. _splitter_examples:

Decision-tree splitters
-----------------------

Examples demonstrating different node-splitting strategies for decision trees.

--------------------------------------------------------------------------------
/style_requirements.txt:
--------------------------------------------------------------------------------
mypy
black
isort
flake8
bandit
pydocstyle
codespell
toml
cython-lint
pre-commit
yamllint
toml-sort
ruff
rstcheck

--------------------------------------------------------------------------------
/examples/multiview/README.txt:
--------------------------------------------------------------------------------
.. _multiview_examples:

Multi-view learning with Decision-trees
---------------------------------------

Examples demonstrating multi-view learning using random forest variants.

--------------------------------------------------------------------------------
/examples/calibration/README.txt:
--------------------------------------------------------------------------------
.. _calibration_examples:

Calibrated decision trees via honesty
-------------------------------------

Examples demonstrating the usage of honest decision trees to obtain calibrated predictions.

--------------------------------------------------------------------------------
/examples/quantile_predictions/README.txt:
--------------------------------------------------------------------------------
.. _quantile_examples:

Quantile Predictions with Random Forest
---------------------------------------

Examples demonstrating how to generate quantile predictions using Random Forest variants.

--------------------------------------------------------------------------------
/examples/sklearn_vs_treeple/README.txt:
--------------------------------------------------------------------------------
.. _sklearn_examples:

Comparing sklearn and treeple decision trees
--------------------------------------------

Examples demonstrating the difference between sklearn and treeple decision trees.

--------------------------------------------------------------------------------
/examples/sparse_oblique_trees/README.txt:
--------------------------------------------------------------------------------
.. _sporf_examples:

Sparse oblique projections with oblique decision-trees
------------------------------------------------------

Examples demonstrating learning using oblique random forests.

--------------------------------------------------------------------------------
/treeple/datasets/tests/meson.build:
--------------------------------------------------------------------------------
python_sources = [
  '__init__.py',
  'test_hyppo.py',
  'test_multiview.py',
]

py.install_sources(
  python_sources,
  pure: false,
  subdir: 'treeple/datasets/tests'
)

--------------------------------------------------------------------------------
/treeple/datasets/meson.build:
--------------------------------------------------------------------------------
python_sources = [
  '__init__.py',
  'multiview.py',
  'hyppo.py',
]

py.install_sources(
  python_sources,
  pure: false,
  subdir: 'treeple/datasets'
)

subdir('tests')

--------------------------------------------------------------------------------
/doc/_templates/autosummary/function.rst:
--------------------------------------------------------------------------------
{{ fullname | escape | underline}}

.. currentmodule:: {{ module }}

.. autofunction:: {{ objname }}

.. _sphx_glr_backreferences_{{ fullname }}:

.. minigallery:: {{ fullname }}
    :add-heading:

--------------------------------------------------------------------------------
/treeple/stats/meson.build:
--------------------------------------------------------------------------------
python_sources = [
  '__init__.py',
  'forest.py',
  'utils.py',
  'permuteforest.py',
  'baseline.py',
]

py.install_sources(
  python_sources,
  pure: false,
  subdir: 'treeple/stats'
)

subdir('tests')

--------------------------------------------------------------------------------
/.yamllint.yml:
--------------------------------------------------------------------------------
extends: default

ignore: |
  treeple/_lib/
  .asv/

rules:
  line-length: disable
  document-start: disable
  truthy: disable
  comments: disable
  braces:
    forbid: false
    min-spaces-inside: 0
    max-spaces-inside: 1

--------------------------------------------------------------------------------
/treeple/experimental/meson.build:
--------------------------------------------------------------------------------
python_sources = [
  '__init__.py',
  'mutual_info.py',
  'simulate.py',
  'sdf.py',
  'monte_carlo.py',
]

py.install_sources(
  python_sources,
  pure: false,
  subdir: 'treeple/experimental'
)

subdir('tests')

--------------------------------------------------------------------------------
/treeple/experimental/tests/meson.build:
--------------------------------------------------------------------------------
python_sources = [
  '__init__.py',
  'test_mutual_info.py',
  'test_simulate.py',
  'test_sdf.py',
  'test_monte_carlo.py',
]

py.install_sources(
  python_sources,
  pure: false,
  subdir: 'treeple/experimental/tests'
)

--------------------------------------------------------------------------------
/doc/use.rst:
--------------------------------------------------------------------------------
:orphan:

Examples using treeple
======================

To use treeple effectively, look through the examples here
to learn everything you need!

.. rstcheck: ignore-next-code-block
.. include:: auto_examples/index.rst
   :start-after: :orphan:

--------------------------------------------------------------------------------
/treeple/stats/tests/meson.build:
--------------------------------------------------------------------------------
python_sources = [
  '__init__.py',
  'test_forest.py',
  'test_baseline.py',
  'test_coleman.py',
  'test_utils.py',
  'test_permuteforest.py',
]

py.install_sources(
  python_sources,
  pure: false,
  subdir: 'treeple/stats/tests'
)

--------------------------------------------------------------------------------
/treeple/conftest.py:
--------------------------------------------------------------------------------
import pytest

# With the following global module marker,
# monitoring is disabled by default:
pytestmark = [pytest.mark.monitor_skip_test]


def pytest_configure(config):
    """Set up pytest markers."""
    config.addinivalue_line("markers", "slowtest: mark test as slow")

--------------------------------------------------------------------------------
/examples/treeple/README.txt:
--------------------------------------------------------------------------------
.. _treeple:

Treeple for Hypothesis Testing
------------------------------

Examples concerning how to use treeple as a hypothesis-testing tool.
Tutorials include estimating true statistics with true posterior functions,
using forests to calculate statistic estimates, and calculating p-values.

--------------------------------------------------------------------------------
/treeple/ensemble/meson.build:
--------------------------------------------------------------------------------
python_sources = [
  '__init__.py',
  '_supervised_forest.py',
  '_unsupervised_forest.py',
  '_honest_forest.py',
  '_eiforest.py',
  '_multiview.py',
  '_extensions.py',
]

py.install_sources(
  python_sources,
  pure: false,
  subdir: 'treeple/ensemble'
)

--------------------------------------------------------------------------------
/.github/label-globs.yml:
--------------------------------------------------------------------------------
Cython:
  - treeple/**/*.pyx.*
  - treeple/**/*.pxd.*
  - treeple/**/*.pxi.*

C/C++:
  - treeple/**/*.c
  - treeple/**/*.c.in
  - treeple/**/*.c.old
  - treeple/**/*.h
  - treeple/**/*.h.in
  - treeple/**/*.cpp
  - treeple/**/*.cc
  - treeple/**/*.cxx
  - treeple/**/*.hpp

--------------------------------------------------------------------------------
/doc/user_guide.rst:
--------------------------------------------------------------------------------
.. Places parent toc into the sidebar

:parenttoc: True

.. title:: User guide: contents

.. _user_guide:

==========
User Guide
==========

.. toctree::
   :numbered:
   :maxdepth: 3

   modules/supervised_tree
   modules/unsupervised_tree
   modules/ensemble

--------------------------------------------------------------------------------
/treeple/datasets/__init__.py:
--------------------------------------------------------------------------------
from .hyppo import (
    approximate_clf_mutual_information,
    approximate_clf_mutual_information_with_monte_carlo,
    make_marron_wand_classification,
    make_quadratic_classification,
    make_trunk_classification,
    make_trunk_mixture_classification,
)
from .multiview import make_gaussian_mixture, make_joint_factor_model

--------------------------------------------------------------------------------
/treeple/experimental/__init__.py:
--------------------------------------------------------------------------------
from . import mutual_info, sdf, simulate
from .monte_carlo import conditional_resample
from .mutual_info import (
    cmi_from_entropy,
    cmi_gaussian,
    entropy_gaussian,
    entropy_weibull,
    mi_from_entropy,
    mi_gamma,
    mi_gaussian,
    mutual_info_ksg,
)
from .sdf import StreamDecisionForest

--------------------------------------------------------------------------------
/treeple/tree/_sklearn_splitter.pxd:
--------------------------------------------------------------------------------
from .._lib.sklearn.utils._typedefs cimport float32_t, int32_t, intp_t

# This defines c-importable functions for other cython files

# TODO: remove these files when sklearn merges the refactor defining these in pxd files
# https://github.com/scikit-learn/scikit-learn/pull/25606
cdef void sort(float32_t* Xf, intp_t* samples, intp_t n) noexcept nogil

--------------------------------------------------------------------------------
/treeple/tests/meson.build:
--------------------------------------------------------------------------------
python_sources = [
  '__init__.py',
  'test_supervised_forest.py',
  'test_unsupervised_forest.py',
  'test_neighbors.py',
  'test_honest_forest.py',
  'test_eiforest.py',
  'test_multiview_forest.py',
  'test_extensions.py',
]

py.install_sources(
  python_sources,
  pure: false,
  subdir: 'treeple/tests'
)

--------------------------------------------------------------------------------
/treeple/tree/tests/meson.build:
--------------------------------------------------------------------------------
python_sources = [
  '__init__.py',
  'test_tree.py',
  'test_utils.py',
  'test_honest_tree.py',
  'test_honest_prune.py',
  'test_marginal.py',
  'test_all_trees.py',
  'test_unsupervised_tree.py',
  'test_multiview.py',
]

py.install_sources(
  python_sources,
  pure: false,
  subdir: 'treeple/tree/tests'
)

--------------------------------------------------------------------------------
/treeple/stats/__init__.py:
--------------------------------------------------------------------------------
from .baseline import build_cv_forest, build_permutation_forest
from .forest import build_coleman_forest, build_oob_forest
from .permuteforest import PermutationHonestForestClassifier

__all__ = [
    "build_cv_forest",
    "build_oob_forest",
    "build_coleman_forest",
    "build_permutation_forest",
    "PermutationHonestForestClassifier",
]

--------------------------------------------------------------------------------
/treeple/tree/_marginal.pxd:
--------------------------------------------------------------------------------
import numpy as np

cimport numpy as cnp

from .._lib.sklearn.tree._tree cimport BaseTree, Node
from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t, uint8_t, uint32_t


cpdef apply_marginal_tree(
    BaseTree tree,
    object X,
    const intp_t[:] marginal_indices,
    intp_t traversal_method,
    uint8_t use_sample_weight,
    object random_state
)

--------------------------------------------------------------------------------
/treeple/tree/kernels.py:
--------------------------------------------------------------------------------
import numpy as np


def gaussian_kernel(shape, sigma=1.0, mu=0.0):
    """N-dimensional Gaussian kernel for the given shape.
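
    Illustrative doctest (added for clarity; the expected values follow
    directly from the normalization in the implementation below):

    >>> kernel = gaussian_kernel((5, 5), sigma=1.0)
    >>> kernel.shape
    (5, 5)
    >>> float(round(kernel.sum(), 6))  # values are normalized to sum to 1
    1.0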

    See: https://gist.github.com/liob/e784775e882b83749cb3bbcef480576e
    """
    m = np.meshgrid(*[np.linspace(-1, 1, s) for s in shape])
    d = np.sqrt(np.sum([x * x for x in m], axis=0))
    g = np.exp(-((d - mu) ** 2 / (2.0 * sigma**2)))
    return g / np.sum(g)

--------------------------------------------------------------------------------
/spin:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#
# Example stub for running `python -m spin`
#
# Copy this into your project root.

import os
import runpy
import sys

sys.path.remove(os.path.abspath(os.path.dirname(sys.argv[0])))
try:
    runpy.run_module("spin", run_name="__main__")
except ImportError:
    print("Cannot import spin; please install it using")
    print()
    print("  pip install spin")
    print()
    sys.exit(1)

--------------------------------------------------------------------------------
/doc/_templates/autosummary/class.rst:
--------------------------------------------------------------------------------
..
    The empty line below should not be removed. It is added such that the `rst_prolog`
    is added before the :mod: directive. Otherwise, the rendering will show as a
    paragraph instead of a header.

:mod:`{{module}}`.{{objname}}
{{ underline }}==============

.. currentmodule:: {{ module }}

.. autoclass:: {{ objname }}

.. _sphx_glr_backreferences_{{ fullname }}:

.. raw:: html

18 | -------------------------------------------------------------------------------- /.github/workflows/pull_request_labeler.yml: -------------------------------------------------------------------------------- 1 | name: "Pull Request Labeler" 2 | on: 3 | pull_request_target: 4 | types: [created] 5 | 6 | permissions: 7 | contents: write # to add labels 8 | 9 | jobs: 10 | label_pull_request: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: thomasjpfan/labeler@v2.5.1 14 | continue-on-error: true 15 | if: github.repository == 'neurodata/treeple' 16 | with: 17 | repo-token: "${{ secrets.GITHUB_TOKEN }}" 18 | configuration-path: ".github/label-globs.yml" 19 | -------------------------------------------------------------------------------- /treeple/ensemble/__init__.py: -------------------------------------------------------------------------------- 1 | from ._eiforest import ExtendedIsolationForest 2 | from ._honest_forest import HonestForestClassifier 3 | from ._multiview import MultiViewRandomForestClassifier 4 | from ._supervised_forest import ( 5 | ExtraObliqueRandomForestClassifier, 6 | ExtraObliqueRandomForestRegressor, 7 | ObliqueRandomForestClassifier, 8 | ObliqueRandomForestRegressor, 9 | PatchObliqueRandomForestClassifier, 10 | PatchObliqueRandomForestRegressor, 11 | ) 12 | from ._unsupervised_forest import UnsupervisedObliqueRandomForest, UnsupervisedRandomForest 13 | -------------------------------------------------------------------------------- /.github/workflows/cffconvert.yml: -------------------------------------------------------------------------------- 1 | name: cffconvert 2 | 3 | on: 4 | push: 5 | paths: 6 | - CITATION.cff 7 | pull_request: 8 | paths: 9 | - CITATION.cff 10 | 11 | jobs: 12 | validate: 13 | name: "validate" 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Check out a copy of the repository 17 | uses: actions/checkout@v4 18 | 19 | - name: Check whether the citation metadata from CITATION.cff is valid 20 | uses: citation-file-format/cffconvert-github-action@2.0.0 21 | with: 22 | args: "--validate" 23 | -------------------------------------------------------------------------------- /doc/whats_new/changelog_legend.inc: -------------------------------------------------------------------------------- 1 | Legend for changelogs 2 | --------------------- 3 | 4 | - |MajorFeature|: something big that you couldn't do before. 5 | - |Feature|: something that you couldn't do before. 6 | - |Efficiency|: an existing feature now may not require as much computation or 7 | memory. 8 | - |Enhancement|: a miscellaneous minor improvement. 9 | - |Fix|: something that previously didn't work as documentated -- or according 10 | to reasonable expectations -- should now work. 11 | - |API|: you will need to change your code to have the same effect in the 12 | future; or a feature will be removed in the future. 13 | -------------------------------------------------------------------------------- /treeple/_build_utils/gcc_build_bitness.py: -------------------------------------------------------------------------------- 1 | #!python 2 | """ Detect bitness (32 or 64) of Mingw-w64 gcc build target on Windows. 
"""

import re
from subprocess import run


def main():
    res = run(["gcc", "-v"], check=True, text=True, capture_output=True)
    target = re.search(r"^Target: (.*)$", res.stderr, flags=re.M).groups()[0]
    if target.startswith("i686"):
        print("32")
    elif target.startswith("x86_64"):
        print("64")
    else:
        raise RuntimeError("Could not detect Mingw-w64 bitness")


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/treeple/tree/honesty/meson.build:
--------------------------------------------------------------------------------
tree_extension_metadata = {
  '_honest_prune':
    {'sources': ['_honest_prune.pyx'],
     'override_options': ['cython_language=cpp', 'optimization=3']},
}

foreach ext_name, ext_dict : tree_extension_metadata
  py.extension_module(
    ext_name,
    ext_dict.get('sources'),
    dependencies: [np_dep],
    override_options : ext_dict.get('override_options', []),
    c_args: c_args,
    cython_args: cython_c_args,
    subdir: 'treeple/tree/honesty',
    install: true,
  )
endforeach


py.install_sources(
  subdir: 'treeple/tree/honesty'  # Folder relative to site-packages to install to
)

--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: 'Feature request'
assignees: ''

---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.

**Additional context**
Add any other context or screenshots about the feature request here.

--------------------------------------------------------------------------------
/treeple/tree/manifold/meson.build:
--------------------------------------------------------------------------------
tree_extension_metadata = {
  '_morf_splitter':
    {'sources': ['_morf_splitter.pyx'],
     'override_options': ['cython_language=cpp', 'optimization=3']},
}

foreach ext_name, ext_dict : tree_extension_metadata
  py.extension_module(
    ext_name,
    ext_dict.get('sources'),
    dependencies: [np_dep],
    override_options : ext_dict.get('override_options', []),
    c_args: c_args,
    cython_args: cython_c_args,
    subdir: 'treeple/tree/manifold',
    install: true,
  )
endforeach


py.install_sources(
  subdir: 'treeple/tree/manifold'  # Folder relative to site-packages to install to
)

--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
[flake8]
max-line-length = 100

ignore =
    # these rules don't play well with black
    # whitespace before ':'
    E203
    # line break before binary operator
    W503
    E241,E305,W504,W605,E731
    E402

exclude =
    .git
    .github
    .venv
    .mypy_cache
    .pytest_cache
    .circleci
    paper
    doc/_build
    doc/generated
    doc/auto_examples
    validation
    build
    build-install
    dist
    treeple/_lib/
    .asv
    env

per-file-ignores =
    # __init__.py files are allowed to have unused imports
    */__init__.py:F401
    */**/__init__.py:F401

--------------------------------------------------------------------------------
/doc/whats_new.rst:
--------------------------------------------------------------------------------
:orphan:

.. _whats_new:

.. include:: whats_new/_contributors.rst

Release History
===============

Release notes for all treeple releases are linked on this page.

**Tip:** `Subscribe to treeple releases `__
on libraries.io to be notified when new versions are released.

.. toctree::
   :maxdepth: 1

   Version 0.1
   Version 0.2
   Version 0.3
   Version 0.4
   Version 0.5
   Version 0.6
   Version 0.7
   Version 0.8
   Version 0.9
   Version 0.10 (Unreleased)

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Distribution / packaging
.Python
dist/
*.egg*
build
build-install/
coverage
*.xml
.venv
.pymon
.coverage.*

commit.txt
treeple/_lib/sklearn/

*.png
_data

# Sphinx documentation
doc/_build/
doc/generated/
doc/auto_examples/
doc/auto_tutorials/
doc/modules/generated/
doc/sphinxext/cachedir
pip-log.txt
.coverage
tags
doc/coverages
doc/samples
cover
examples/*.jpg
examples/**/*.jpg

env/
html/
results/
scikit-learn/
benchmarks/cache/

# Pycharm
.idea/

*.pyc

*.so
*.cpp
*.c

.cache
.pytest_cache
.ipynb_checkpoints
.DS_Store
.vscode/

__pycache__

# Profiling
profiling/
*.prof

--------------------------------------------------------------------------------
/doc/make.bat:
--------------------------------------------------------------------------------
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build
set SPHINXPROJ=treeple

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%

:end
popd

--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
# These are supported funding model platforms

github: [adam2392, PSSF23, sampan501, SUKI-O]  # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
patreon:  # Replace with a single Patreon username
open_collective:  # Replace with a single Open Collective username
ko_fi:  # Replace with a single Ko-fi username
tidelift:  # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
community_bridge:  # Replace with a single Community Bridge project-name e.g., cloud-foundry
liberapay:  # Replace with a single Liberapay username
issuehunt:  # Replace with a single IssueHunt username
lfx_crowdfunding:  # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
polar:  # Replace with a single Polar username
buy_me_a_coffee: adam2392  # Replace with a single Buy Me a Coffee username
custom:  # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']

--------------------------------------------------------------------------------
/doc/_templates/layout.html:
--------------------------------------------------------------------------------
{%- extends "pydata_sphinx_theme/layout.html" %}

{% block fonts %}




{% endblock %}

{% block extrahead %}



{{ super() }}
{% endblock %}

--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------

#### Reference Issues/PRs


#### What does this implement/fix? Explain your changes.


#### Any other comments?

--------------------------------------------------------------------------------
/doc/_static/style.css:
--------------------------------------------------------------------------------
a[class^="sphx-glr-backref-module-scikit_tree"] {
    /* make all treeple backrefs bold */
    font-weight: 800;
}

/* Disable hyphenation in API reference table for Webkit-based browsers
   to work around alignment bug */
#api-documentation table p {
    -webkit-hyphens: none;
}

/* Hide version number from top-left location in the navbar */
.navbar-version {
    display: none;
}

html {
    font-size: 16px;
}

h1 {
    font-size: 1.6rem;
}

h2 {
    font-size: 1.3rem;
}

h3 {
    font-size: 1rem;
    font-weight: bold;
}

h4 {
    font-size: 1rem;
}

.footer {
    margin-top: 3em;
    padding-top: 1em;
}

/* Links in the Note boxes */
.note a {
    color: blue;
    text-decoration: underline;
}

.note a:hover {
    color: blue;
    font-weight: bold;
    text-decoration: underline;
}

/* Links in "Note" boxes */
.alert-info a code span {
    color: blue;
}

--------------------------------------------------------------------------------
/doc/whats_new/v0.10.rst:
--------------------------------------------------------------------------------
:orphan:

.. include:: _contributors.rst
.. currentmodule:: treeple

.. _current:

Version 0.10
============

**In Development**

Changelog
---------

- |Feature| Calculations involving nans in ``treeple.stats.utils`` now use the
  ``bottleneck`` library for faster computation. By `Ryan Hausen`_ (:pr:`#306`)
- |Feature| Added a sparse implementation of `treeple.stats.forest.build_coleman_forest`
  that uses the `scipy.sparse` module. By `Ryan Hausen`_ (:pr:`#317`)
- |Feature| :class:`treeple.tree.HonestTreeClassifier` now has a ``honest_method`` parameter
  that enables the user to turn on pruning of the tree, such that there are no
  empty leaf predictions. This brings the model closer to the implementation in GRF in R.
  By `Adam Li`_ (:pr:`#286`)


Code and Documentation Contributors
-----------------------------------

Thanks to everyone who has contributed to the maintenance and improvement of
the project since version inception, including:

* `Adam Li`_
* `Ryan Hausen`_

--------------------------------------------------------------------------------
/.github/workflows/circle_artifacts.yml:
--------------------------------------------------------------------------------
name: CircleCI artifacts redirector
on: [status]

# Restrict the permissions granted to the use of secrets.GITHUB_TOKEN in this
# github actions workflow:
# https://docs.github.com/en/actions/security-guides/automatic-token-authentication
permissions: read-all

jobs:
  circleci_artifacts_redirector_job:
    runs-on: ubuntu-20.04
    if: "github.repository == 'neurodata/treeple' && github.event.context == 'ci/circleci: build_docs'"
    permissions:
      statuses: write
    name: Run CircleCI artifacts redirector
    steps:
      - name: GitHub Action step
        # The final step references `steps.step1.outputs.url`, so this step
        # needs the matching id.
        id: step1
        uses: larsoner/circleci-artifacts-redirector-action@master
        with:
          repo-token: ${{ secrets.GITHUB_TOKEN }}
          api-token: ${{ secrets.CIRCLECI_TOKEN }}
          artifact-path: 0/dev/index.html
          circleci-jobs: build_docs
          job-title: Check the rendered docs here!

      - name: Check the URL
        if: github.event.status != 'pending'
        run: |
          curl --fail ${{ steps.step1.outputs.url }} | grep $GITHUB_SHA

--------------------------------------------------------------------------------
/treeple/tree/_utils.pxd:
--------------------------------------------------------------------------------
from libcpp.vector cimport vector

import numpy as np

cimport numpy as cnp

cnp.import_array()

from .._lib.sklearn.tree._splitter cimport SplitRecord
from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int32_t, intp_t, uint32_t

ctypedef fused vector_or_memview:
    vector[intp_t]
    intp_t[::1]
    intp_t[:]


cdef void fisher_yates_shuffle(
    vector_or_memview indices_to_sample,
    intp_t grid_size,
    uint32_t* random_state,
) noexcept nogil


cdef int rand_weighted_binary(
    float64_t p0,
    uint32_t* random_state
) noexcept nogil

cpdef unravel_index(
    intp_t index,
    cnp.ndarray[intp_t, ndim=1] shape
)

cpdef ravel_multi_index(
    intp_t[:] coords,
    const intp_t[:] shape
)

cdef void unravel_index_cython(
    intp_t index,
    const intp_t[:] shape,
    vector_or_memview coords
) noexcept nogil

cdef intp_t ravel_multi_index_cython(
    vector_or_memview coords,
    const intp_t[:] shape
) noexcept nogil

--------------------------------------------------------------------------------
/treeple/experimental/tests/test_simulate.py:
--------------------------------------------------------------------------------
from treeple.experimental.simulate import (
    simulate_helix,
    simulate_multivariate_gaussian,
    simulate_sphere,
)


# Test simulate_helix function
def test_simulate_helix():
    P, X, Y, Z = simulate_helix(n_samples=1000)
    assert len(P) == 1000
    assert len(X) == 1000
    assert len(Y) == 1000
    assert len(Z) == 1000

    # Add more specific tests if necessary


# Test simulate_sphere function
def test_simulate_sphere():
    latitude, longitude, Y1, Y2, Y3 = simulate_sphere(n_samples=1000)
    assert len(latitude) == 1000
    assert len(longitude) == 1000
    assert len(Y1) == 1000
    assert len(Y2) == 1000
    assert len(Y3) == 1000

    # Add more specific tests if necessary


# Test simulate_multivariate_gaussian function
def test_simulate_multivariate_gaussian():
    data, mean, cov = simulate_multivariate_gaussian(d=2, n_samples=1000)
    assert data.shape == (1000, 2)
    assert mean.shape == (2,)
    assert cov.shape == (2, 2)

    # Add more specific tests if necessary

--------------------------------------------------------------------------------
/doc/whats_new/v0.9.rst:
--------------------------------------------------------------------------------
:orphan:

.. include:: _contributors.rst
.. currentmodule:: treeple

.. _v0_9:

Version 0.9
===========

This release includes a rename of the package from ``scikit-tree`` to ``treeple``.
Users can replace their previous usage as follows:
``import sktree`` becomes ``import treeple``,
``from sktree import tree`` becomes ``from treeple import tree``, and
``from sktree import ...`` becomes ``from treeple import ...``.

Note that the previous version of the package will still be available under the name ``scikit-tree`` on PyPI.

Changelog
---------

- |API| Rename the package to ``treeple``. By `SUKI-O`_ (:pr:`#292`)
- |Fix| Fixed a bug in the predict_proba function of the :class:`treeple.HonestForestClassifier` where posteriors
  estimated on an empty leaf with the ``ignore`` prior would result in ``np.nan``
  values for all trees on that sample.
  By `Haoyin Xu`_ (:pr:`#291`)

Code and Documentation Contributors
-----------------------------------

Thanks to everyone who has contributed to the maintenance and improvement of
the project since version inception, including:

* `Adam Li`_
* `SUKI-O`_
* `Haoyin Xu`_

--------------------------------------------------------------------------------
/treeple/tree/unsupervised/meson.build:
--------------------------------------------------------------------------------
tree_extension_metadata = {
  '_unsup_criterion':
    {'sources': ['_unsup_criterion.pyx'],
     'override_options': ['cython_language=cpp', 'optimization=3']},
  '_unsup_splitter':
    {'sources': ['_unsup_splitter.pyx'],
     'override_options': ['cython_language=cpp', 'optimization=3']},
  '_unsup_tree':
    {'sources': ['_unsup_tree.pyx'],
     'override_options': ['cython_language=cpp', 'optimization=3']},
  '_unsup_oblique_splitter':
    {'sources': ['_unsup_oblique_splitter.pyx'],
     'override_options': ['cython_language=cpp', 'optimization=3']},
  '_unsup_oblique_tree':
    {'sources': ['_unsup_oblique_tree.pyx'],
     'override_options': ['cython_language=cpp', 'optimization=3']},
}

foreach ext_name, ext_dict : tree_extension_metadata
  py.extension_module(
    ext_name,
    ext_dict.get('sources'),
    dependencies: [np_dep],
    override_options : ext_dict.get('override_options', []),
    c_args: c_args,
    cython_args: cython_c_args,
    subdir: 'treeple/tree/unsupervised',
    install: true,
  )
endforeach


py.install_sources(
  subdir: 'treeple/tree/unsupervised'  # Folder relative to site-packages to install to
)
--------------------------------------------------------------------------------
/treeple/tree/__init__.py:
--------------------------------------------------------------------------------
from .._lib.sklearn.tree import (
    DecisionTreeClassifier,
    DecisionTreeRegressor,
    ExtraTreeClassifier,
    ExtraTreeRegressor,
)
from ._classes import (
    ExtraObliqueDecisionTreeClassifier,
    ExtraObliqueDecisionTreeRegressor,
    ObliqueDecisionTreeClassifier,
    ObliqueDecisionTreeRegressor,
    PatchObliqueDecisionTreeClassifier,
    PatchObliqueDecisionTreeRegressor,
    UnsupervisedDecisionTree,
    UnsupervisedObliqueDecisionTree,
)
from ._honest_tree import HonestTreeClassifier
from ._multiview import MultiViewDecisionTreeClassifier
from ._neighbors import compute_forest_similarity_matrix

__all__ = [
    "ExtraObliqueDecisionTreeClassifier",
    "ExtraObliqueDecisionTreeRegressor",
    "compute_forest_similarity_matrix",
    "UnsupervisedDecisionTree",
    "UnsupervisedObliqueDecisionTree",
    "ObliqueDecisionTreeClassifier",
    "ObliqueDecisionTreeRegressor",
    "PatchObliqueDecisionTreeClassifier",
    "PatchObliqueDecisionTreeRegressor",
    "HonestTreeClassifier",
    "DecisionTreeClassifier",
    "DecisionTreeRegressor",
    "ExtraTreeClassifier",
    "ExtraTreeRegressor",
    "MultiViewDecisionTreeClassifier",
]

--------------------------------------------------------------------------------
/.github/workflows/style.yml:
--------------------------------------------------------------------------------
name: "Style checks"

concurrency:
  group: ${{ github.workflow }}-${{ github.event.number }}-${{ github.event.type }}
  cancel-in-progress: true

on:
  pull_request:
    paths:
      - "**.py"
      - "**.pxd"
      - "**.pyx"
  push:
    branches: [main]
    paths:
      - "**.py"
    tags:
      - "v*.*.*"
  workflow_dispatch:

permissions:
  contents: read  # to fetch code (actions/checkout)

jobs:
  style:
    name: Formatting, lint, style, and type-checks
    timeout-minutes: 10
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Setup Python 3.11
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
          architecture: "x64"

      - name: Install packages for Ubuntu
        run: |
          sudo apt-get update
          sudo apt-get install -y libopenblas-dev libatlas-base-dev liblapack-dev

      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -r style_requirements.txt

      # check formatting of the code style
      - name: Check code formatting
        run: make pre-commit

--------------------------------------------------------------------------------
/doc/_static/versions.json:
--------------------------------------------------------------------------------
[
  {
    "name": "0.10",
    "version": "dev",
    "url": "https://docs.neurodata.io/treeple/dev/"
  },
  {
    "name": "0.9",
    "version": "0.9",
    "url": "https://docs.neurodata.io/treeple/v0.9/"
  },
  {
    "name": "0.8",
    "version": "0.8",
    "url": "https://docs.neurodata.io/treeple/v0.8/"
  },
  {
    "name": "0.7",
    "version": "0.7",
    "url": "https://docs.neurodata.io/treeple/v0.7/"
  },
  {
    "name": "0.6",
    "version": "0.6",
    "url": "https://docs.neurodata.io/treeple/v0.6/"
  },
  {
"name": "0.5", 29 | "version": "0.5", 30 | "url": "https://docs.neurodata.io/treeple/v0.5/" 31 | }, 32 | { 33 | "name": "0.4", 34 | "version": "0.4", 35 | "url": "https://docs.neurodata.io/treeple/v0.4/" 36 | }, 37 | { 38 | "name": "0.3", 39 | "version": "0.3", 40 | "url": "https://docs.neurodata.io/treeple/v0.3/" 41 | }, 42 | { 43 | "name": "0.2", 44 | "version": "0.2", 45 | "url": "https://docs.neurodata.io/treeple/v0.2/" 46 | }, 47 | { 48 | "name": "0.1", 49 | "version": "0.1", 50 | "url": "https://docs.neurodata.io/treeple/v0.1/" 51 | } 52 | ] 53 | -------------------------------------------------------------------------------- /benchmarks/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import balanced_accuracy_score, r2_score 3 | 4 | 5 | def neg_mean_inertia(X, labels, centers): 6 | return -(np.asarray(X - centers[labels]) ** 2).sum(axis=1).mean() 7 | 8 | 9 | def make_gen_classif_scorers(caller): 10 | caller.train_scorer = balanced_accuracy_score 11 | caller.test_scorer = balanced_accuracy_score 12 | 13 | 14 | def make_gen_reg_scorers(caller): 15 | caller.test_scorer = r2_score 16 | caller.train_scorer = r2_score 17 | 18 | 19 | def neg_mean_data_error(X, U, V): 20 | return -np.sqrt(((X - U.dot(V)) ** 2).mean()) 21 | 22 | 23 | def make_dict_learning_scorers(caller): 24 | caller.train_scorer = lambda _, __: ( 25 | neg_mean_data_error( 26 | caller.X, caller.estimator.transform(caller.X), caller.estimator.components_ 27 | ) 28 | ) 29 | caller.test_scorer = lambda _, __: ( 30 | neg_mean_data_error( 31 | caller.X_val, 32 | caller.estimator.transform(caller.X_val), 33 | caller.estimator.components_, 34 | ) 35 | ) 36 | 37 | 38 | def explained_variance_ratio(Xt, X): 39 | return np.var(Xt, axis=0).sum() / np.var(X, axis=0).sum() 40 | 41 | 42 | def make_pca_scorers(caller): 43 | caller.train_scorer = lambda _, __: caller.estimator.explained_variance_ratio_.sum() 44 | caller.test_scorer = lambda _, __: ( 45 | explained_variance_ratio(caller.estimator.transform(caller.X_val), caller.X_val) 46 | ) 47 | -------------------------------------------------------------------------------- /treeple/tree/meson.build: -------------------------------------------------------------------------------- 1 | tree_extension_metadata = { 2 | '_sklearn_splitter': 3 | {'sources': ['_sklearn_splitter.pyx'], 4 | 'override_options': ['cython_language=cpp', 'optimization=3']}, 5 | '_oblique_splitter': 6 | {'sources': ['_oblique_splitter.pyx'], 7 | 'override_options': ['cython_language=cpp', 'optimization=3']}, 8 | '_oblique_tree': 9 | {'sources': ['_oblique_tree.pyx'], 10 | 'override_options': ['cython_language=cpp', 'optimization=3']}, 11 | '_utils': 12 | {'sources': ['_utils.pyx'], 13 | 'override_options': ['cython_language=cpp', 'optimization=3']}, 14 | '_marginal': 15 | {'sources': ['_marginal.pyx'], 16 | 'override_options': ['cython_language=cpp', 'optimization=3']}, 17 | } 18 | 19 | foreach ext_name, ext_dict : tree_extension_metadata 20 | py.extension_module( 21 | ext_name, 22 | ext_dict.get('sources'), 23 | dependencies: [np_dep], 24 | override_options : ext_dict.get('override_options', []), 25 | c_args: c_args, 26 | cython_args: cython_c_args, 27 | subdir: 'treeple/tree', 28 | install: true, 29 | ) 30 | endforeach 31 | 32 | python_sources = [ 33 | '__init__.py', 34 | '_classes.py', 35 | '_multiview.py', 36 | '_neighbors.py', 37 | '_honest_tree.py', 38 | '_marginalize.py', 39 | ] 40 | 41 | py.install_sources( 42 | 
  python_sources,
  subdir: 'treeple/tree'  # Folder relative to site-packages to install to
)

subdir('tests')
subdir('unsupervised')
subdir('manifold')
subdir('honesty')

--------------------------------------------------------------------------------
/treeple/tree/unsupervised/_unsup_oblique_tree.pxd:
--------------------------------------------------------------------------------
# distutils: language = c++

# Authors: Adam Li
#
# License: BSD 3 clause

# See _unsup_oblique_tree.pyx for details.

import numpy as np

cimport numpy as cnp
from libcpp.vector cimport vector

from ..._lib.sklearn.tree._splitter cimport SplitRecord
from ..._lib.sklearn.tree._tree cimport Node
from ..._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t
from .._oblique_splitter cimport ObliqueSplitRecord
from ._unsup_tree cimport UnsupervisedTree


cdef class UnsupervisedObliqueTree(UnsupervisedTree):
    cdef vector[vector[float32_t]] proj_vec_weights  # (capacity, n_features) array of projection vectors
    cdef vector[vector[intp_t]] proj_vec_indices  # (capacity, n_features) array of projection vectors

    # overridden methods
    cdef int _resize_c(
        self,
        intp_t capacity=*
    ) except -1 nogil
    cdef int _set_split_node(
        self,
        SplitRecord* split_node,
        Node *node,
        intp_t node_id,
    ) except -1 nogil
    cdef float32_t _compute_feature(
        self,
        const float32_t[:, :] X_ndarray,
        intp_t sample_index,
        Node *node
    ) noexcept nogil
    cdef void _compute_feature_importances(
        self,
        float64_t[:] importances,
        Node* node
    ) noexcept nogil

    cpdef cnp.ndarray get_projection_matrix(self)

--------------------------------------------------------------------------------
/treeple/tree/_oblique_tree.pxd:
--------------------------------------------------------------------------------
# distutils: language = c++

# Authors: Adam Li
#          Chester Huynh
#          Parth Vora
#
# License: BSD 3 clause

# See _oblique_tree.pyx for details.
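#
# Descriptive note (inferred from the declarations below): ObliqueTree extends
# scikit-learn's ``Tree`` with per-node projection vectors
# (``proj_vec_weights`` / ``proj_vec_indices``), so a split can test a linear
# combination of input features rather than a single feature.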
10 | 11 | import numpy as np 12 | 13 | cimport numpy as cnp 14 | from libcpp.vector cimport vector 15 | 16 | from .._lib.sklearn.tree._splitter cimport SplitRecord 17 | from .._lib.sklearn.tree._tree cimport Node, Tree, TreeBuilder 18 | from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t 19 | from ._oblique_splitter cimport ObliqueSplitRecord 20 | 21 | 22 | cdef class ObliqueTree(Tree): 23 | cdef vector[vector[float32_t]] proj_vec_weights # (capacity, n_features) array of projection vectors 24 | cdef vector[vector[intp_t]] proj_vec_indices # (capacity, n_features) array of projection vectors 25 | 26 | # overridden methods 27 | cdef int _resize_c( 28 | self, 29 | intp_t capacity=* 30 | ) except -1 nogil 31 | cdef int _set_split_node( 32 | self, 33 | SplitRecord* split_node, 34 | Node *node, 35 | intp_t node_id 36 | ) except -1 nogil 37 | cdef float32_t _compute_feature( 38 | self, 39 | const float32_t[:, :] X_ndarray, 40 | intp_t sample_index, 41 | Node *node 42 | ) noexcept nogil 43 | cdef void _compute_feature_importances( 44 | self, 45 | float64_t[:] importances, 46 | Node* node 47 | ) noexcept nogil 48 | 49 | cpdef cnp.ndarray get_projection_matrix(self) 50 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | # YAML 1.2 2 | --- 3 | # Metadata for citation of this software according to the CFF format (https://citation-file-format.github.io/) 4 | cff-version: 1.2.0 5 | title: "treeple: Modern decision-trees compatible with scikit-learn in Python." 6 | abstract: "treeple is a scikit-learn compatible API for building state-of-the-art decision trees. These include unsupervised trees, oblique trees, uncertainty trees, quantile trees and causal trees." 7 | authors: 8 | - given-names: Adam 9 | family-names: Li 10 | affiliation: "Department of Computer Science, Columbia University, New York, NY, USA" 11 | orcid: "https://orcid.org/0000-0001-8421-365X" 12 | - given-names: Sambit 13 | family-names: Panda 14 | affiliation: "Department of Biomedical Engineering, Johns Hopkins University, Baltimore, MD, USA" 15 | orcid: "https://orcid.org/0000-0001-8455-4243" 16 | - given-names: Haoyin 17 | family-names: Xu 18 | affiliation: "Department of Biomedical Engineering, Johns Hopkins University, Baltimore, MD, USA" 19 | orcid: "https://orcid.org/0000-0001-8235-4950" 20 | - given-names: Itsuki 21 | family-names: Ogihara 22 | affiliation: "Department of Biomedical Engineering, Johns Hopkins University, Baltimore, MD, USA" 23 | type: software 24 | repository-code: "https://github.com/neurodata/treeple" 25 | license: 'PolyForm-Noncommercial-1.0.0' 26 | keywords: 27 | - random forest 28 | - oblique trees 29 | - honest forests 30 | - statisical learning 31 | - machine learning 32 | message: >- 33 | Please cite this software using the metadata from 34 | 'preferred-citation' in the CITATION.cff file. 35 | -------------------------------------------------------------------------------- /doc/whats_new/v0.5.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | .. include:: _contributors.rst 4 | .. currentmodule:: treeple 5 | 6 | .. _v0_5: 7 | 8 | Version 0.5 9 | =========== 10 | 11 | This release includes a number of enhancements and bug fixes, mainly 12 | to the :class:`treeple.tree.MultiViewDecisionTreeClassifier`. 
Most notably, 13 | the ``max_features`` argument now supports an array of values, which 14 | applies a different ``max_features`` argument per feature view. 15 | 16 | Changelog 17 | --------- 18 | 19 | - |Enhancement| :class:`treeple.tree.MultiViewDecisionTreeClassifier` now 20 | rounds up the number of features to split on to the nearest integer when 21 | applying ``max_features`` to each feature view, by `Adam Li`_ (:pr:`#183`). 22 | - |Feature| :class:`treeple.tree.MultiViewDecisionTreeClassifier` now 23 | supports an array passed in for ``max_features``, which applies a different 24 | max_features argument per view, by `Adam Li`_ (:pr:`#183`). 25 | - |Fix| :class:`treeple.tree.MultiViewDecisionTreeClassifier` now correctly 26 | handles the case where there is one feature view that is exhausted, and 27 | another that is not for ``apply_max_features_per_feature_set = False``, 28 | by `Adam Li`_ (:pr:`#183`). 29 | - |Fix| ``treeple.stats.FeatureImportanceForestClassifier`` now correctly passes 30 | metric kwargs to the null distribution function, by `Adam Li`_ (:pr:`#183`). 31 | 32 | Code and Documentation Contributors 33 | ----------------------------------- 34 | 35 | Thanks to everyone who has contributed to the maintenance and improvement of 36 | the project since version inception, including: 37 | 38 | * `Adam Li`_ 39 | 40 | -------------------------------------------------------------------------------- /doc/whats_new/_contributors.rst: -------------------------------------------------------------------------------- 1 | 2 | .. 3 | This file maps contributor names to their URLs. It should mostly be used 4 | for core contributors, and occasionally for contributors who do not want 5 | their github page to be their URL target. Historically it was used to 6 | hyperlink all contributors' names, and ``:user:`` should now be preferred. 7 | It also defines other ReST substitutions. 8 | 9 | .. role:: raw-html(raw) 10 | :format: html 11 | 12 | .. role:: raw-latex(raw) 13 | :format: latex 14 | 15 | .. |MajorFeature| replace:: :raw-html:`Major Feature` :raw-latex:`{\small\sc [Major Feature]}` 16 | .. |Feature| replace:: :raw-html:`Feature` :raw-latex:`{\small\sc [Feature]}` 17 | .. |Efficiency| replace:: :raw-html:`Efficiency` :raw-latex:`{\small\sc [Efficiency]}` 18 | .. |Enhancement| replace:: :raw-html:`Enhancement` :raw-latex:`{\small\sc [Enhancement]}` 19 | .. |Fix| replace:: :raw-html:`Fix` :raw-latex:`{\small\sc [Fix]}` 20 | .. |API| replace:: :raw-html:`API Change` :raw-latex:`{\small\sc [API Change]}` 21 | 22 | 23 | .. _Adam Li: https://adam2392.github.io 24 | .. _Jong Shin: https://github.com/jshinm 25 | .. _Sambit Panda: https://sampan.me 26 | .. _SUKI-O : https://github.com/SUKI-O 27 | .. _Ronan Perry : https://rflperry.github.io/ 28 | .. _Haoyin Xu : https://github.com/PSSF23 29 | .. _Yuxin Bai : https://github.com/YuxinB 30 | .. _Ryan Hausen : https://ryanhausen.github.io 31 | -------------------------------------------------------------------------------- /benchmarks/config.json: -------------------------------------------------------------------------------- 1 | { 2 | // "regular": Benchmarks are run on small to medium datasets. Each benchmark 3 | // is run multiple times and averaged. 4 | // "fast": Benchmarks are run on small to medium datasets. Each benchmark 5 | // is run only once. May provide unstable benchmarks. 6 | // "large_scale": Benchmarks are run on large datasets. Each benchmark is 7 | // run multiple times and averaged.
This profile is meant to 8 | // benchmark scalability and will take hours on single core. 9 | // Can be overridden by environment variable SKLBENCH_PROFILE. 10 | "profile": "regular", 11 | 12 | // List of values of n_jobs to use for estimators which accept this 13 | // parameter (-1 means all cores). An empty list means all values from 1 to 14 | // the maximum number of available cores. 15 | // Can be overridden by environment variable SKLBENCH_NJOBS. 16 | "n_jobs_vals": [1], 17 | 18 | // If true, fitted estimators are saved in ./cache/estimators/ 19 | // Can be overridden by environment variable SKLBENCH_SAVE_ESTIMATORS. 20 | "save_estimators": false, 21 | 22 | // Commit hash to compare estimator predictions with. 23 | // If null, predictions are not compared. 24 | // Can be overridden by environment variable SKLBENCH_BASE_COMMIT. 25 | "base_commit": null, 26 | 27 | // If false, the predict (resp. transform) method of the estimators won't 28 | // be benchmarked. 29 | // Can be overridden by environment variables SKLBENCH_PREDICT and 30 | // SKLBENCH_TRANSFORM. 31 | "bench_predict": true, 32 | "bench_transform": true 33 | } -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: "Release to PyPI" 2 | 3 | concurrency: 4 | group: ${{ github.workflow }}-${{ github.event.number }}-${{ github.event.type }} 5 | cancel-in-progress: true 6 | 7 | on: 8 | release: 9 | types: [published] 10 | workflow_run: 11 | workflows: [Build_Wheels] 12 | branches: [main] 13 | types: [completed] # This ensures it triggers only after the workflow completes 14 | workflow_dispatch: 15 | 16 | permissions: 17 | contents: read 18 | 19 | jobs: 20 | pypi: 21 | runs-on: ubuntu-latest 22 | if: github.event_name == 'release' 23 | permissions: 24 | id-token: write 25 | steps: 26 | - name: Get run ID of "Build_Wheels" workflow 27 | id: get-run-id 28 | run: | 29 | OTHER_REPO="${{ github.repository }}" 30 | WF_NAME="Build_Wheels" 31 | RUN_ID=`gh run --repo ${OTHER_REPO} list --workflow ${WF_NAME} --json databaseId --jq .[0].databaseId` 32 | echo "Detected latest run id of ${RUN_ID} for workflow ${WF_NAME}" 33 | echo "run-id=${RUN_ID}" >> "$GITHUB_OUTPUT" 34 | env: 35 | GH_TOKEN: ${{ github.token }} 36 | 37 | - name: Download artifact from "Build_Wheels" workflow 38 | uses: actions/download-artifact@v4 39 | with: 40 | name: dist # Match name used in build_wheels.yml upload artifact step 41 | path: dist 42 | github-token: ${{ github.token }} 43 | repository: ${{ github.repository }} 44 | run-id: ${{ steps.get-run-id.outputs.run-id }} 45 | 46 | - name: Show downloaded files 47 | run: ls -la 48 | 49 | - name: Publish to PyPI 50 | uses: pypa/gh-action-pypi-publish@release/v1 51 | -------------------------------------------------------------------------------- /doc/whats_new/v0.3.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | .. include:: _contributors.rst 4 | .. currentmodule:: treeple 5 | 6 | .. _v0_3: 7 | 8 | Version 0.3 9 | =========== 10 | 11 | This release includes a number of bug fixes and enhancements related to hypothesis testing with decision trees. 12 | Moreover, we have added an experimental multi-view decision tree / random forest, which considers multiple views 13 | of the data when building trees. 
The documentation page has also undergone an organizational overhaul, 14 | making it easier for users to find examples related to specific use cases. 15 | 16 | Changelog 17 | --------- 18 | - |Fix| Fixes a bug in consistency of train/test samples when ``random_state`` is not set in FeatureImportanceForestClassifier and FeatureImportanceForestRegressor, by `Adam Li`_ (:pr:`135`) 19 | - |Fix| Fixes a bug where covariate indices were not shuffled by default when running FeatureImportanceForestClassifier and FeatureImportanceForestRegressor test methods, by `Sambit Panda`_ (:pr:`140`) 20 | - |Enhancement| Add multi-view splitter for axis-aligned decision trees, by `Adam Li`_ (:pr:`129`) 21 | - |Enhancement| Add stratified sampling option to ``FeatureImportance*`` via the ``stratify`` keyword argument, by `Yuxin Bai`_ (:pr:`143`) 22 | - |Fix| Fixed usage of ``feature_importances_`` property in ``HonestForestClassifier``, by `Adam Li`_ (:pr:`156`) 23 | - |Fix| Fixed ``HonestForestClassifier`` to allow decision-trees from sklearn, albeit with a limited API, by `Adam Li`_ (:pr:`158`) 24 | 25 | Code and Documentation Contributors 26 | ----------------------------------- 27 | 28 | Thanks to everyone who has contributed to the maintenance and improvement of 29 | the project since version inception, including: 30 | 31 | * `Adam Li`_ 32 | * `Sambit Panda`_ 33 | * `Yuxin Bai`_ 34 | -------------------------------------------------------------------------------- /treeple/experimental/tests/test_mutual_info.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def nonlinear_gaussian_with_additive_noise(): 5 | """Nonlinear no-noise function with additive Gaussian noise. 6 | 7 | See: https://github.com/BiuBiuBiLL/NPEET_LNC/issues/4 8 | """ 9 | # first simulate multivariate Gaussian without noise 10 | 11 | # then add the noise 12 | 13 | # compute MI by computing the H(Y|X) and H(X) 14 | # H(Y|X) = np.log(noise_std) 15 | # H(X) = kNN K-L estimate with large # of samples 16 | pass 17 | 18 | 19 | def main(): 20 | d1 = [1, 1, 0] 21 | d2 = [1, 0, 1] 22 | d3 = [0, 1, 1] 23 | mat = [d1, d2, d3] 24 | tmat = np.transpose(mat) 25 | diag = [[3, 0, 0], [0, 1, 0], [0, 0, 1]] 26 | # mean = np.array([0, 0, 0]) 27 | cov = np.dot(tmat, np.dot(diag, mat)) 28 | print("covariance matrix") 29 | print(cov) 30 | print(tmat) 31 | 32 | 33 | def test_mi(): 34 | d1 = [1, 1, 0] 35 | d2 = [1, 0, 1] 36 | d3 = [0, 1, 1] 37 | mat = [d1, d2, d3] 38 | tmat = np.transpose(mat) 39 | diag = [[3, 0, 0], [0, 1, 0], [0, 0, 1]] 40 | # mean = np.array([0, 0, 0]) 41 | cov = np.dot(tmat, np.dot(diag, mat)) 42 | print("covariance matrix") 43 | print(cov) 44 | # true CMI I(x;y|z) = H(xz) + H(yz) - H(xyz) - H(z) for a joint Gaussian 45 | trueent = -0.5 * (3 + np.log(8.0 * np.pi * np.pi * np.pi * np.linalg.det(cov))) 46 | trueent += -0.5 * (1 + np.log(2.0 * np.pi * cov[2][2]))  # z sub 47 | trueent += 0.5 * ( 48 | 2 49 | + np.log( 50 | 4.0 * np.pi * np.pi * np.linalg.det([[cov[0][0], cov[0][2]], [cov[2][0], cov[2][2]]]) 51 | ) 52 | )  # xz sub 53 | trueent += 0.5 * ( 54 | 2 55 | + np.log( 56 | 4.0 * np.pi * np.pi * np.linalg.det([[cov[1][1], cov[1][2]], [cov[2][1], cov[2][2]]]) 57 | ) 58 | )  # yz sub 59 | print("true CMI(x:y|z)", trueent / np.log(2)) 60 | -------------------------------------------------------------------------------- /doc/whats_new/v0.8.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | .. include:: _contributors.rst 4 | .. currentmodule:: treeple 5 | 6 | ..
_v0_8: 7 | 8 | Version 0.8 9 | =========== 10 | 11 | This release fixes a major bug with (CO)MIGHT, where low sample sizes produced biased tree 12 | posteriors; the fix stratifies the sampling of the dataset to ensure that each class 13 | is represented in the bootstrap sample. Additionally, the release includes a number of bug fixes 14 | and improvements to the codebase. 15 | 16 | Changelog 17 | --------- 18 | 19 | - |Fix| Previously, missing values in the ``X`` input array for treeple estimators 20 | did not raise an error and were silently accepted, assuming the missing values were 21 | encoded as infinity values. This is now fixed, and the estimators will raise a 22 | ValueError if missing values are encountered in the ``X`` input array. 23 | By `Adam Li`_ (:pr:`#264`) 24 | - |Feature| Simulations in ``treeple.datasets.hyppo`` now throw a warning instead 25 | of an error when the number of samples is less than the number of dimensions. 26 | By `Sambit Panda`_ (:pr:`#279`) 27 | - |API| :class:`treeple.HonestForestClassifier` now has ``bootstrap=True`` as the default 28 | argument. By `Adam Li`_ (:pr:`#274`) 29 | - |API| Removed all instances of ``FeatureImportanceForestClassifier`` and outdated 30 | MIGHT code. By `Adam Li`_ (:pr:`#274`) 31 | - |Fix| Fixed a bug in the ``treeple.HonestForestClassifier`` where posteriors 32 | estimated on oob samples were biased when there was a low number of samples 33 | due to imbalance in the classes when ``bootstrap=True``. 34 | By `Adam Li`_ (:pr:`#283`) 35 | 36 | Code and Documentation Contributors 37 | ----------------------------------- 38 | 39 | Thanks to everyone who has contributed to the maintenance and improvement of 40 | the project since version inception, including: 41 | 42 | * `Adam Li`_ 43 | * `Sambit Panda`_ 44 | -------------------------------------------------------------------------------- /doc/sphinxext/doi_role.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | doilinks 4 | ~~~~~~~~ 5 | Extension to add links to DOIs. With this extension you can use e.g. 6 | :doi:`10.1016/S0022-2836(05)80360-2` in your documents. This will 7 | create a link to a DOI resolver 8 | (``https://doi.org/10.1016/S0022-2836(05)80360-2``). 9 | The link caption will be the raw DOI. 10 | You can also give an explicit caption, e.g. 11 | :doi:`Basic local alignment search tool <10.1016/S0022-2836(05)80360-2>`. 12 | 13 | :copyright: Copyright 2015 Jon Lund Steffensen. Based on extlinks by 14 | the Sphinx team. 15 | :license: BSD.
16 | """ 17 | 18 | from docutils import nodes, utils 19 | from sphinx.util.nodes import split_explicit_title 20 | 21 | 22 | def reference_role(typ, rawtext, text, lineno, inliner, options={}, content=[]): 23 | text = utils.unescape(text) 24 | has_explicit_title, title, part = split_explicit_title(text) 25 | if typ in ["arXiv", "arxiv"]: 26 | full_url = "https://arxiv.org/abs/" + part 27 | if not has_explicit_title: 28 | title = "arXiv:" + part 29 | pnode = nodes.reference(title, title, internal=False, refuri=full_url) 30 | return [pnode], [] 31 | if typ in ["doi", "DOI"]: 32 | full_url = "https://doi.org/" + part 33 | if not has_explicit_title: 34 | title = "DOI:" + part 35 | pnode = nodes.reference(title, title, internal=False, refuri=full_url) 36 | return [pnode], [] 37 | 38 | 39 | def setup_link_role(app): 40 | app.add_role("arxiv", reference_role, override=True) 41 | app.add_role("arXiv", reference_role, override=True) 42 | app.add_role("doi", reference_role, override=True) 43 | app.add_role("DOI", reference_role, override=True) 44 | 45 | 46 | def setup(app): 47 | app.connect("builder-inited", setup_link_role) 48 | return {"version": "0.1", "parallel_read_safe": True} 49 | -------------------------------------------------------------------------------- /doc/whats_new/v0.4.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | .. include:: _contributors.rst 4 | .. currentmodule:: treeple 5 | 6 | .. _v0_4: 7 | 8 | Version 0.4 9 | =========== 10 | 11 | This version patches some issues with the ``FeatureImportance*`` classes and also adds a feature to the 12 | `MultiViewDecisionTreeClassifier` class that allows one to scale the number of split candidates sampled per feature-set 13 | equally. 14 | 15 | Changelog 16 | --------- 17 | 18 | - |API| ``FeatureImportanceForest*`` now has a hyperparameter, ``permute_per_forest_fraction``, to control the number of permutations done per forest, by `Adam Li`_ (:pr:`145`) 19 | - |Enhancement| Add dataset generators for regression and classification and hypothesis testing, by `Adam Li`_ (:pr:`169`) 20 | - |Fix| Fixes a bug where ``FeatureImportanceForest*`` was unable to be run when calling ``statistic`` with ``covariate_index`` defined for MI, AUC metrics, by `Adam Li`_ (:pr:`164`) 21 | - |Enhancement| Add :func:`treeple.experimental.conditional_resample`, which allows conditional resampling of rows based on nearest-neighbors defined via a feature set, by `Adam Li`_ (:pr:`170`) 22 | - |Enhancement| Multi-view trees are now able to scale the sampling of split candidates at the same rate per feature-set, which means 'sqrt' would sample split candidates equal to the square root of each feature-set size, by `Adam Li`_ (:pr:`152`) 23 | - |Fix| Fixes a bug in :class:`treeple.tree.MultiViewDecisionTreeClassifier` where the max_features argument applied over 24 | more than two views with ``apply_max_features_per_set`` set to ``True`` resulted in an incorrect and oversampled 25 | number of max_features in the views after the first two, by `Adam Li`_ (:pr:`172`) 26 | 27 | Code and Documentation Contributors 28 | ----------------------------------- 29 | 30 | Thanks to everyone who has contributed to the maintenance and improvement of 31 | the project since version inception, including: 32 | 33 | * `Adam Li`_ 34 | 35 | -------------------------------------------------------------------------------- /doc/whats_new/v0.2.rst: -------------------------------------------------------------------------------- 1 |
:orphan: 2 | 3 | .. include:: _contributors.rst 4 | .. currentmodule:: treeple 5 | 6 | .. _v0_2: 7 | 8 | Version 0.2 9 | =========== 10 | 11 | This is a major release, with many new features and improvements. 12 | For instance, we have added a new implementation of the extended isolation forest, and 13 | enabled all decision trees to take advantage of ``partial_fit``, meaning trees have streaming 14 | capabilities. Moreover, we have added an analogous implementation of extra-trees for oblique-trees. 15 | Finally, this release includes a highly experimental feature for multivariate high-dimensional 16 | hypothesis testing using permutation forests and a feature importance testing forest. 17 | 18 | Changelog 19 | --------- 20 | - |Efficiency| Upgraded build process to rely on Cython 3.0+, by `Adam Li`_ (:pr:`109`) 21 | - |Feature| Allow decision trees to take advantage of ``partial_fit`` and ``monotonic_cst`` when available, by `Adam Li`_ (:pr:`109`) 22 | - |Feature| Implementation of ExtraObliqueDecisionTreeClassifier, ExtraObliqueDecisionTreeRegressor by `SUKI-O`_ (:pr:`75`) 23 | - |Efficiency| Around 1.5-2x speed improvement for unsupervised forests, by `Adam Li`_ (:pr:`114`) 24 | - |API| Allow ``sqrt`` and ``log2`` keywords to be used for ``min_samples_split`` parameter in unsupervised forests, by `Adam Li`_ (:pr:`114`) 25 | - |Feature| Implement extended isolation forest, by `Adam Li`_ (:pr:`101`) 26 | - |Feature| Implementation of StreamDecisionForest, by `Haoyin Xu`_ and `Adam Li`_ (:pr:`116`) 27 | - |Feature| Implementation of Permutation forests and a feature importance testing forest, by `Haoyin Xu`_, `Adam Li`_, `Sambit Panda`_ (:pr:`125`) 28 | 29 | Code and Documentation Contributors 30 | ----------------------------------- 31 | 32 | Thanks to everyone who has contributed to the maintenance and improvement of 33 | the project since version inception, including: 34 | 35 | * `Adam Li`_ 36 | * `SUKI-O`_ 37 | * `Haoyin Xu`_ 38 | * `Sambit Panda`_ 39 | -------------------------------------------------------------------------------- /treeple/tree/unsupervised/_unsup_splitter.pxd: -------------------------------------------------------------------------------- 1 | from ..._lib.sklearn.tree._splitter cimport BaseSplitter, SplitRecord 2 | from ..._lib.sklearn.tree._tree cimport ParentInfo 3 | from ..._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t, uint32_t 4 | from ._unsup_criterion cimport UnsupervisedCriterion 5 | 6 | 7 | cdef class UnsupervisedSplitter(BaseSplitter): 8 | """ 9 | Notable changes wrt scikit-learn: 10 | 1. `weighted_n_node_samples` is used as a stopping criterion and just used to 11 | keep count of the "number of samples (weighted)". All samples have a default weight 12 | of '1'. 13 | 2. `X` array instead of `y` array is stored as the criterions are computed over the X 14 | array. 15 | 3. The feature_values memoryview is a feature vector with shared memory among the splitter 16 | and the criterion object. This enables the splitter to assign values to it within the 17 | `node_split` function, and then `criterion` can automatically compute relevant statistics 18 | on the shared memoryview.
19 | """ 20 | 21 | # XXX: requires BaseSplitter to not define "criterion" 22 | cdef public UnsupervisedCriterion criterion # criterion computer 23 | cdef const float32_t[:, :] X # feature matrix 24 | cdef intp_t n_total_samples # store the total number of samples 25 | 26 | # Initialization method for unsupervised splitters 27 | cdef int init( 28 | self, 29 | const float32_t[:, :] X, 30 | const float64_t[:] sample_weight 31 | ) except -1 32 | 33 | # Overridden Methods from base class 34 | cdef int node_reset( 35 | self, 36 | intp_t start, 37 | intp_t end, 38 | float64_t* weighted_n_node_samples 39 | ) except -1 nogil 40 | cdef int node_split( 41 | self, 42 | ParentInfo* parent, 43 | SplitRecord* split, 44 | ) except -1 nogil 45 | cdef void node_value( 46 | self, 47 | float64_t* dest 48 | ) noexcept nogil 49 | cdef float64_t node_impurity( 50 | self 51 | ) noexcept nogil 52 | -------------------------------------------------------------------------------- /doc/sphinxext/allow_nan_estimators.py: -------------------------------------------------------------------------------- 1 | from contextlib import suppress 2 | 3 | from docutils import nodes 4 | from docutils.parsers.rst import Directive 5 | from sklearn.utils import all_estimators 6 | from sklearn.utils._test_common.instance_generator import _construct_instances 7 | from sklearn.utils._testing import SkipTest 8 | 9 | 10 | class AllowNanEstimators(Directive): 11 | @staticmethod 12 | def make_paragraph_for_estimator_type(estimator_type): 13 | intro = nodes.list_item() 14 | intro += nodes.strong(text="Estimators that allow NaN values for type ") 15 | intro += nodes.literal(text=f"{estimator_type}") 16 | intro += nodes.strong(text=":\n") 17 | exists = False 18 | lst = nodes.bullet_list() 19 | for name, est_class in all_estimators(type_filter=estimator_type): 20 | with suppress(SkipTest): 21 | # _construct_instances is a generator; take the first instance 22 | est = next(_construct_instances(est_class)) 23 | 24 | if est._get_tags().get("allow_nan"): 25 | module_name = ".".join(est_class.__module__.split(".")[:2]) 26 | class_title = f"{est_class.__name__}" 27 | class_url = f"./generated/{module_name}.{class_title}.html" 28 | item = nodes.list_item() 29 | para = nodes.paragraph() 30 | para += nodes.reference( 31 | class_title, text=class_title, internal=False, refuri=class_url 32 | ) 33 | exists = True 34 | item += para 35 | lst += item 36 | intro += lst 37 | return [intro] if exists else None 38 | 39 | def run(self): 40 | lst = nodes.bullet_list() 41 | for i in ["cluster", "regressor", "classifier", "transformer"]: 42 | item = self.make_paragraph_for_estimator_type(i) 43 | if item is not None: 44 | lst += item 45 | return [lst] 46 | 47 | 48 | def setup(app): 49 | app.add_directive("allow_nan_estimators", AllowNanEstimators) 50 | 51 | return { 52 | "version": "0.1", 53 | "parallel_read_safe": True, 54 | "parallel_write_safe": True, 55 | } 56 | -------------------------------------------------------------------------------- /doc/whats_new/v0.1.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | .. include:: _contributors.rst 4 | .. currentmodule:: treeple 5 | 6 | ..
_v0_1: 7 | 8 | Version 0.1 9 | =========== 10 | 11 | Changelog 12 | --------- 13 | - |Feature| Implementation of the two-means Unsupervised Random Forest, by `Adam Li`_ (:pr:`9`) 14 | - |Feature| Implementation of oblique Unsupervised Random Forest, by `Adam Li`_ (:pr:`11`) 15 | - |Feature| Implementation of manifold oblique Random Forest, by `Adam Li`_ (:pr:`21`) 16 | - |Feature| Implementation of fastBIC criterion for unsupervised tree models, by `Adam Li`_ and `Jong Shin`_ (:pr:`45`) 17 | - |Fix| Fix a bug in Patch oblique random forest that samples outside the data boundaries and adds a user guide, by `Adam Li`_ (:pr:`61`) 18 | - |Feature| MORF trees now can sample n-dimensional patches inside an n-dimensional structure sample and make any arbitrary axis discontinuous, by `Adam Li`_ (:pr:`63`) 19 | - |Feature| All tree types can compute similarity and dissimilarity matrices, by `Sambit Panda`_ and `Adam Li`_ (:pr:`64`) 20 | - |Feature| MORF trees now can normalize by feature weight per sample per feature column, by `Adam Li`_ (:pr:`67`) 21 | - |Feature| A general-kernel MORF is now implemented where users can pass in a kernel library, by `Adam Li`_ (:pr:`70`) 22 | - |Feature| Implementation of ObliqueDecisionTreeRegressor, PatchObliqueDecisionTreeRegressor, ObliqueRandomForestRegressor, PatchObliqueRandomForestRegressor, by `SUKI-O`_ (:pr:`72`) 23 | - |Feature| Implementation of HonestTreeClassifier, HonestForestClassifier, by `Sambit Panda`_, `Adam Li`_, `Ronan Perry`_ and `Haoyin Xu`_ (:pr:`57`) 24 | - |Feature| Implementation of (conditional) mutual information estimation via unsupervised tree models and added NearestNeighborsMetaEstimator by `Adam Li`_ (:pr:`83`) 25 | - |Feature| Add multi-output support to HonestTreeClassifier, HonestForestClassifier, by `Ronan Perry`_, `Haoyin Xu`_ and `Adam Li`_ (:pr:`86`) 26 | 27 | Code and Documentation Contributors 28 | ----------------------------------- 29 | 30 | Thanks to everyone who has contributed to the maintenance and improvement of 31 | the project since version inception, including: 32 | 33 | * `Adam Li`_ 34 | * `Sambit Panda`_ 35 | * `Ronan Perry`_ 36 | * `Haoyin Xu`_ 37 | -------------------------------------------------------------------------------- /treeple/tree/honesty/_honest_prune.pxd: -------------------------------------------------------------------------------- 1 | from ..._lib.sklearn.tree._criterion cimport Criterion 2 | from ..._lib.sklearn.tree._partitioner cimport shift_missing_values_to_left_if_required 3 | from ..._lib.sklearn.tree._splitter cimport SplitRecord, Splitter 4 | from ..._lib.sklearn.tree._tree cimport Node, ParentInfo, Tree 5 | from ..._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int8_t, intp_t, uint8_t, uint32_t 6 | 7 | 8 | # for each node, keep track of the node index and the parent index 9 | # within the tree's node array 10 | cdef struct PruningRecord: 11 | intp_t node_idx 12 | intp_t start 13 | intp_t end 14 | float64_t lower_bound 15 | float64_t upper_bound 16 | 17 | 18 | # TODO: this may break the notion of feature importances, as we don't set the node's impurity 19 | # at the child nodes. 
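#
# Note: the pruner reuses the Splitter machinery so that the honest (held-out)
# samples can be routed down the already-fitted ``tree``; helpers such as
# ``check_node_partition_conditions``, ``n_left_samples`` and ``n_right_samples``
# then decide, node by node, whether the learned split is still supported by
# those samples, and nodes that fail the check are pruned into leaves.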
20 | cdef class HonestPruner(Splitter): 21 | cdef Tree tree # The tree to be pruned 22 | cdef intp_t capacity # The maximum number of nodes in the pruned tree 23 | cdef intp_t pos # The current position to split left/right children 24 | cdef intp_t n_missing # The number of missing values in the feature currently considered 25 | cdef uint8_t missing_go_to_left 26 | 27 | # TODO: only supports sparse for now. 28 | cdef const float32_t[:, :] X 29 | 30 | cdef int init( 31 | self, 32 | object X, 33 | const float64_t[:, ::1] y, 34 | const float64_t[:] sample_weight, 35 | const uint8_t[::1] missing_values_in_feature_mask, 36 | ) except -1 37 | 38 | # This function is not used, and should be disabled for pruners 39 | cdef int node_split( 40 | self, 41 | ParentInfo* parent_record, 42 | SplitRecord* split, 43 | ) except -1 nogil 44 | 45 | cdef bint check_node_partition_conditions( 46 | self, 47 | SplitRecord* current_split, 48 | float64_t lower_bound, 49 | float64_t upper_bound 50 | ) noexcept nogil 51 | 52 | cdef inline intp_t n_left_samples( 53 | self 54 | ) noexcept nogil 55 | cdef inline intp_t n_right_samples( 56 | self 57 | ) noexcept nogil 58 | 59 | cdef int partition_samples( 60 | self, 61 | intp_t node_idx, 62 | ) noexcept nogil 63 | -------------------------------------------------------------------------------- /benchmarks/ensemble_supervised.py: -------------------------------------------------------------------------------- 1 | from treeple.ensemble import ObliqueRandomForestClassifier 2 | 3 | from .common import Benchmark, Estimator, Predictor 4 | from .datasets import ( 5 | _20newsgroups_highdim_dataset, 6 | _20newsgroups_lowdim_dataset, 7 | _synth_classification_dataset, 8 | ) 9 | from .utils import make_gen_classif_scorers 10 | 11 | 12 | class ObliqueRandomForestClassifierBenchmark(Predictor, Estimator, Benchmark): 13 | """ 14 | Benchmarks for ObliqueRandomForestClassifier. 15 | """ 16 | 17 | param_names = ["representation", "n_jobs"] 18 | params = (["dense", "sparse"], Benchmark.n_jobs_vals) 19 | 20 | def setup_cache(self): 21 | super().setup_cache() 22 | 23 | def make_data(self, params): 24 | representation, n_jobs = params 25 | 26 | if representation == "sparse": 27 | data = _20newsgroups_highdim_dataset() 28 | else: 29 | data = _20newsgroups_lowdim_dataset() 30 | 31 | return data 32 | 33 | def make_estimator(self, params): 34 | representation, n_jobs = params 35 | 36 | n_estimators = 500 if Benchmark.data_size == "large" else 100 37 | 38 | estimator = ObliqueRandomForestClassifier( 39 | n_estimators=n_estimators, 40 | min_samples_split=10, 41 | max_features="log2", 42 | n_jobs=n_jobs, 43 | random_state=0, 44 | ) 45 | 46 | return estimator 47 | 48 | def make_scorers(self): 49 | make_gen_classif_scorers(self) 50 | 51 | 52 | class ObliqueRandomForestClassifierBenchmarkSynth(Predictor, Estimator, Benchmark): 53 | """ 54 | Benchmarks for Oblique RF Classifier using synthetic classification data.
55 | """ 56 | 57 | param_names = [] 58 | params = () 59 | 60 | def setup_cache(self): 61 | super().setup_cache() 62 | 63 | def make_data(self, params): 64 | data = _synth_classification_dataset(n_samples=10000, n_features=100, n_classes=5) 65 | 66 | return data 67 | 68 | def make_estimator(self, params): 69 | estimator = ObliqueRandomForestClassifier(max_leaf_nodes=15, random_state=0) 70 | 71 | return estimator 72 | 73 | def make_scorers(self): 74 | make_gen_classif_scorers(self) 75 | -------------------------------------------------------------------------------- /treeple/tree/_neighbors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def compute_forest_similarity_matrix(forest, X): 5 | """Compute the similarity matrix of samples in X using a trained forest. 6 | 7 | As an intermediate calculation, the forest applies the dataset and gets 8 | the leaves for each sample. Then, the similarity matrix is computed by 9 | counting the number of times each pair of samples ends up in the same leaf. 10 | 11 | Parameters 12 | ---------- 13 | forest : BaseForest or BaseDecisionTree 14 | The fitted forest. 15 | X : array-like of shape (n_samples, n_features) 16 | The input data. 17 | 18 | Returns 19 | ------- 20 | aff_matrix : array-like of shape (n_samples, n_samples) 21 | The estimated similarity matrix. 22 | """ 23 | if hasattr(forest, "estimator_"): 24 | # apply to the leaves 25 | X_leaves = forest.apply(X) 26 | 27 | n_est = forest.n_estimators 28 | else: 29 | # apply to the leaves for a single tree 30 | X_leaves = forest.apply(X)[:, np.newaxis] 31 | n_est = 1 32 | 33 | aff_matrix = sum(np.equal.outer(X_leaves[:, i], X_leaves[:, i]) for i in range(n_est)) 34 | # normalize by the number of trees 35 | aff_matrix = np.divide(aff_matrix, n_est) 36 | return aff_matrix 37 | 38 | 39 | def _compute_distance_matrix(aff_matrix): 40 | """Private function to compute distance matrix after `compute_similarity_matrix`.""" 41 | dists = 1.0 - aff_matrix 42 | return dists 43 | 44 | 45 | # ported from https://github.com/neurodata/hyppo/blob/main/hyppo/independence/_utils.py 46 | class SimMatrixMixin: 47 | """Mixin class to calculate similarity and dissimilarity matrices. 48 | 49 | This augments tree/forest models with sklearn's nearest-neighbors API. 50 | """ 51 | 52 | def compute_similarity_matrix(self, X): 53 | """ 54 | Compute the similarity matrix of samples in X. 55 | 56 | Parameters 57 | ---------- 58 | X : array-like of shape (n_samples, n_features) 59 | The input data. 60 | 61 | Returns 62 | ------- 63 | sim_matrix : array-like of shape (n_samples, n_samples) 64 | The similarity matrix among the samples.
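
        Examples
        --------
        A minimal sketch of the underlying computation; this method simply
        forwards to :func:`compute_forest_similarity_matrix` defined above.
        Scikit-learn's ``RandomForestClassifier`` stands in purely for
        illustration, since any fitted forest exposing ``apply`` behaves the
        same way.

        >>> from sklearn.datasets import make_classification
        >>> from sklearn.ensemble import RandomForestClassifier
        >>> from treeple.tree._neighbors import compute_forest_similarity_matrix
        >>> X, y = make_classification(n_samples=10, random_state=0)
        >>> forest = RandomForestClassifier(n_estimators=5, random_state=0).fit(X, y)
        >>> compute_forest_similarity_matrix(forest, X).shape
        (10, 10)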
65 | """ 66 | return compute_forest_similarity_matrix(self, X) 67 | -------------------------------------------------------------------------------- /treeple/stats/tests/test_permuteforest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from numpy.testing import assert_array_equal 4 | from sklearn import datasets 5 | 6 | from treeple.stats import PermutationHonestForestClassifier 7 | 8 | # load the iris dataset (n_samples, 4) 9 | # and randomly permute it 10 | iris = datasets.load_iris() 11 | seed = 12345 12 | rng = np.random.default_rng(seed) 13 | 14 | # remove third class 15 | iris_X = iris.data[iris.target != 2] 16 | iris_y = iris.target[iris.target != 2] 17 | 18 | p = rng.permutation(iris_X.shape[0]) 19 | iris_X = iris_X[p] 20 | iris_y = iris_y[p] 21 | 22 | 23 | def test_permutationforest_errors(): 24 | """Test permutation forest errors when training.""" 25 | n_samples = 10 26 | est = PermutationHonestForestClassifier(n_estimators=10, random_state=0) 27 | 28 | # covariate index must be an iterable 29 | with pytest.raises(RuntimeError, match="covariate_index must be an iterable"): 30 | est.fit(iris_X[:n_samples], iris_y[:n_samples], covariate_index=0) 31 | 32 | # covariate index must be an iterable of ints 33 | with pytest.raises(RuntimeError, match="Not all covariate_index"): 34 | est.fit(iris_X[:n_samples], iris_y[:n_samples], covariate_index=[0, 1.0]) 35 | 36 | # covariate index must not be longer than the number of features 37 | with pytest.raises(ValueError, match="The length of the covariate index"): 38 | est.fit( 39 | iris_X[:n_samples], 40 | iris_y[:n_samples], 41 | covariate_index=np.arange(iris_X.shape[1] + 1, dtype=np.intp), 42 | ) 43 | 44 | 45 | @pytest.mark.parametrize("permute_per_tree", [True, False]) 46 | def test_inbag_samples_different_across_forest(permute_per_tree): 47 | """Test that inbag samples are different across trees.""" 48 | n_estimators = 10 49 | est = PermutationHonestForestClassifier( 50 | n_estimators=n_estimators, random_state=0, permute_per_tree=permute_per_tree 51 | ) 52 | 53 | X = iris_X 54 | y = iris_y 55 | est.fit(X, y) 56 | 57 | # covariate index when None is all the features 58 | assert_array_equal(est.covariate_index_, np.arange(X.shape[1], dtype=np.intp)) 59 | 60 | # permuted (in-bag) sample indices should be different across trees 61 | permutation_samples_ = est.permutation_indices_ 62 | permutation_samples_ground = permutation_samples_[0] 63 | assert not all( 64 | np.array_equal(permutation_samples_ground, permutation_samples_[idx]) 65 | for idx in range(1, n_estimators) 66 | ) 67 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 24.8.0 4 | hooks: 5 | - id: black 6 | args: [--quiet] 7 | 8 | - repo: https://github.com/pycqa/isort 9 | rev: 5.13.2 10 | hooks: 11 | - id: isort 12 | name: isort (python) 13 | - id: isort 14 | name: isort (cython) 15 | types: [cython] 16 | 17 | - repo: https://github.com/MarcoGorelli/cython-lint 18 | rev: v0.16.2 19 | hooks: 20 | - id: cython-lint 21 | - id: double-quote-cython-strings 22 | 23 | # Ruff treeple 24 | - repo: https://github.com/astral-sh/ruff-pre-commit 25 | rev: v0.6.9 26 | hooks: 27 | - id: ruff 28 | name: ruff treeple 29 | args: ["--fix"] 30 | files: ^treeple/ 31 | 32 | # Ruff tutorials and examples 33 | - repo:
https://github.com/astral-sh/ruff-pre-commit 34 | rev: v0.6.9 35 | hooks: 36 | - id: ruff 37 | name: ruff tutorials and examples 38 | # D103: missing docstring in public function 39 | # D400: docstring first line must end with period 40 | args: ["--ignore=D103,D400", "--fix"] 41 | files: ^tutorials/|^examples/ 42 | 43 | # Codespell 44 | - repo: https://github.com/codespell-project/codespell 45 | rev: v2.3.0 46 | hooks: 47 | - id: codespell 48 | additional_dependencies: 49 | - tomli 50 | files: ^treeple/|^doc/|^examples/|^tutorials/ 51 | types_or: [python, bib, rst, inc] 52 | 53 | # yamllint 54 | - repo: https://github.com/adrienverge/yamllint.git 55 | rev: v1.35.1 56 | hooks: 57 | - id: yamllint 58 | args: [--strict, -c, .yamllint.yml] 59 | 60 | # toml-sort 61 | - repo: https://github.com/pappasam/toml-sort 62 | rev: v0.23.1 63 | hooks: 64 | - id: toml-sort 65 | files: ^pyproject\.toml$ 66 | args: ['-i'] 67 | 68 | # mypy 69 | - repo: https://github.com/pre-commit/mirrors-mypy 70 | rev: v1.11.2 71 | hooks: 72 | - id: mypy 73 | # Avoid the conflict between mne/__init__.py and mne/__init__.pyi by ignoring the former 74 | exclude: ^(benchmarks_nonasv|examples|benchmarks|.spin)/.*$ 75 | additional_dependencies: ["numpy==1.26.2"] 76 | 77 | # rstcheck 78 | - repo: https://github.com/rstcheck/rstcheck.git 79 | rev: v6.2.4 80 | hooks: 81 | - id: rstcheck 82 | additional_dependencies: 83 | - tomli 84 | files: ^(?!doc/use\.rst$).*\.(rst|inc)$ 85 | 86 | ci: 87 | autofix_prs: true 88 | -------------------------------------------------------------------------------- /doc/whats_new/v0.6.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | .. include:: _contributors.rst 4 | .. currentmodule:: treeple 5 | 6 | .. _v0_6: 7 | 8 | Version 0.6 9 | =========== 10 | 11 | This release includes an enhancement mainly in the MultiViewDecisionTreeClassifier 12 | and HonestForestClassifier, and a new generative model for the make_trunk_classification. 13 | 14 | Changelog 15 | --------- 16 | 17 | - |Enhancement| :class:`treeple.tree.MultiViewDecisionTreeClassifier` now 18 | rounds up the number of features to split on to the nearest integer when 19 | applying ``max_features`` to each feature view, by `Adam Li`_ (:pr:`#183`). 20 | - |Feature| :class:`treeple.tree.MultiViewDecisionTreeClassifier` now 21 | supports an array passed in for ``max_features``, which applies a different 22 | max_features argument per view, by `Adam Li`_ (:pr:`#183`). 23 | - |Fix| :class:`treeple.tree.MultiViewDecisionTreeClassifier` now correctly 24 | handles the case where there is one feature view that is exhausted, and 25 | another that is not for ``apply_max_features_per_feature_set = False``, 26 | by `Adam Li`_ (:pr:`#183`). 27 | - |Fix| ``treeple.stats.FeatureImportanceForestClassifier`` now correctly passes 28 | metric kwargs to the null distribution function, by `Adam Li`_ (:pr:`#183`). 29 | - |Enhancement| :func:`treeple.datasets.make_trunk_classification` now 30 | has a generative model based on Trunk and banded covariance, :func:`treeple.datasets.approximate_clf_mutual_information` and 31 | :func:`treeple.datasets.approximate_clf_mutual_information_with_monte_carlo` to 32 | approximate mutual information either numerically or via Monte-Carlo, by `Adam Li`_ and `Haoyin Xu`_ (:pr:`#199`). 
33 | - |Enhancement| :class:`treeple.HonestForestClassifier` now has a fitted 34 | property ``oob_samples_``, which reproduces the sample indices per tree that is out 35 | of bag, by `Adam Li`_ (:pr:`#200`). 36 | - |Enhancement| :class:`treeple.HonestForestClassifier` will allow one to bootstrap sample higher 37 | than the number of samples, controlled by the ``max_samples`` keyword argument by `Adam Li`_ (:pr:`#206`). 38 | - |Feature| :class:`treeple.HonestForestClassifier` now allows one to specify 39 | the number of sub-samples to use for the honest trees without having 40 | to bootstrap sample. This is specified by the ``max_samples`` parameter. 41 | By `Adam Li`_ (:pr:`#210`) 42 | 43 | Code and Documentation Contributors 44 | ----------------------------------- 45 | 46 | Thanks to everyone who has contributed to the maintenance and improvement of 47 | the project since version inception, including: 48 | 49 | * `Adam Li`_ 50 | * `Haoyin Xu`_ 51 | -------------------------------------------------------------------------------- /meson.build: -------------------------------------------------------------------------------- 1 | project( 2 | 'treeple', 3 | 'c', 'cpp', 'cython', 4 | # Note that the git commit hash cannot be added dynamically here 5 | # That only happens when importing from a git repository. 6 | # See `treeple/__init__.py` 7 | version: '0.10.3', 8 | license: 'PolyForm Noncommercial 1.0.0', 9 | meson_version: '>= 1.1.0', 10 | default_options: [ 11 | 'c_std=c11', 12 | 'cpp_std=c++14', 13 | ], 14 | ) 15 | 16 | cc = meson.get_compiler('c') 17 | cpp = meson.get_compiler('cpp') 18 | 19 | # Check compiler is recent enough (see "Toolchain Roadmap" for details) 20 | if cc.get_id() == 'gcc' 21 | if not cc.version().version_compare('>=8.0') 22 | error('treeple requires GCC >= 8.0') 23 | endif 24 | elif cc.get_id() == 'msvc' 25 | if not cc.version().version_compare('>=19.20') 26 | error('treeple requires at least vc142 (default with Visual Studio 2019) ' + \ 27 | 'when building with MSVC') 28 | endif 29 | endif 30 | 31 | # Suppress warning for deprecated Numpy API. 32 | # Replace with numpy_nodepr_api after Cython 3.0 is out 33 | # '-Wno-maybe-uninitialized' 34 | # numpy_nodepr_api = '-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION' 35 | 36 | # (Suppress warning messages emitted by #warning directives). 37 | _global_c_args = cc.get_supported_arguments( 38 | '-Wno-unused-but-set-variable', 39 | '-Wno-unused-function', 40 | '-Wno-conversion', 41 | '-Wno-misleading-indentation', 42 | '-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION', 43 | ) 44 | add_project_arguments(_global_c_args, language : 'c') 45 | 46 | # We need -lm for all C code (assuming it uses math functions, which is safe to 47 | # assume for treeple). For C++ it isn't needed, because libstdc++/libc++ is 48 | # guaranteed to depend on it. For Fortran code, Meson already adds `-lm`. 49 | m_dep = cc.find_library('m', required : false) 50 | if m_dep.found() 51 | add_project_link_arguments('-lm', language : 'c') 52 | endif 53 | 54 | cython = find_program( 55 | 'cython', 56 | required: true 57 | ) 58 | if not cython.found() 59 | error('MESON_BUILD_FAILED: Cython3 not found. 
Please install it.') 60 | endif 61 | 62 | # r = run_command('git', 'submodule', 'update', '--init', check: false) 63 | r = run_command('mv', 'treeple/_lib/sklearn_fork/sklearn', 'treeple/_lib/sklearn', check: false) 64 | 65 | # Setup Python: 66 | # https://mesonbuild.com/Python-module.html 67 | py = import('python').find_installation(pure: false) 68 | 69 | # print some debugging output 70 | message(py.full_path()) 71 | message(py.get_install_dir()) 72 | if py.language_version().version_compare('<3.9') 73 | error('At least Python 3.9 is required.') 74 | endif 75 | 76 | subdir('treeple') 77 | -------------------------------------------------------------------------------- /treeple/tree/unsupervised/_unsup_criterion.pxd: -------------------------------------------------------------------------------- 1 | # cython: boundscheck=False 2 | # cython: wraparound=False 3 | # cython: language_level=3 4 | 5 | from ..._lib.sklearn.tree._criterion cimport BaseCriterion 6 | from ..._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int32_t, intp_t 7 | 8 | # Note: This class is an exact copy of scikit-learn's Criterion 9 | # class, with the exception of the type of the internal structure. 10 | # In scikit-learn, they store a buffer for the y-labels, whereas here 11 | # we store a buffer for the X dataset. 12 | # 13 | # In our criterions, we do not store the 'y-labels' because there are none 14 | # in unsupervised learning. We instead store a memview of the dataset 'X'. 15 | 16 | 17 | cdef class UnsupervisedCriterion(BaseCriterion): 18 | """Abstract unsupervised criterion. 19 | 20 | Notable Changes 21 | --------------- 22 | 1. weighted_n_* : This parameter keeps track of the total "weight" of the samples 23 | in the node, left and right 24 | """ 25 | 26 | # The criterion computes the impurity of a node and the reduction of 27 | # impurity of a split on that node. It also computes the output statistics. 28 | 29 | # Internal structures 30 | cdef const float32_t[:] feature_values # 1D memview for the feature vector to compute criterion on 31 | 32 | # Keep running total of Xf[samples[start:end]] and the corresponding sum in 33 | # the left and right node. For example, this can then efficiently compute the 34 | # mean of the node, and left/right child by subtracting relevant Xf elements 35 | # and then dividing by the total number of samples in the node and left/right child. 36 | cdef float64_t sum_total # The sum of the weighted count of each feature. 37 | cdef float64_t sum_left # Same as above, but for the left side of the split 38 | cdef float64_t sum_right # Same as above, but for the right side of the split 39 | 40 | cdef float64_t sumsq_total # The weighted sum of squares of each feature. 41 | cdef float64_t sumsq_left # Same as above, but for the left side of the split 42 | cdef float64_t sumsq_right # Same as above, but for the right side of the split 43 | 44 | # Methods 45 | # ------- 46 | # The 'init' method is copied here with almost the exact same signature 47 | # as that of the supervised learning criterion in scikit-learn to ensure that 48 | # Unsupervised criterion can be used with splitter and tree methods.
49 | cdef intp_t init( 50 | self, 51 | const float32_t[:] feature_values, 52 | const float64_t[:] sample_weight, 53 | float64_t weighted_n_samples, 54 | const intp_t[:] samples, 55 | ) except -1 nogil 56 | 57 | cdef void init_feature_vec( 58 | self 59 | ) noexcept nogil 60 | 61 | cdef void set_sample_pointers( 62 | self, 63 | intp_t start, 64 | intp_t end 65 | ) noexcept nogil 66 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: 'bug' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 14 | 15 | ## Checklist 16 | 17 | 18 | 19 | - [ ] I have verified that the issue exists against the `main` branch. 20 | - [ ] I have read the relevant section in the [contribution guide](https://github.com/py-why/pywhy-graphs/blob/main/CONTRIBUTING.md#bug-reports-and-feature-requests) on reporting bugs. 21 | - [ ] I have checked the [issues list](https://github.com/py-why/pywhy-graphs/issues) for similar or identical bug reports. 22 | - [ ] I have checked the [pull requests list](https://github.com/py-why/pywhy-graphs/pulls) for existing proposed fixes. 23 | - [ ] I have checked the [CHANGELOG](https://github.com/py-why/pywhy-graphs/blob/main/CHANGELOG.md) and the [commit log](https://github.com/py-why/pywhy-graphs/commits/main) to find out if the bug was already fixed in the main branch. 24 | - [ ] I have included in the "Description" section below a traceback from any exceptions related to this bug. 25 | - [ ] I have included in the "Related issues or possible duplicates" section below all related issues and possible duplicate issues (if there are none, check this box anyway). 26 | - [ ] I have included in the "Environment" section below the name of the operating system and Python version that I was using when I discovered this bug. 27 | - [ ] I have included in the "Environment" section below the output of `pip freeze`. 28 | - [ ] I have included in the "Steps to reproduce" section below a minimally reproducible example. 29 | 30 | 31 | ## Description 32 | 33 | 34 | 35 | <details>
36 | <summary>Python traceback:</summary> 37 | <p> 38 | 39 | 40 | ``` 41 | ``` 42 | 43 | </p> 44 | </details> 45 | 46 | 47 | ## Related issues or possible duplicates 48 | 49 | - None 50 | 51 | 52 | ## Environment 53 | 54 | 55 | OS: 56 | 57 | 58 | Python version: 59 | 60 | <details> 61 | <summary>Output of pip freeze:</summary> 62 | <p> 63 | 64 | 65 | ``` 66 | ``` 67 | 68 | </p> 69 | </details> 70 | 71 | 72 | ## Steps to reproduce 73 | 74 | 75 | <details> 76 | <summary>Example source:</summary> 77 | <p> 78 | 79 | 80 | ``` 81 | ``` 82 | 83 | </p> 84 | </details>
85 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # simple makefile to simplify repetitive build env management tasks under posix 2 | 3 | # caution: testing won't work on windows, see README 4 | 5 | PYTHON ?= python 6 | PYTESTS ?= pytest 7 | CTAGS ?= ctags 8 | CODESPELL_SKIPS ?= "*.fif,*.eve,*.gz,*.tgz,*.zip,*.mat,*.stc,*.label,*.w,*.bz2,*.annot,*.sulc,*.log,*.local-copy,*.orig_avg,*.inflated_avg,*.gii,*.pyc,*.doctree,*.pickle,*.inv,*.png,*.edf,*.touch,*.thickness,*.nofix,*.volume,*.defect_borders,*.mgh,lh.*,rh.*,COR-*,FreeSurferColorLUT.txt,*.examples,.xdebug_mris_calc,bad.segments,BadChannels,*.hist,empty_file,*.orig,*.js,*.map,*.ipynb,searchindex.dat,plot_*.rst,*.rst.txt,*.html,gdf_encodes.txt,treeple/_lib/*,doc/auto_examples/*" 9 | CODESPELL_DIRS ?= treeple/ doc/ examples/ benchmarks/ 10 | all: clean inplace test test-doc 11 | 12 | clean-pyc: 13 | find . -name "*.pyc" | xargs rm -f 14 | 15 | clean-build: 16 | rm -rf build 17 | rm -rf dist 18 | 19 | clean-cache: 20 | find . -name "__pycache__" | xargs rm -rf 21 | 22 | clean: clean-build clean-pyc clean-cache 23 | 24 | pytest: test 25 | 26 | test: in 27 | rm -f .coverage 28 | $(PYTESTS) treeple 29 | 30 | test-doc: sample_data testing_data 31 | $(PYTESTS) --doctest-modules --doctest-ignore-import-errors --doctest-glob='*.rst' ./doc/ 32 | 33 | flake: 34 | @if command -v flake8 > /dev/null; then \ 35 | echo "Running flake8"; \ 36 | flake8 --count treeple examples; \ 37 | else \ 38 | echo "flake8 not found, please install it!"; \ 39 | exit 1; \ 40 | fi; 41 | @echo "flake8 passed" 42 | 43 | black: 44 | @if command -v black > /dev/null; then \ 45 | echo "Running black"; \ 46 | black treeple examples; \ 47 | else \ 48 | echo "black not found, please install it!"; \ 49 | exit 1; \ 50 | fi; 51 | @echo "black passed" 52 | 53 | isort: 54 | @if command -v isort > /dev/null; then \ 55 | echo "Running isort"; \ 56 | isort treeple examples doc; \ 57 | else \ 58 | echo "isort not found, please install it!"; \ 59 | exit 1; \ 60 | fi; 61 | @echo "isort passed" 62 | 63 | codespell: # running manually 64 | @codespell -w -i 3 -q 3 -S $(CODESPELL_SKIPS) --ignore-words=.codespellignore $(CODESPELL_DIRS) 65 | 66 | codespell-error: # running on travis 67 | @codespell -i 0 -q 7 -S $(CODESPELL_SKIPS) --ignore-words=.codespellignore $(CODESPELL_DIRS) 68 | 69 | pydocstyle: 70 | @echo "Running pydocstyle" 71 | @pydocstyle mne 72 | 73 | docstyle: pydocstyle 74 | 75 | build-doc: 76 | @echo "Building documentation" 77 | make -C doc/ clean 78 | make -C doc/ html 79 | cd doc/ && make view 80 | 81 | build-doc-noplot: 82 | @echo "Building documentation" 83 | make -C doc/ clean 84 | make -C doc/ html-noplot 85 | cd doc/ && make view 86 | 87 | run-checks: 88 | isort --check . 89 | black --check treeple examples 90 | flake8 . 91 | mypy ./treeple 92 | @$(MAKE) pydocstyle 93 | @$(MAKE) codespell-error 94 | ruff . 95 | toml-sort ./pyproject.toml --check 96 | yamllint . 
-c .yamllint.yml --strict 97 | 98 | pre-commit: 99 | @pre-commit run -a -------------------------------------------------------------------------------- /doc/sphinxext/github_link.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import os 3 | import subprocess 4 | import sys 5 | from functools import partial 6 | from operator import attrgetter 7 | 8 | REVISION_CMD = "git rev-parse --short HEAD" 9 | 10 | 11 | def _get_git_revision(): 12 | try: 13 | revision = subprocess.check_output(REVISION_CMD.split()).strip() 14 | except (subprocess.CalledProcessError, OSError): 15 | print("Failed to execute git to get revision") 16 | return None 17 | return revision.decode("utf-8") 18 | 19 | 20 | def _linkcode_resolve(domain, info, package, url_fmt, revision): 21 | """Determine a link to online source for a class/method/function 22 | 23 | This is called by sphinx.ext.linkcode 24 | 25 | An example with a long-untouched module that everyone has 26 | >>> _linkcode_resolve('py', {'module': 'tty', 27 | ... 'fullname': 'setraw'}, 28 | ... package='tty', 29 | ... url_fmt='http://hg.python.org/cpython/file/' 30 | ... '{revision}/Lib/{package}/{path}#L{lineno}', 31 | ... revision='xxxx') 32 | 'http://hg.python.org/cpython/file/xxxx/Lib/tty/tty.py#L18' 33 | """ 34 | 35 | if revision is None: 36 | return 37 | if domain not in ("py", "pyx"): 38 | return 39 | if not info.get("module") or not info.get("fullname"): 40 | return 41 | 42 | class_name = info["fullname"].split(".")[0] 43 | module = __import__(info["module"], fromlist=[class_name]) 44 | obj = attrgetter(info["fullname"])(module) 45 | 46 | # Unwrap the object to get the correct source 47 | # file in case that is wrapped by a decorator 48 | obj = inspect.unwrap(obj) 49 | 50 | try: 51 | fn = inspect.getsourcefile(obj) 52 | except Exception: 53 | fn = None 54 | if not fn: 55 | try: 56 | fn = inspect.getsourcefile(sys.modules[obj.__module__]) 57 | except Exception: 58 | fn = None 59 | if not fn: 60 | return 61 | 62 | fn = os.path.relpath(fn, start=os.path.dirname(__import__(package).__file__)) 63 | try: 64 | lineno = inspect.getsourcelines(obj)[1] 65 | except Exception: 66 | lineno = "" 67 | return url_fmt.format(revision=revision, package=package, path=fn, lineno=lineno) 68 | 69 | 70 | def make_linkcode_resolve(package, url_fmt): 71 | """Returns a linkcode_resolve function for the given URL format 72 | 73 | revision is a git commit reference (hash or name) 74 | 75 | package is the name of the root module of the package 76 | 77 | url_fmt is along the lines of ('https://github.com/USER/PROJECT/' 78 | 'blob/{revision}/{package}/' 79 | '{path}#L{lineno}') 80 | """ 81 | revision = _get_git_revision() 82 | return partial(_linkcode_resolve, revision=revision, package=package, url_fmt=url_fmt) 83 | -------------------------------------------------------------------------------- /doc/install.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | Installation 4 | ============ 5 | 6 | Dependencies 7 | ------------ 8 | 9 | * ``numpy`` (>=1.23) 10 | * ``scipy`` (>=1.5.0) 11 | * ``scikit-learn`` (>=1.3) 12 | * ``joblib`` (>=1.0.0) 13 | * ``matplotlib`` (optional) 14 | 15 | **treeple** supports Python >= 3.9. 16 | 17 | Installing with ``pip`` 18 | ----------------------- 19 | 20 | **treeple** is available on `PyPI `_. Just run 21 | 22 | .. 
code-block:: bash 23 | 24 | pip install treeple 25 | 26 | Installing from source with Meson 27 | --------------------------------- 28 | 29 | To install **treeple** from source, first clone the `repository <https://github.com/neurodata/treeple>`_: 30 | 31 | .. code-block:: bash 32 | 33 | git clone https://github.com/neurodata/treeple.git 34 | cd treeple 35 | 36 | # ideally, you should always start within a virtual environment 37 | conda create -n sklearn-dev python=3.9 38 | conda activate sklearn-dev 39 | 40 | Then run the installation of build packages: 41 | 42 | .. code-block:: bash 43 | 44 | pip install -r build_requirements.txt 45 | pip install spin 46 | 47 | # use spin CLI to run Meson build locally 48 | ./spin build -j 2 49 | 50 | # you can now run tests 51 | ./spin test 52 | 53 | Via pip, you will be able to install in editable mode (pending Meson-Python support). 54 | 55 | .. code-block:: bash 56 | 57 | pip install -e . 58 | 59 | # if editing Cython files 60 | pip install --verbose --no-build-isolation --editable . 61 | 62 | .. code-block:: bash 63 | 64 | pip install --user -U https://api.github.com/repos/neurodata/treeple/zipball/master 65 | 66 | Conda (Recommended) 67 | ------------------- 68 | First, create a virtual environment using Conda. 69 | 70 | conda create -n sklearn-dev python=3.9 71 | 72 | # activate the virtual environment and install necessary packages to build from source 73 | 74 | conda activate sklearn-dev 75 | conda install -c conda-forge numpy scipy cython joblib threadpoolctl pytest compilers llvm-openmp 76 | 77 | Next, install `treeple` from source: 78 | 79 | pip install .[build] 80 | 81 | # if editing Cython files 82 | pip install --verbose --no-build-isolation --editable . 83 | 84 | To install the package from GitHub, clone the repository and then `cd` into the directory: 85 | 86 | ./spin build 87 | 88 | # if you would like an editable install of treeple for dev purposes 89 | pip install --verbose --no-build-isolation --editable . 90 | 91 | pip install https://api.github.com/repos/neurodata/treeple/zipball/main 92 | 93 | 94 | pip install https://api.github.com/repos/neurodata/scikit-learn/zipball/obliquepr 95 | 96 | Note that currently, we need to build the development version of scikit-learn with oblique trees within this `PR `_. 97 | 98 | Check out this PR code, and build from source, using scikit-learn's build from source page instructions. 99 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | **treeple** 2 | =================== 3 | treeple is a package for modern tree-based algorithms for supervised and unsupervised 4 | learning problems. It extends the robust API of `scikit-learn <https://scikit-learn.org/>`_ 5 | for tree algorithms that achieve strong performance in benchmark tasks. 6 | 7 | Our package has implemented unsupervised forests (Geodesic Forests 8 | [Madhyastha2020]_), oblique random forests (SPORF [Tomita2020]_, manifold random forests, 9 | MORF [Li2023]_), honest forests [Perry2021]_, extended isolation forests [Hariri2019]_, and more. 10 | 11 | For all forests, we also support incremental building of the forests, using the 12 | ``partial_fit`` API from scikit-learn [Xu2022]_, and quantile regression by storing 13 | the training samples in the leaves of the trees [Meinshausen2006]_ (Warning: high memory usage 14 | will occur in this setting since predicting quantiles stores the training data within the 15 | leaves of the tree).
16 | 17 | We encourage you to use the package for your research and also build on top 18 | with relevant Pull Requests. See our examples for walk-throughs of how to use the package. 19 | Also, see our `contributing guide `_. 20 | 21 | We are licensed under the PolyForm Noncommercial License (see `License `_). 22 | 23 | .. topic:: References 24 | 25 | .. [Hariri2019] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner. 26 | "Extended isolation forest." IEEE transactions on knowledge and data 27 | engineering 33.4 (2019): 1479-1489. 28 | 29 | .. [Meinshausen2006] Meinshausen, Nicolai, and Greg Ridgeway. "Quantile regression forests." 30 | Journal of machine learning research 7.6 (2006). 31 | 32 | .. [Madhyastha2020] Madhyastha, Meghana, et al. :doi:`"Geodesic Forests" 33 | <10.1145/3394486.3403094>`, KDD 2020, 513-523, 2020. 34 | 35 | .. [Tomita2020] Tomita, Tyler M., et al. "Sparse Projection Oblique 36 | Randomer Forests", The Journal of Machine Learning Research, 21(104), 37 | 1-39, 2020. 38 | 39 | .. [Li2023] Li, Adam, et al. :doi:`"Manifold Oblique Random Forests: Towards 40 | Closing the Gap on Convolutional Deep Networks" <10.1137/21M1449117>`, 41 | SIAM Journal on Mathematics of Data Science, 5(1), 77-96, 2023. 42 | 43 | .. [Perry2021] Perry, Ronan, et al. :arxiv:`"Random Forests for Adaptive 44 | Nearest Neighbor Estimation of Information-Theoretic Quantities" 45 | <1907.00325>`, arXiv preprint arXiv:1907.00325, 2021. 46 | 47 | .. [Xu2022] Xu, Haoyin, et al. :arxiv:`"Simplest Streaming Trees" 48 | <2110.08483>`, arXiv preprint arXiv:2110.08483, 2022. 49 | 50 | Contents 51 | -------- 52 | 53 | .. toctree:: 54 | :maxdepth: 2 55 | :caption: Getting started: 56 | 57 | api 58 | User Guide 59 | whats_new 60 | install 61 | use 62 | 63 | Indices and tables 64 | ------------------ 65 | 66 | * :ref:`genindex` 67 | * :ref:`modindex` 68 | -------------------------------------------------------------------------------- /doc/modules/unsupervised_tree.rst: -------------------------------------------------------------------------------- 1 | .. _unsupervised_tree: 2 | 3 | =========================== 4 | Unsupervised Decision Trees 5 | =========================== 6 | 7 | .. currentmodule:: sklearn.tree 8 | 9 | In unsupervised learning, the goal is to identify patterns 10 | or structure in data without using labeled examples. Clustering is a common 11 | unsupervised learning technique that groups similar examples together 12 | based on their features. Unsupervised tree models are an adaptive way of generating 13 | clusters of samples. For information on supervised tree models, see :ref:`supervised_tree`. 14 | 15 | In this guide, we overview the :ref:`unsup_criterion` used for splitting unsupervised trees, 16 | and methods for evaluating the quality of the tree model in :ref:`unsup_evaluation`. 17 | 18 | .. _unsup_criterion: 19 | 20 | Unsupervised Criterion 21 | ---------------------- 22 | 23 | Unsupervised tree models use a variety of criteria to split nodes. 24 | 25 | Two-Means 26 | ~~~~~~~~~ 27 | 28 | The two-means split finds the cutpoint that minimizes the one-dimensional 29 | 2-means objective, i.e. the cutoff point for which the total within-cluster 30 | variance of the two resulting clusters is minimal. 31 | 32 | .. math:: 33 | \min_s \sum_{i=1}^s (x_i - \hat{\mu}_1)^2 + \sum_{i=s+1}^N (x_i - \hat{\mu}_2)^2 34 | 35 | where :math:`x` is the :math:`N`-dimensional vector of feature values (one per sample), :math:`N` is the number of samples, and 36 | the :math:`\hat{\mu}` terms are the estimated means of clusters 1 and 2.
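The following is a brute-force NumPy sketch of this objective (illustrative only; the actual splitter is implemented in Cython and evaluates candidate splits far more efficiently):

.. code-block:: python

    import numpy as np

    def best_two_means_split(x):
        """Scan all cutpoints of a sorted 1-D feature and return the best one."""
        x = np.sort(np.asarray(x, dtype=float))
        best_s, best_obj = None, np.inf
        for s in range(1, len(x)):
            left, right = x[:s], x[s:]
            # total within-cluster variance of the two sides of the cutpoint
            obj = ((left - left.mean()) ** 2).sum() + ((right - right.mean()) ** 2).sum()
            if obj < best_obj:
                best_s, best_obj = s, obj
        return best_s, best_obj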
37 | 38 | Fast-BIC 39 | ~~~~~~~~ 40 | 41 | The Bayesian Information Criterion (BIC) is a popular model selection 42 | criterion based on the log-likelihood of the model given the data. 43 | Fast-BIC :footcite:`Meghana2019_geodesicrf` is a method that combines the speed of the 44 | :class:`sklearn.cluster.KMeans` clustering method with the model flexibility 45 | of Mclust-BIC. It sorts data for each feature and tries all possible splits to 46 | assign data points to one of two Gaussian distributions based on their position 47 | relative to the split. 48 | The parameters for each cluster are estimated using maximum likelihood 49 | estimation (MLE). The method performs hard clustering rather than soft 50 | clustering as in a GMM, resulting in a simpler calculation of the likelihood. 51 | 52 | .. math:: 53 | 54 | \hat{L} = \sum_{n=1}^s[\log\hat{\pi}_1+\log{\mathcal{N}(x_n;\hat{\mu}_1,\hat{\sigma}_1^2)}] 55 | + \sum_{n=s+1}^N[\log\hat{\pi}_2+\log{\mathcal{N}(x_n;\hat{\mu}_2,\hat{\sigma}_2^2)}] 56 | 57 | where the prior, mean, and variance are defined as follows, respectively: 58 | 59 | .. math:: 60 | 61 | \hat{\pi} = \frac{s}{N},\quad\quad 62 | \hat{\mu} = \frac{1}{s}\sum_{n\le s}{x_n},\quad\quad 63 | \hat{\sigma}^2 = \frac{1}{s}\sum_{n\le s}{||x_n-\hat{\mu}||^2} 64 | 65 | .. _unsup_evaluation: 66 | 67 | Evaluating Unsupervised Trees 68 | ----------------------------- 69 | 70 | In clustering settings, there may be no natural 71 | notion of “true” class labels; thus, the efficacy of the clustering scheme is 72 | often measured with metrics such as :func:`sklearn.metrics.adjusted_rand_score`. 73 | 74 | .. topic:: References 75 | 76 | .. footbibliography:: 77 | -------------------------------------------------------------------------------- /doc/whats_new/v0.7.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | .. include:: _contributors.rst 4 | .. currentmodule:: treeple 5 | 6 | .. _v0_7: 7 | 8 | Version 0.7 9 | =========== 10 | 11 | This release adds the ability to separate in-bag and out-of-bag samples for 12 | any forest model. We also introduce a new class for fitting honest forests while 13 | permuting the covariate index, and a new set of simulations based on Marron and Wand 1992. 14 | 15 | In addition, various patches were made to how scikit-tree is used for hypothesis 16 | testing of feature sets. 17 | 18 | Changelog 19 | --------- 20 | 21 | - |Feature| Introduce a new lightweight class for fitting honest forests while 22 | permuting the covariate index :class:`treeple.stats.PermutationHonestForestClassifier`, 23 | by `Adam Li`_ (:pr:`#211`) 24 | - |Feature| Introduce a new class method ``predict_proba_per_tree`` for all 25 | Forest classifiers, which will predict the probability per tree and keep the 26 | output as a ``(n_estimators, n_samples, n_classes)`` output, 27 | by `Adam Li`_ (:pr:`#211`) 28 | - |Feature| Introduce a new class fitted attribute ``oob_samples_`` for all 29 | Forest models, which will keep track of the samples used, 30 | by `Adam Li`_ (:pr:`#211`) 31 | - |Feature| Introduce a new set of simulations based on Marron and Wand 1992.
32 | by `Sambit Panda`_ (:pr:`#203`) 33 | - |Feature| :func:`treeple.stats.build_coleman_forest` and :func:`treeple.stats.build_permutation_forest` 34 | are added to compute p-values given an estimator and permutation-estimator, by `Adam Li`_ (:pr:`#222`) 35 | - |API| :func:`treeple.datasets.make_trunk_classification` for generating trunk mixture and Marron-Wand 36 | simulations are separated out into :func:`treeple.datasets.make_marron_wand_classification` and 37 | :func:`treeple.datasets.make_trunk_mixture_classification`, by `Adam Li`_ (:pr:`#227`) 38 | - |API| :class:`treeple.HonestForestClassifier` and :class:`treeple.tree.HonestTreeClassifier` 39 | now overwrite all parameters set by the underlying ``tree_estimator`` and allow you to directly 40 | pass any extra parameters that ``tree_estimator`` has compared to the original 41 | :class:`~sklearn.tree.DecisionTreeClassifier`, by `Adam Li`_ (:pr:`#228`) 42 | - |Fix| Trunk simulators now correctly generate random values with a fixed seed, 43 | by `Sambit Panda`_ (:pr:`#236`) 44 | - |Efficiency| All scikit-tree estimators are now at least 2X faster than they were 45 | in previous versions. This was due to adding compiler directives to turn on 46 | '-O3' optimizations when compiling the C++ code generated from Cython. In addition, 47 | we explicitly turned off bounds-checking and related runtime checks in the Cython code, 48 | which would lead to performance degradation during runtime, by `Adam Li`_ (:pr:`#242`) 49 | 50 | Code and Documentation Contributors 51 | ----------------------------------- 52 | 53 | Thanks to everyone who has contributed to the maintenance and improvement of 54 | the project since version inception, including: 55 | 56 | * `Adam Li`_ 57 | * `Sambit Panda`_ 58 | -------------------------------------------------------------------------------- /treeple/tree/unsupervised/_unsup_tree.pxd: -------------------------------------------------------------------------------- 1 | # Authors: Adam Li 2 | # Jong Shin 3 | # 4 | 5 | # License: BSD 3 clause 6 | 7 | # See _unsup_tree.pyx for details. 8 | 9 | import numpy as np 10 | 11 | cimport numpy as cnp 12 | 13 | from ..._lib.sklearn.tree._splitter cimport SplitRecord 14 | from ..._lib.sklearn.tree._tree cimport BaseTree, Node, ParentInfo 15 | from ..._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t 16 | from ._unsup_splitter cimport UnsupervisedSplitter 17 | 18 | 19 | # TODO: copy changes from https://github.com/scikit-learn/scikit-learn/pull/25540/files 20 | cdef class UnsupervisedTree(BaseTree): 21 | # The Tree object is a binary tree structure constructed by the 22 | # TreeBuilder. The tree structure is used for predictions and 23 | # feature importances. 24 | # 25 | # Inner structures: values are stored separately from node structure, 26 | # since size is determined at runtime.
27 | # cdef float64_t* value # (capacity) array of values 28 | # cdef intp_t value_stride # = 1 29 | 30 | # Input/Output layout 31 | cdef public intp_t n_features # Number of features in X 32 | 33 | # Methods 34 | cdef cnp.ndarray _get_value_ndarray(self) 35 | cdef cnp.ndarray _get_node_ndarray(self) 36 | 37 | # Overridden Methods 38 | cdef int _set_split_node( 39 | self, 40 | SplitRecord* split_node, 41 | Node* node, 42 | intp_t node_id 43 | ) except -1 nogil 44 | cdef float32_t _compute_feature( 45 | self, 46 | const float32_t[:, :] X_ndarray, 47 | intp_t sample_index, 48 | Node *node 49 | ) noexcept nogil 50 | cdef void _compute_feature_importances( 51 | self, 52 | cnp.float64_t[:] importances, 53 | Node* node 54 | ) noexcept nogil 55 | 56 | # ============================================================================= 57 | # Tree builder 58 | # ============================================================================= 59 | 60 | cdef class UnsupervisedTreeBuilder: 61 | # The TreeBuilder recursively builds a Tree object from training samples, 62 | # using a Splitter object for splitting internal nodes and assigning 63 | # values to leaves. 64 | # 65 | # This class controls the various stopping criteria and the node splitting 66 | # evaluation order, e.g. depth-first or best-first. 67 | 68 | cdef UnsupervisedSplitter splitter # Splitting algorithm 69 | 70 | cdef intp_t min_samples_split # Minimum number of samples in an internal node 71 | cdef intp_t min_samples_leaf # Minimum number of samples in a leaf 72 | cdef float64_t min_weight_leaf # Minimum weight in a leaf 73 | cdef intp_t max_depth # Maximal tree depth 74 | cdef float64_t min_impurity_decrease # Impurity threshold for early stopping 75 | 76 | cpdef build( 77 | self, 78 | UnsupervisedTree tree, 79 | object X, 80 | const float64_t[:] sample_weight=* 81 | ) 82 | cdef _check_input( 83 | self, 84 | object X, 85 | const float64_t[:] sample_weight 86 | ) 87 | -------------------------------------------------------------------------------- /treeple/tree/tests/test_honest_prune.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from treeple.tree import HonestTreeClassifier 4 | 5 | 6 | def test_honest_tree_pruning(): 7 | """Test honest tree with pruning to ensure no empty leaves.""" 8 | rng = np.random.default_rng(1234) 9 | 10 | n_samples = 1000 11 | X = rng.standard_normal(size=(n_samples, 100)) 12 | X[n_samples // 2 :] *= -1 13 | y = [0] * (n_samples // 2) + [1] * (n_samples // 2) 14 | 15 | clf = HonestTreeClassifier(honest_method="prune", max_features="sqrt", random_state=0) 16 | clf = clf.fit(X, y) 17 | 18 | nonprune_clf = HonestTreeClassifier( 19 | honest_method="apply", max_features="sqrt", random_state=0, honest_prior="ignore" 20 | ) 21 | nonprune_clf = nonprune_clf.fit(X, y) 22 | 23 | assert ( 24 | nonprune_clf.tree_.max_depth >= clf.tree_.max_depth 25 | ), f"{nonprune_clf.tree_.max_depth} <= {clf.tree_.max_depth}" 26 | # assert np.all(clf.tree_.children_left != -1) 27 | 28 | # Access the original and pruned trees' attributes 29 | original_tree = nonprune_clf.tree_ 30 | pruned_tree = clf.tree_ 31 | 32 | # Ensure the pruned tree has fewer or equal nodes 33 | assert ( 34 | pruned_tree.node_count < original_tree.node_count 35 | ), "Pruned tree has more nodes than the original tree" 36 | 37 | # Ensure the pruned tree has no empty leaves 38 | assert np.all(pruned_tree.value.sum(axis=(1, 2)) > 0), pruned_tree.value.sum(axis=(1, 2)) 39 | # assert 
np.all(original_tree.value.sum(axis=(1,2)) > 0), original_tree.value.sum(axis=(1,2)) 40 | # unlike the unpruned tree, the pruned tree must have no empty leaves 41 | assert np.all(pruned_tree.value.sum(axis=(1, 2)) > 0) 42 | assert not np.all(original_tree.value.sum(axis=(1, 2)) > 0) 43 | 44 | # test that the first three nodes are the same, since these are unlikely to be 45 | # pruned, and should remain invariant. 46 | # 47 | # Note: pruning the tree will have the node_ids change since the tree is 48 | # ordered via DFS. 49 | for pruned_node_id in range(3): 50 | pruned_left_child = pruned_tree.children_left[pruned_node_id] 51 | pruned_right_child = pruned_tree.children_right[pruned_node_id] 52 | 53 | # Check if the pruned node exists in the original tree 54 | assert ( 55 | pruned_left_child in original_tree.children_left 56 | ), "Left child node of pruned tree not found in original tree" 57 | assert ( 58 | pruned_right_child in original_tree.children_right 59 | ), "Right child node of pruned tree not found in original tree" 60 | 61 | # Check if the node's parameters match for non-leaf nodes 62 | if pruned_left_child != -1: 63 | assert ( 64 | pruned_tree.feature[pruned_node_id] == original_tree.feature[pruned_node_id] 65 | ), "Feature does not match for node {}".format(pruned_node_id) 66 | assert ( 67 | pruned_tree.threshold[pruned_node_id] == original_tree.threshold[pruned_node_id] 68 | ), "Threshold does not match for node {}".format(pruned_node_id) 69 | assert ( 70 | pruned_tree.weighted_n_node_samples[pruned_node_id] 71 | == original_tree.weighted_n_node_samples[pruned_node_id] 72 | ), "Weighted n_node samples does not match for node {}".format(pruned_node_id) 73 | -------------------------------------------------------------------------------- /treeple/tests/test_neighbors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from sklearn.datasets import make_blobs, make_classification 4 | from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier 5 | from sklearn.neighbors import NearestNeighbors 6 | from sklearn.tree import ( 7 | DecisionTreeClassifier, 8 | DecisionTreeRegressor, 9 | ExtraTreeClassifier, 10 | ExtraTreeRegressor, 11 | ) 12 | from sklearn.utils.estimator_checks import parametrize_with_checks 13 | 14 | from treeple.ensemble import ( 15 | ObliqueRandomForestClassifier, 16 | PatchObliqueRandomForestClassifier, 17 | UnsupervisedObliqueRandomForest, 18 | UnsupervisedRandomForest, 19 | ) 20 | from treeple.neighbors import NearestNeighborsMetaEstimator 21 | 22 | FORESTS = [ 23 | ObliqueRandomForestClassifier, 24 | PatchObliqueRandomForestClassifier, 25 | UnsupervisedRandomForest, 26 | UnsupervisedObliqueRandomForest, 27 | ] 28 | 29 | 30 | @pytest.mark.parametrize("forest", FORESTS) 31 | def test_similarity_matrix(forest): 32 | n_samples = 200 33 | n_classes = 2 34 | n_features = 5 35 | 36 | X, y = make_blobs( 37 | n_samples=n_samples, centers=n_classes, n_features=n_features, random_state=12345 38 | ) 39 | 40 | clf = forest(random_state=12345) 41 | clf.fit(X, y) 42 | sim_mat = clf.compute_similarity_matrix(X) 43 | 44 | assert sim_mat.shape == (n_samples, n_samples) 45 | assert np.allclose(sim_mat, sim_mat.T) 46 | assert np.all((sim_mat.diagonal() == 1)) 47 | 48 | 49 | @pytest.fixture 50 | def sample_data(): 51 | # Generate sample data for testing 52 | X, y = make_classification(n_samples=100, n_features=10, random_state=42) 53 | return X, y 54 | 55 | 56 | @pytest.mark.parametrize( 57 | "estimator", 58 | [ 59 | DecisionTreeClassifier(random_state=0), 60 | 
DecisionTreeRegressor(random_state=0), 61 | ExtraTreeClassifier(random_state=0), 62 | ExtraTreeRegressor(random_state=0), 63 | RandomForestClassifier(random_state=0, n_estimators=10), 64 | ExtraTreesClassifier(random_state=0, n_estimators=10), 65 | ], 66 | ) 67 | def test_nearest_neighbors_meta_estimator(sample_data, estimator): 68 | X, y = sample_data 69 | estimator.fit(X, y) 70 | 71 | meta_estimator = NearestNeighborsMetaEstimator(estimator) 72 | 73 | # Fit the meta-estimator 74 | meta_estimator.fit(X, y) 75 | 76 | # Test the fitted estimator attribute 77 | assert hasattr(meta_estimator, "estimator_") 78 | 79 | # Test the nearest neighbors estimator 80 | assert isinstance(meta_estimator.neigh_est_, NearestNeighbors) 81 | 82 | # Test the kneighbors method 83 | neigh_dist, neigh_ind = meta_estimator.kneighbors() 84 | assert neigh_dist.shape == (X.shape[0], meta_estimator.n_neighbors) 85 | assert neigh_ind.shape == (X.shape[0], meta_estimator.n_neighbors) 86 | 87 | # Test the radius_neighbors method 88 | neigh_dist, neigh_ind = meta_estimator.radius_neighbors(radius=0.5) 89 | assert neigh_dist.shape == (X.shape[0],) 90 | assert neigh_ind.shape == (X.shape[0],) 91 | 92 | 93 | @parametrize_with_checks( 94 | [ 95 | NearestNeighborsMetaEstimator(DecisionTreeClassifier(random_state=0)), 96 | ] 97 | ) 98 | def test_sklearn_compatible_transformer(estimator, check): 99 | check(estimator) 100 | -------------------------------------------------------------------------------- /examples/treeple/treeple_tutorial_1_1d_HD.py: -------------------------------------------------------------------------------- 1 | """ 2 | ============================== 3 | Calculating Hellinger Distance 4 | ============================== 5 | """ 6 | 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | import seaborn as sns 10 | 11 | from treeple.datasets import make_trunk_classification 12 | from treeple.ensemble import HonestForestClassifier 13 | from treeple.stats import build_oob_forest 14 | 15 | sns.set(color_codes=True, style="white", context="talk", font_scale=1.5) 16 | PALETTE = sns.color_palette("Set1") 17 | sns.set_palette(PALETTE[1:5] + PALETTE[6:], n_colors=9) 18 | sns.set_style("white", {"axes.edgecolor": "#dddddd"}) 19 | 20 | # %% 21 | # Hellinger Distance 22 | # ------------------ 23 | # 24 | # Hellinger distance quantifies the similarity between the two posterior 25 | # probability distributions (class zero and class one). 26 | # 27 | # .. math:: H(\eta(X), 1-\eta(X)) = \frac{1}{\sqrt{2}} \; \bigl\|\sqrt{\eta(X)} - \sqrt{1-\eta(X)} \bigr\|_2 28 | # 29 | # With a binary class simulation as an example, this tutorial will show 30 | # how to use ``treeple`` to calculate the statistic. 
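# As a quick sanity check of the formula on hypothetical posteriors (these
# values are illustrative and not part of the simulation below): chance-level
# posteriors give a distance of zero, while perfectly separated posteriors
# give the maximal value for two samples.

chance_proba = np.array([[0.5, 0.5], [0.5, 0.5]])
perfect_proba = np.array([[1.0, 0.0], [0.0, 1.0]])
for proba in (chance_proba, perfect_proba):
    hd = np.sqrt(np.sum((np.sqrt(proba[:, 1]) - np.sqrt(proba[:, 0])) ** 2)) / np.sqrt(2)
    print(hd)  # prints 0.0 for chance posteriors, then 1.0 for perfect separation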
31 | 32 | # %% 33 | # Create a simulation with two gaussians 34 | # -------------------------------------- 35 | 36 | 37 | # create a binary class simulation with two gaussians 38 | # 500 samples for each class, class zero is standard 39 | # gaussian, and class one has a mean at one 40 | X, y = make_trunk_classification( 41 | n_samples=1000, 42 | n_dim=1, 43 | mu_0=0, 44 | mu_1=1, 45 | n_informative=1, 46 | seed=1, 47 | ) 48 | 49 | 50 | fig, ax = plt.subplots(figsize=(6, 6)) 51 | fig.tight_layout() 52 | ax.tick_params(labelsize=15) 53 | 54 | # histogram plot the samples 55 | ax.hist(X[:500], bins=50, alpha=0.6, color=PALETTE[1], label="negative") 56 | ax.hist(X[500:], bins=50, alpha=0.3, color=PALETTE[0], label="positive") 57 | ax.set_xlabel("Variable One", fontsize=15) 58 | ax.set_ylabel("Likelihood", fontsize=15) 59 | plt.legend(frameon=False, fontsize=15) 60 | plt.show() 61 | 62 | # %% 63 | # Fit the model 64 | # ------------- 65 | 66 | 67 | # initialize the forest with 100 trees 68 | est = HonestForestClassifier( 69 | n_estimators=100, 70 | max_samples=1.6, 71 | max_features=0.3, 72 | bootstrap=True, 73 | stratify=True, 74 | random_state=1, 75 | ) 76 | 77 | # fit the model and obtain the tree posteriors 78 | _, observe_proba = build_oob_forest(est, X, y) 79 | 80 | # generate forest posteriors for the two classes 81 | observe_proba = np.nanmean(observe_proba, axis=0) 82 | 83 | 84 | fig, ax = plt.subplots(figsize=(6, 6)) 85 | fig.tight_layout() 86 | ax.tick_params(labelsize=15) 87 | 88 | # histogram plot the posterior probabilities for class one 89 | ax.hist(observe_proba[:500][:, 1], bins=50, alpha=0.6, color=PALETTE[1], label="negative") 90 | ax.hist(observe_proba[500:][:, 1], bins=50, alpha=0.3, color=PALETTE[0], label="positive") 91 | ax.set_ylabel("# of Samples", fontsize=15) 92 | ax.set_xlabel("Class One Posterior", fontsize=15) 93 | plt.legend(frameon=False, fontsize=15) 94 | plt.show() 95 | 96 | # %% 97 | # Calculate the statistic 98 | # ----------------------- 99 | 100 | 101 | def Calculate_hd(y_pred_proba) -> float: 102 | return np.sqrt( 103 | np.sum((np.sqrt(y_pred_proba[:, 1]) - np.sqrt(y_pred_proba[:, 0])) ** 2) 104 | ) / np.sqrt(2) 105 | 106 | 107 | hd = Calculate_hd(observe_proba) 108 | print("Hellinger distance =", round(hd, 2)) 109 | -------------------------------------------------------------------------------- /examples/sklearn_vs_treeple/plot_iris_dtc.py: -------------------------------------------------------------------------------- 1 | """ 2 | ======================================================================= 3 | Plot the decision surface of decision trees trained on the iris dataset 4 | ======================================================================= 5 | 6 | Plot the decision surface of a decision tree and oblique decision tree 7 | trained on pairs of features of the iris dataset. 8 | 9 | See :ref:`decision tree ` for more information on the estimators. 10 | 11 | For each pair of iris features, the decision tree learns axis-aligned decision 12 | boundaries made of combinations of simple thresholding rules inferred from 13 | the training samples. The oblique decision tree learns oblique decision boundaries 14 | made from linear combinations of the features in the training samples and then 15 | the same thresholding rule as regular decision trees. 16 | 17 | We also show the tree structure of a model built on all of the features. 
18 | """ 19 | 20 | import matplotlib.pyplot as plt 21 | import numpy as np 22 | from sklearn.datasets import load_iris 23 | from sklearn.inspection import DecisionBoundaryDisplay 24 | 25 | from treeple._lib.sklearn.tree import DecisionTreeClassifier, plot_tree 26 | from treeple.tree import ObliqueDecisionTreeClassifier 27 | 28 | # %% 29 | # First load the copy of the Iris dataset shipped with scikit-learn: 30 | iris = load_iris() 31 | 32 | # Parameters 33 | n_classes = 3 34 | plot_colors = "ryb" 35 | plot_step = 0.02 36 | 37 | clf_labels = ["Axis-aligned", "Oblique"] 38 | random_state = 123456 39 | 40 | clfs = [ 41 | DecisionTreeClassifier(random_state=random_state), 42 | ObliqueDecisionTreeClassifier(random_state=random_state), 43 | ] 44 | 45 | for clf, clf_label in zip(clfs, clf_labels): 46 | fig, axes = plt.subplots(2, 3) 47 | axes = axes.flatten() 48 | 49 | for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]): 50 | # We only take the two corresponding features 51 | X = iris.data[:, pair] 52 | y = iris.target 53 | 54 | # Train 55 | clf.fit(X, y) 56 | 57 | # Plot the decision boundary 58 | ax = axes[pairidx] 59 | plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5) 60 | DecisionBoundaryDisplay.from_estimator( 61 | clf, 62 | X, 63 | cmap=plt.cm.RdYlBu, 64 | response_method="predict", 65 | ax=ax, 66 | xlabel=iris.feature_names[pair[0]], 67 | ylabel=iris.feature_names[pair[1]], 68 | ) 69 | 70 | # Plot the training points 71 | for i, color in zip(range(n_classes), plot_colors): 72 | idx = np.where(y == i) 73 | ax.scatter( 74 | X[idx, 0], 75 | X[idx, 1], 76 | c=color, 77 | label=iris.target_names[i], 78 | cmap=plt.cm.RdYlBu, 79 | edgecolor="black", 80 | s=15, 81 | ) 82 | 83 | fig.suptitle(f"Decision surface of {clf_label} decision trees trained on pairs of features") 84 | plt.legend(loc="lower right", borderpad=0, handletextpad=0) 85 | _ = plt.axis("tight") 86 | plt.show() 87 | 88 | # %% 89 | # Display the structure of a single decision tree trained on all the features 90 | # together. 91 | 92 | for clf, clf_label in zip(clfs, clf_labels): 93 | plt.figure() 94 | clf.fit(iris.data, iris.target) 95 | plot_tree(clf, filled=True) 96 | plt.title(f"{clf_label} decision tree trained on all the iris features") 97 | plt.show() 98 | -------------------------------------------------------------------------------- /.github/workflows/pr_checks.yml: -------------------------------------------------------------------------------- 1 | name: "PR Checks" 2 | 3 | concurrency: 4 | group: ${{ github.workflow }}-${{ github.ref }} 5 | cancel-in-progress: true 6 | 7 | on: 8 | pull_request: 9 | branches: 10 | - main 11 | paths: 12 | - "treeple/**" 13 | 14 | jobs: 15 | changelog: 16 | name: CHANGELOG 17 | runs-on: ubuntu-latest 18 | # if: github.event_name == 'pull_request' 19 | if: ${{ contains(github.event.pull_request.labels.*.name, 'No Changelog Needed') == 0 }} 20 | steps: 21 | - name: Get PR number and milestone 22 | run: | 23 | echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> $GITHUB_ENV 24 | echo "TAGGED_MILESTONE=${{ github.event.pull_request.milestone.title }}" >> $GITHUB_ENV 25 | - uses: actions/checkout@v4 26 | with: 27 | fetch-depth: "0" 28 | - name: Check that CHANGELOG has been updated 29 | run: | 30 | # If this step fails, this means you haven't updated the CHANGELOG.md 31 | # file with notes on your contribution. 32 | # git diff --name-only $(git merge-base origin/main HEAD) | grep '^CHANGELOG.md$' && echo "Thanks for helping keep our CHANGELOG up-to-date!" 
33 | set -xe 34 | changed_files=$(git diff --name-only origin/main) 35 | # Changelog should be updated only if tests have been modified 36 | if [[ ! "$changed_files" =~ tests ]] 37 | then 38 | exit 0 39 | fi 40 | all_changelogs=$(cat ./doc/whats_new/v*.rst) 41 | if [[ "$all_changelogs" =~ :pr:\`#$PR_NUMBER\` ]] 42 | then 43 | echo "Changelog has been updated." 44 | # If the pull request is milestoned, check the corresponding changelog 45 | if [ -f ./doc/whats_new/v${TAGGED_MILESTONE:0:4}.rst ] 46 | then 47 | expected_changelog=$(cat ./doc/whats_new/v${TAGGED_MILESTONE:0:4}.rst) 48 | if [[ "$expected_changelog" =~ :pr:\`#$PR_NUMBER\` ]] 49 | then 50 | echo "Changelog and milestone correspond." 51 | else 52 | echo "Changelog and milestone do not correspond." 53 | echo "If you see this error, make sure that the tagged milestone for the PR" 54 | echo "and the edited changelog filename properly match." 55 | exit 1 56 | fi 57 | fi 58 | else 59 | echo "A Changelog entry is missing for :pr:\`#$PR_NUMBER\`" 60 | echo "" 61 | echo "Please add an entry to the changelog at 'doc/whats_new/v*.rst'" 62 | echo "to document your change assuming that the PR will be merged" 63 | echo "in time for the next release of treeple." 64 | echo "" 65 | echo "Look at other entries in that file for inspiration and please" 66 | echo "reference this pull request using the ':pr:' directive and" 67 | echo "credit yourself (and other contributors if applicable) with" 68 | echo "the ':user:' directive." 69 | echo "" 70 | echo "If you see this error and there is already a changelog entry," 71 | echo "check that the PR number is correct." 72 | echo "" 73 | echo "If you believe that this PR does not warrant a changelog" 74 | echo "entry, say so in a comment so that a maintainer will label" 75 | echo "the PR with 'No Changelog Needed' to bypass this check."
76 | exit 1 77 | fi 78 | -------------------------------------------------------------------------------- /benchmarks_nonasv/bench_plot_urf.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from time import time 3 | 4 | import numpy as np 5 | from numpy import random as nr 6 | 7 | from treeple import UnsupervisedObliqueRandomForest, UnsupervisedRandomForest 8 | 9 | 10 | def compute_bench(samples_range, features_range): 11 | it = 0 12 | results = defaultdict(lambda: []) 13 | 14 | est_params = {"min_samples_split": 5, "criterion": "fastbic", "n_jobs": None} 15 | 16 | max_it = len(samples_range) * len(features_range) 17 | for n_samples in samples_range: 18 | for n_features in features_range: 19 | it += 1 20 | 21 | print("==============================") 22 | print("Iteration %03d of %03d" % (it, max_it)) 23 | print("==============================") 24 | print() 25 | print(f"n_samples: {n_samples} and n_features: {n_features}") 26 | data = nr.randint(-50, 51, (n_samples, n_features)) 27 | 28 | print("Unsupervised RF") 29 | tstart = time() 30 | est = UnsupervisedRandomForest(**est_params).fit(data) 31 | 32 | delta = time() - tstart 33 | max_depth = max(tree.get_depth() for tree in est.estimators_) 34 | print("Speed: %0.3fs" % delta) 35 | print("Max depth: %d" % max_depth) 36 | print() 37 | 38 | results["unsup_rf_speed"].append(delta) 39 | results["unsup_rf_depth"].append(max_depth) 40 | 41 | print("Unsupervised Oblique RF") 42 | # now benchmark the oblique variant on the same data 43 | est = UnsupervisedObliqueRandomForest(**est_params) 44 | tstart = time() 45 | est.fit(data) 46 | delta = time() - tstart 47 | max_depth = max(tree.get_depth() for tree in est.estimators_) 48 | print("Speed: %0.3fs" % delta) 49 | print("Max depth: %d" % max_depth) 50 | print() 51 | print() 52 | 53 | results["unsup_obliquerf_speed"].append(delta) 54 | results["unsup_obliquerf_depth"].append(max_depth) 55 | 56 | return results 57 | 58 | 59 | if __name__ == "__main__": 60 | import matplotlib.pyplot as plt 61 | from mpl_toolkits.mplot3d import axes3d  # noqa: F401 -- registers the 3d projection 62 | 63 | samples_range = np.linspace(50, 150, 5).astype(int) 64 | features_range = np.linspace(150, 50000, 5).astype(int) 65 | 66 | results = compute_bench(samples_range, features_range) 67 | 68 | max_time = max([max(i) for i in [t for (label, t) in results.items() if "speed" in label]]) 69 | max_depth_lim = max( 70 | [max(i) for i in [t for (label, t) in results.items() if "speed" not in label]] 71 | ) 72 | 73 | fig = plt.figure("treeple Unsupervised (Oblique and Axis) RF benchmark results") 74 | for c, (label, timings) in zip("brcy", sorted(results.items())): 75 | if "speed" in label: 76 | ax = fig.add_subplot(2, 1, 1, projection="3d") 77 | ax.set_zlim3d(0.0, max_time * 1.1) 78 | else: 79 | ax = fig.add_subplot(2, 1, 2, projection="3d") 80 | ax.set_zlim3d(0.0, max_depth_lim * 1.1) 81 | 82 | X, Y = np.meshgrid(samples_range, features_range) 83 | Z = np.asarray(timings).reshape(samples_range.shape[0], features_range.shape[0]) 84 | ax.plot_surface(X, Y, Z.T, cstride=1, rstride=1, color=c, alpha=0.5) 85 | ax.set_title(f"{label}") 86 | ax.set_xlabel("n_samples") 87 | ax.set_ylabel("n_features") 88 | 89 | plt.show() 90 | -------------------------------------------------------------------------------- /examples/treeple/treeple_tutorial_1_1b_MI.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | ============== 3 | Calculating MI 4 | ============== 5 | """ 6 | 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | import seaborn as sns 10 | from scipy.stats import entropy 11 | 12 | from treeple.datasets import make_trunk_classification 13 | from treeple.ensemble import HonestForestClassifier 14 | from treeple.stats import build_oob_forest 15 | 16 | sns.set(color_codes=True, style="white", context="talk", font_scale=1.5) 17 | PALETTE = sns.color_palette("Set1") 18 | sns.set_palette(PALETTE[1:5] + PALETTE[6:], n_colors=9) 19 | sns.set_style("white", {"axes.edgecolor": "#dddddd"}) 20 | # %% 21 | # MI 22 | # -- 23 | # 24 | # Mutual Information (*MI*) measures the mutual dependence between *X* and 25 | # *Y*. It can be calculated by the difference between the class entropy 26 | # (``H(Y)``) and the conditional entropy (``H(Y | X)``): 27 | # 28 | # .. math:: I(X; Y) = H(Y) - H(Y\mid X) 29 | # 30 | # With a binary class simulation as an example, this tutorial will show 31 | # how to use ``treeple`` to calculate the statistic. 32 | 33 | # %% 34 | # Create a simulation with two gaussians 35 | # -------------------------------------- 36 | 37 | 38 | # create a binary class simulation with two gaussians 39 | # 500 samples for each class, class zero is standard 40 | # gaussian, and class one has a mean at one 41 | X, y = make_trunk_classification( 42 | n_samples=1000, 43 | n_dim=1, 44 | mu_0=0, 45 | mu_1=1, 46 | n_informative=1, 47 | seed=1, 48 | ) 49 | 50 | 51 | fig, ax = plt.subplots(figsize=(6, 6)) 52 | fig.tight_layout() 53 | ax.tick_params(labelsize=15) 54 | 55 | # histogram plot the samples 56 | ax.hist(X[:500], bins=50, alpha=0.6, color=PALETTE[1], label="negative") 57 | ax.hist(X[500:], bins=50, alpha=0.3, color=PALETTE[0], label="positive") 58 | ax.set_xlabel("Variable One", fontsize=15) 59 | ax.set_ylabel("Likelihood", fontsize=15) 60 | plt.legend(frameon=False, fontsize=15) 61 | plt.show() 62 | 63 | 64 | # %% 65 | # Fit the model 66 | # ------------- 67 | 68 | 69 | # initialize the forest with 100 trees 70 | est = HonestForestClassifier( 71 | n_estimators=100, 72 | max_samples=1.6, 73 | max_features=0.3, 74 | bootstrap=True, 75 | stratify=True, 76 | random_state=1, 77 | ) 78 | 79 | # fit the model and obtain the tree posteriors 80 | _, observe_proba = build_oob_forest(est, X, y) 81 | 82 | # generate forest posteriors for the two classes 83 | observe_proba = np.nanmean(observe_proba, axis=0) 84 | 85 | 86 | fig, ax = plt.subplots(figsize=(6, 6)) 87 | fig.tight_layout() 88 | ax.tick_params(labelsize=15) 89 | 90 | # histogram plot the posterior probabilities for class one 91 | ax.hist(observe_proba[:500][:, 1], bins=50, alpha=0.6, color=PALETTE[1], label="negative") 92 | ax.hist(observe_proba[500:][:, 1], bins=50, alpha=0.3, color=PALETTE[0], label="positive") 93 | ax.set_ylabel("# of Samples", fontsize=15) 94 | ax.set_xlabel("Class One Posterior", fontsize=15) 95 | plt.legend(frameon=False, fontsize=15) 96 | plt.show() 97 | 98 | 99 | # %% 100 | # Calculate the statistic 101 | # ----------------------- 102 | def Calculate_MI(y_true, y_pred_proba): 103 | # calculate the conditional entropy 104 | H_YX = np.mean(entropy(y_pred_proba, base=np.exp(1), axis=1)) 105 | 106 | # empirical count of each class (n_classes) 107 | _, counts = np.unique(y_true, return_counts=True) 108 | # calculate the entropy of labels 109 | H_Y = entropy(counts, base=np.exp(1)) 110 | return H_Y - H_YX 111 | 112 | 113 | 
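# Note: ``entropy`` above is computed with the natural logarithm
# (``base=np.exp(1)``), so the mutual information reported below is measured
# in nats rather than bits.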
mi = Calculate_MI(y, observe_proba) 114 | print("MI =", round(mi, 2)) 115 | -------------------------------------------------------------------------------- /treeple/_lib/meson.build: -------------------------------------------------------------------------------- 1 | fs = import('fs') 2 | if not fs.exists('sklearn') 3 | error('Missing the `sklearn` fork submodule! Run `git submodule update --init` to fix this.') 4 | endif 5 | 6 | # install tree/ submodule 7 | tree_extension_metadata = { 8 | '_tree': 9 | {'sources': ['./sklearn/tree/' + '_tree.pyx'], 10 | 'override_options': ['cython_language=cpp', 'optimization=3']}, 11 | '_partitioner': 12 | {'sources': ['./sklearn/tree/' + '_partitioner.pyx'], 13 | 'override_options': ['cython_language=cpp', 'optimization=3']}, 14 | '_splitter': 15 | {'sources': ['./sklearn/tree/' + '_splitter.pyx'], 16 | 'override_options': ['cython_language=cpp', 'optimization=3']}, 17 | '_criterion': 18 | {'sources': ['./sklearn/tree/' + '_criterion.pyx'], 19 | 'override_options': ['cython_language=cpp', 'optimization=3']}, 20 | '_utils': 21 | {'sources': ['./sklearn/tree/' + '_utils.pyx'], 22 | 'override_options': ['cython_language=cpp', 'optimization=3']}, 23 | } 24 | 25 | 26 | foreach ext_name, ext_dict : tree_extension_metadata 27 | py.extension_module( 28 | ext_name, 29 | ext_dict.get('sources'), 30 | dependencies: [np_dep], 31 | override_options : ext_dict.get('override_options', []), 32 | cython_args: cython_c_args, 33 | subdir: 'treeple/_lib/sklearn/tree/', 34 | install: true 35 | ) 36 | endforeach 37 | 38 | python_sources = [ 39 | './sklearn/tree/__init__.py', 40 | './sklearn/tree/_classes.py', 41 | './sklearn/tree/_export.py', 42 | './sklearn/tree/_reingold_tilford.py', 43 | ] 44 | 45 | py.install_sources( 46 | python_sources, 47 | subdir: 'treeple/_lib/sklearn/tree' # Folder relative to site-packages to install to 48 | ) 49 | 50 | # install ensemble/ submodule 51 | python_sources = [ 52 | '_forest.py', 53 | ] 54 | foreach py_source: python_sources 55 | py.install_sources( 56 | './sklearn/ensemble/' + py_source, 57 | subdir: 'treeple/_lib/sklearn/ensemble' 58 | ) 59 | endforeach 60 | 61 | # TODO: Can remove if included in scikit-learn eventually 62 | # install neighbors/ submodule 63 | extensions = [ 64 | '_quad_tree', 65 | ] 66 | 67 | foreach ext: extensions 68 | py.extension_module( 69 | ext, 70 | ['./sklearn/neighbors/' + ext + '.pyx'], 71 | c_args: c_args, 72 | dependencies: [np_dep], 73 | cython_args: cython_c_args, 74 | override_options : ['optimization=3', 'cython_language=cpp'], 75 | install: true, 76 | subdir: 'treeple/_lib/sklearn/neighbors/', 77 | ) 78 | endforeach 79 | 80 | # install utils/ submodule 81 | extensions = [ 82 | '_typedefs', 83 | '_random', 84 | ] 85 | 86 | foreach ext: extensions 87 | py.extension_module(ext, 88 | ['./sklearn/utils/' + ext + '.pyx'], 89 | c_args: c_args, 90 | dependencies: [np_dep], 91 | cython_args: cython_c_args, 92 | override_options : ['optimization=3', 'cython_language=cpp'], 93 | install: true, 94 | subdir: 'treeple/_lib/sklearn/utils/', 95 | ) 96 | endforeach 97 | 98 | 99 | # python_sources = [ 100 | # '__init__.py', 101 | # ] 102 | 103 | # py.install_sources( 104 | # python_sources, 105 | # subdir: 'treeple/_lib' # Folder relative to site-packages to install to 106 | # ) 107 | 108 | # tempita = files('./sklearn/_build_utils/tempita.py') 109 | 110 | # # Copy all the .py files to the install dir, rather than using 111 | # # py.install_sources and needing to list them explicitly one by one 112 | # # 
install_subdir('sklearn', install_dir: py.get_install_dir()) 113 | # install_subdir('sklearn', install_dir: join_paths(py.get_install_dir(), 'treeple/_lib')) 114 | 115 | # subdir('sklearn') 116 | -------------------------------------------------------------------------------- /treeple/tree/unsupervised/_unsup_oblique_splitter.pxd: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from libcpp.vector cimport vector 4 | 5 | from ..._lib.sklearn.tree._splitter cimport SplitRecord 6 | from ..._lib.sklearn.tree._tree cimport ParentInfo 7 | from ..._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t, uint32_t 8 | from .._oblique_splitter cimport ObliqueSplitRecord 9 | from ._unsup_splitter cimport UnsupervisedSplitter 10 | 11 | # cdef struct ObliqueSplitRecord: 12 | # # Data to track sample split 13 | # intp_t feature # Which feature to split on. 14 | # intp_t pos # Split samples array at the given position, 15 | # # # i.e. count of samples below threshold for feature. 16 | # # # pos is >= end if the node is a leaf. 17 | # float64_t threshold # Threshold to split at. 18 | # float64_t improvement # Impurity improvement given parent node. 19 | # float64_t impurity_left # Impurity of the left split. 20 | # float64_t impurity_right # Impurity of the right split. 21 | # intp_t n_constant_features # Number of constant features in the split. 22 | 23 | # vector[float32_t]* proj_vec_weights # weights of the vector (max_features,) 24 | # vector[intp_t]* proj_vec_indices # indices of the features (max_features,) 25 | 26 | 27 | cdef class UnsupervisedObliqueSplitter(UnsupervisedSplitter): 28 | """ 29 | Notable changes wrt scikit-learn: 30 | 1. `weighted_n_node_samples` is used as a stopping criterion and just used to 31 | keep count of the "number of samples (weighted)". All samples have a default weight 32 | of '1'. 33 | 2. `X` array instead of `y` array is stored as the criterions are computed over the X 34 | array. 35 | """ 36 | 37 | # Oblique Splitting extra parameters 38 | cdef public float64_t feature_combinations # Number of features to combine 39 | cdef intp_t n_non_zeros # Number of non-zero features 40 | cdef vector[vector[float32_t]] proj_mat_weights # nonzero weights of sparse proj_mat matrix 41 | cdef vector[vector[intp_t]] proj_mat_indices # nonzero indices of sparse proj_mat matrix 42 | cdef intp_t[::1] indices_to_sample # an array of indices to sample of size mtry X n_features 43 | 44 | # All oblique splitters (i.e. non-axis aligned splitters) require a 45 | # function to sample a projection matrix that is applied to the feature matrix 46 | # to quickly obtain the sampled projections for candidate splits. 
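    # As a conceptual illustration (not part of this interface): for each sample
    # ``i`` in a node, the projected feature value that candidate splits are
    # evaluated on is the sparse dot product
    #     x_proj[i] = sum_k proj_vec_weights[k] * X[samples[i], proj_vec_indices[k]]
    # i.e. the dense equivalent of applying one sparse column of the sampled
    # projection matrix to the node's samples.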
47 | cdef void sample_proj_mat(self, 48 | vector[vector[float32_t]]& proj_mat_weights, 49 | vector[vector[intp_t]]& proj_mat_indices) noexcept nogil 50 | 51 | # Redefined here since the new logic requires calling sample_proj_mat 52 | cdef int node_reset(self, intp_t start, intp_t end, 53 | float64_t* weighted_n_node_samples) except -1 nogil 54 | 55 | cdef int node_split( 56 | self, 57 | ParentInfo* parent, 58 | SplitRecord* split, 59 | ) except -1 nogil 60 | cdef int init( 61 | self, 62 | const float32_t[:, :] X, 63 | const float64_t[:] sample_weight 64 | ) except -1 65 | cdef intp_t pointer_size(self) noexcept nogil 66 | 67 | cdef void compute_features_over_samples( 68 | self, 69 | intp_t start, 70 | intp_t end, 71 | const intp_t[:] samples, 72 | float32_t[:] feature_values, 73 | vector[float32_t]* proj_vec_weights, # weights of the vector (max_features,) 74 | vector[intp_t]* proj_vec_indices # indices of the features (max_features,) 75 | ) noexcept nogil 76 | -------------------------------------------------------------------------------- /examples/sparse_oblique_trees/plot_oblique_axis_aligned_forests_sparse_parity.py: -------------------------------------------------------------------------------- 1 | """ 2 | ========================================================================================== 3 | Plot oblique forest and axis-aligned random forest predictions on sparse parity simulation 4 | ========================================================================================== 5 | A performance comparison between oblique forest and standard axis- 6 | aligned random forest using sparse parity simulation dataset. 7 | Sparse parity is a variation of the noisy parity problem, 8 | which itself is a multivariate generalization of the noisy XOR problem. 9 | This is a binary classification task in high dimensions. The simulation 10 | will generate uniformly distributed `n_samples` number of sample points 11 | in the range of -1 and +1 with `p` number of features. `p*` is a 12 | parameter used to limit features that carry information about the class. 13 | The informative binary label is then defined as 1 if there are odd number 14 | of the sum of data `X` across first `p*` features that are greater than 0, 15 | otherwise the label is defined as 0. The simulation is further detailed 16 | in this [publication](https://epubs.siam.org/doi/epdf/10.1137/1.9781611974973.56). 
17 | """ 18 | 19 | from datetime import datetime 20 | 21 | import matplotlib.pyplot as plt 22 | import numpy as np 23 | import pandas as pd 24 | import seaborn as sns 25 | from sklearn.ensemble import RandomForestClassifier 26 | from sklearn.model_selection import RepeatedKFold, cross_validate 27 | 28 | from treeple import ObliqueRandomForestClassifier 29 | 30 | random_state = 123456 31 | t0 = datetime.now() 32 | 33 | 34 | def sparse_parity(n_samples, p=20, p_star=3, random_seed=None, **kwargs): 35 | if random_seed: 36 | np.random.seed(random_seed) 37 | 38 | X = np.random.uniform(-1, 1, (n_samples, p)) 39 | y = np.zeros(n_samples) 40 | 41 | for i in range(0, n_samples): 42 | y[i] = sum(X[i, :p_star] > 0) % 2 43 | 44 | return X, y 45 | 46 | 47 | def get_scores(X, y, n_cv=5, n_repeats=1, random_state=1, kwargs=None): 48 | clfs = [ 49 | RandomForestClassifier(**kwargs[0], random_state=random_state), 50 | ObliqueRandomForestClassifier(**kwargs[1], random_state=random_state), 51 | ] 52 | 53 | tmp = [] 54 | 55 | for i, clf in enumerate(clfs): 56 | cv = RepeatedKFold(n_splits=n_cv, n_repeats=n_repeats, random_state=random_state) 57 | test_score = cross_validate(estimator=clf, X=X, y=y, cv=cv, scoring="accuracy") 58 | 59 | tmp.append([["RF", "OF"][i], test_score["test_score"], test_score["test_score"].mean()]) 60 | 61 | df = pd.DataFrame(tmp, columns=["model", "score", "mean"]) 62 | df = df.explode("score") 63 | df["score"] = df["score"].astype(float) 64 | df.reset_index(inplace=True, drop=True) 65 | 66 | return df 67 | 68 | 69 | # Grid searched hyper-parameters 70 | params = [ 71 | {"max_features": None, "n_estimators": 100, "max_depth": None}, 72 | {"max_features": 40, "n_estimators": 100, "max_depth": 20}, 73 | ] 74 | 75 | X, y = sparse_parity(n_samples=1000, random_seed=random_state) 76 | 77 | df = get_scores(X=X, y=y, n_cv=3, n_repeats=1, random_state=random_state, kwargs=params) 78 | t_d = (datetime.now() - t0).seconds 79 | print(f"It took {t_d} seconds to run the script") 80 | 81 | # Draw a comparison plot 82 | fig, ax = plt.subplots(1, 1, figsize=(6, 6)) 83 | 84 | sns.stripplot(data=df, x="model", y="score", ax=ax, dodge=True) 85 | sns.boxplot(data=df, x="model", y="score", ax=ax, color="white") 86 | ax.set_title("Sparse Parity") 87 | 88 | rf = df.query('model=="RF"')["mean"].iloc[0] 89 | rff = f"RF (Mean Test Score: {round(rf,3)})" 90 | 91 | of = df.query('model=="OF"')["mean"].iloc[0] 92 | off = f"OF (Mean Test Score: {round(of,3)})" 93 | 94 | ax.legend([rff, off], loc=4) 95 | 96 | plt.savefig(f"plot_sim_{t_d}s.jpg") 97 | plt.show() 98 | -------------------------------------------------------------------------------- /treeple/__init__.py: -------------------------------------------------------------------------------- 1 | """Scikit manifold oblique random forests.""" 2 | 3 | import logging 4 | import os 5 | import sys 6 | 7 | __version__ = "0.10.3" 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | # On OSX, we can get a runtime error due to multiple OpenMP libraries loaded 12 | # simultaneously. This can happen for instance when calling BLAS inside a 13 | # prange. Setting the following environment variable allows multiple OpenMP 14 | # libraries to be loaded. 
It should not degrade performance since we manually 15 | # take care of potential over-subscription performance issues, in sections of 16 | # the code where nested OpenMP loops can happen, by dynamically reconfiguring 17 | # the inner OpenMP runtime to temporarily disable it while under the scope of 18 | # the outer OpenMP parallel section. 19 | os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "True") 20 | 21 | # Workaround issue discovered in intel-openmp 2019.5: 22 | # https://github.com/ContinuumIO/anaconda-issues/issues/11294 23 | os.environ.setdefault("KMP_INIT_AT_FORK", "FALSE") 24 | 25 | 26 | try: 27 | # This variable is injected in the __builtins__ by the build 28 | # process. It is used to enable importing subpackages of treeple when 29 | # the binaries are not built 30 | __treeple_SETUP__ # type: ignore 31 | except NameError: 32 | __treeple_SETUP__ = False 33 | 34 | if __treeple_SETUP__: 35 | sys.stderr.write("Running from treeple source directory.\n") 36 | sys.stderr.write("Partial import of treeple during the build process.\n") 37 | # We are not importing the rest of treeple during the build 38 | # process, as it may not be compiled yet 39 | else: 40 | try: 41 | from . import _lib, tree, ensemble, experimental, stats 42 | from ._lib.sklearn.ensemble._forest import ( 43 | RandomForestClassifier, 44 | RandomForestRegressor, 45 | ExtraTreesClassifier, 46 | ExtraTreesRegressor, 47 | ) 48 | from .neighbors import NearestNeighborsMetaEstimator 49 | from .ensemble import ExtendedIsolationForest, MultiViewRandomForestClassifier 50 | from .ensemble._unsupervised_forest import ( 51 | UnsupervisedRandomForest, 52 | UnsupervisedObliqueRandomForest, 53 | ) 54 | from .ensemble._supervised_forest import ( 55 | ExtraObliqueRandomForestClassifier, 56 | ExtraObliqueRandomForestRegressor, 57 | ObliqueRandomForestClassifier, 58 | ObliqueRandomForestRegressor, 59 | PatchObliqueRandomForestClassifier, 60 | PatchObliqueRandomForestRegressor, 61 | ) 62 | from .ensemble._honest_forest import HonestForestClassifier 63 | except ImportError as e: 64 | print(e.msg) 65 | msg = """Error importing treeple: you cannot import treeple while 66 | being in treeple source directory; please exit the treeple source 67 | tree first and relaunch your Python interpreter.""" 68 | raise Exception(e) 69 | # raise ImportError(msg) from e 70 | 71 | __all__ = [ 72 | "_lib", 73 | "tree", 74 | "experimental", 75 | "ensemble", 76 | "stats", 77 | "ExtraObliqueRandomForestClassifier", 78 | "ExtraObliqueRandomForestRegressor", 79 | "NearestNeighborsMetaEstimator", 80 | "ObliqueRandomForestClassifier", 81 | "ObliqueRandomForestRegressor", 82 | "PatchObliqueRandomForestClassifier", 83 | "PatchObliqueRandomForestRegressor", 84 | "UnsupervisedRandomForest", 85 | "UnsupervisedObliqueRandomForest", 86 | "HonestForestClassifier", 87 | "RandomForestClassifier", 88 | "RandomForestRegressor", 89 | "ExtraTreesClassifier", 90 | "ExtraTreesRegressor", 91 | "ExtendedIsolationForest", 92 | "MultiViewRandomForestClassifier", 93 | ] 94 | -------------------------------------------------------------------------------- /examples/quantile_predictions/plot_quantile_vs_standard_oblique_forest.py: -------------------------------------------------------------------------------- 1 | """ 2 | ============================================================== 3 | Quantile regression with oblique regression forest 4 | ============================================================== 5 | 6 | An example to generate quantile predictions using an oblique random forest 7 | 
instance on a synthetic, right-skewed dataset. 8 | 9 | This example was heavily inspired by `quantile-forest `_ 10 | package. See their package `here `_. 11 | """ 12 | 13 | from collections import defaultdict 14 | 15 | import matplotlib.pyplot as plt 16 | import numpy as np 17 | import scipy as sp 18 | from sklearn.model_selection import train_test_split 19 | from sklearn.utils.validation import check_random_state 20 | 21 | from treeple.ensemble import ObliqueRandomForestRegressor 22 | 23 | rng = check_random_state(0) 24 | 25 | # %% 26 | # Generate the data 27 | # ----------------- 28 | # We use a synthetic dataset with 2 features and 5000 samples. The target is 29 | # generated from a skewed normal distribution. (The mean of the distribution 30 | # is to the right of the median.) 31 | 32 | n_samples = 5000 33 | a, loc, scale = 5, -1, 1 34 | skewnorm_rv = sp.stats.skewnorm(a, loc, scale) 35 | skewnorm_rv.random_state = rng 36 | y = skewnorm_rv.rvs(n_samples) 37 | X = rng.randn(n_samples, 2) * y.reshape(-1, 1) 38 | 39 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0) 40 | 41 | regr_orf = ObliqueRandomForestRegressor(n_estimators=10, random_state=0) 42 | 43 | regr_orf.fit(X_train, y_train) 44 | 45 | y_pred_orf = regr_orf.predict(X_test) 46 | # %% 47 | # Generate Quantile Predictions 48 | # ----------------------------- 49 | # The idea is that, for each prediction, the training samples that fell into the same leaf nodes 50 | # are collected and then used to generate the quantile statistics for the desired prediction. 51 | 52 | # Get the leaf-nodes the training samples fall into 53 | leaf_ids = regr_orf.apply(X_train) 54 | # create a list of dictionaries (one per tree) that map each leaf node 55 | # to the samples that fell into it 56 | node_to_indices = [] 57 | for tree in range(leaf_ids.shape[1]): 58 | d = defaultdict(list) 59 | for id, leaf in enumerate(leaf_ids[:, tree]): 60 | d[leaf].append(id) 61 | node_to_indices.append(d) 62 | # apply the trained trees to X_test and 63 | # get the leaf node that each test sample falls into 64 | leaf_ids_test = regr_orf.apply(X_test) 65 | # for each sample, collect the indices of the samples that fell into 66 | # the same leaf node for each tree 67 | y_pred_quantile = [] 68 | for sample in range(leaf_ids_test.shape[0]): 69 | li = [ 70 | node_to_indices[tree][leaf_ids_test[sample][tree]] for tree in range(leaf_ids_test.shape[1]) 71 | ] 72 | # merge the list of indices into one 73 | idx = [item for sublist in li for item in sublist] 74 | # get the y_train for each corresponding id 75 | y_pred_quantile.append(y_train[idx]) 76 | # get the quantile predictions for each predicted sample 77 | y_pred_quantile = [np.quantile(y_pred_quantile[i], 0.5) for i in range(len(y_pred_quantile))] 78 | 79 | # %% 80 | # Plot the results 81 | # ---------------- 82 | # The plot shows the distribution of the actual target values and the predicted median 83 | # (i.e. 0.5 quantile), and the mean prediction by the oblique random forest regressor. 84 | # In this skewed dataset, the median prediction using the quantile method works better at 85 | # predicting the off-centered target distribution than the regular mean prediction.
86 | 87 | colors = ["#c0c0c0", "#a6e5ff", "#e7a4f5"] 88 | names = ["Actual", "QRF (Median)", "ORF (Mean)"] 89 | plt.hist([y_test, y_pred_quantile, y_pred_orf], bins=50, color=colors, label=names) 90 | plt.xlabel("Actual and Predicted Target Values") 91 | plt.ylabel("Counts") 92 | plt.legend() 93 | plt.show() 94 | -------------------------------------------------------------------------------- /examples/quantile_predictions/plot_quantile_toy_example_with_RF.py: -------------------------------------------------------------------------------- 1 | """ 2 | ====================================================== 3 | Quantile prediction with Random Forest Regressor class 4 | ====================================================== 5 | 6 | An example that demonstrates how to use the Random Forest to generate 7 | quantile predictions such as conditional median and prediction intervals. 8 | The example compares the predictions to a ground truth function used 9 | to generate noisy samples. 10 | 11 | This example was heavily inspired by `quantile-forest `_ 12 | package. See their package `here `_. 13 | """ 14 | 15 | from collections import defaultdict 16 | 17 | import matplotlib.pyplot as plt 18 | import numpy as np 19 | from sklearn.ensemble import RandomForestRegressor 20 | from sklearn.model_selection import train_test_split 21 | 22 | # %% 23 | # Generate the data 24 | 25 | 26 | def make_toy_dataset(n_samples, seed=0): 27 | rng = np.random.RandomState(seed) 28 | 29 | x = rng.uniform(0, 10, size=n_samples) 30 | f = x * np.sin(x) 31 | 32 | sigma = 0.25 + x / 10 33 | noise = rng.lognormal(sigma=sigma) - np.exp(sigma**2 / 2) 34 | y = f + noise 35 | 36 | return np.atleast_2d(x).T, y 37 | 38 | 39 | n_samples = 1000 40 | X, y = make_toy_dataset(n_samples) 41 | 42 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 43 | 44 | xx = np.atleast_2d(np.linspace(0, 10, n_samples)).T 45 | 46 | 47 | # %% 48 | # Fit the model to the training samples 49 | # ------------------------------------- 50 | 51 | rf = RandomForestRegressor(max_depth=3, random_state=0) 52 | rf.fit(X_train, y_train) 53 | 54 | y_pred = rf.predict(xx) 55 | 56 | # get the leaf nodes that each sample fell into 57 | leaf_ids = rf.apply(X_train) 58 | # create a list of dictionaries (one per tree) that map each leaf node 59 | # to the samples that fell into it 60 | node_to_indices = [] 61 | for tree in range(leaf_ids.shape[1]): 62 | d = defaultdict(list) 63 | for id, leaf in enumerate(leaf_ids[:, tree]): 64 | d[leaf].append(id) 65 | node_to_indices.append(d) 66 | # apply the trained trees to the evaluation grid ``xx`` and 67 | # get the leaf node that each grid sample falls into 68 | leaf_ids_test = rf.apply(xx) 69 | # for each sample, collect the indices of the samples that fell into 70 | # the same leaf node for each tree 71 | y_pred_quantile = [] 72 | for sample in range(leaf_ids_test.shape[0]): 73 | li = [ 74 | node_to_indices[tree][leaf_ids_test[sample][tree]] for tree in range(leaf_ids_test.shape[1]) 75 | ] 76 | # merge the list of indices into one 77 | idx = [item for sublist in li for item in sublist] 78 | # get the y_train for each corresponding id 79 | y_pred_quantile.append(y_train[idx]) 80 | # get the quantile predictions for each predicted sample 81 | y_pred_low = [np.quantile(y_pred_quantile[i], 0.025) for i in range(len(y_pred_quantile))] 82 | y_pred_med = [np.quantile(y_pred_quantile[i], 0.5) for i in range(len(y_pred_quantile))] 83 | y_pred_upp = [np.quantile(y_pred_quantile[i], 0.975) for i in range(len(y_pred_quantile))] 84 | 85 | # %% 86 | # Plot the 
results 87 | # ---------------- 88 | # Plot the conditional median and prediction intervals. 89 | # The blue line is the predicted median and the shaded area indicates the 95% confidence interval 90 | # of the prediction. The dots are the held-out test observations and the black line indicates the 91 | # function that was used to generate the noisy samples. 92 | 93 | plt.plot(X_test, y_test, ".", c="#f2a619", label="Test Observations", ms=5) 94 | plt.plot(xx, (xx * np.sin(xx)), c="black", label=r"$f(x) = x\,\sin(x)$", lw=2) 95 | plt.plot(xx, y_pred_med, c="#006aff", label="Predicted Median", lw=3, ms=5) 96 | plt.fill_between( 97 | xx.ravel(), 98 | y_pred_low, 99 | y_pred_upp, 100 | color="#e0f2ff", 101 | label="Predicted 95% Interval", 102 | ) 103 | plt.xlabel("$x$") 104 | plt.ylabel("$f(x)$") 105 | plt.legend(loc="upper left") 106 | plt.show() 107 | -------------------------------------------------------------------------------- /doc/modules/ensemble.rst: -------------------------------------------------------------------------------- 1 | .. _oblique_forests: 2 | 3 | Oblique Random Forests 4 | ---------------------- 5 | 6 | In oblique random forests (see the :class:`~treeple.ObliqueRandomForestClassifier` and 7 | :class:`~treeple.ObliqueRandomForestRegressor` classes), each tree in the ensemble is built 8 | from a sample drawn with replacement (i.e., a bootstrap sample) from the 9 | training set. An oblique random forest is the same as a random forest, 10 | except in how the splits are computed in each tree. 11 | 12 | Similar to how random forests achieve a reduced variance by combining diverse trees, 13 | sometimes at the cost of a slight increase in bias, oblique random forests aim to do the same. 14 | They are motivated to construct even more diverse trees, thereby improving model generalization. 15 | In practice, the variance reduction is often significant, hence yielding an overall better model. 16 | 17 | In contrast to the original publication :footcite:`breiman2001random`, the treeple 18 | implementation allows the user to control the number of features to combine in computing 19 | candidate splits. This is done via the ``feature_combinations`` parameter. For 20 | more information and intuition, see 21 | :ref:`documentation on oblique decision trees `. 22 | 23 | .. topic:: Examples: 24 | 25 | * :ref:`sphx_glr_auto_examples_sparse_oblique_trees_plot_oblique_random_forest.py` 26 | * :ref:`sphx_glr_auto_examples_sparse_oblique_trees_plot_oblique_axis_aligned_forests_sparse_parity.py` 27 | 28 | .. topic:: References 29 | 30 | .. footbibliography:: 31 | 32 | .. _oblique_forest_feature_importance: 33 | 34 | Feature importance evaluation 35 | ----------------------------- 36 | 37 | The relative rank (i.e. depth) of a feature used as a decision node in a 38 | tree can be used to assess the relative importance of that feature with 39 | respect to the predictability of the target variable. Features used at 40 | the top of the tree contribute to the final prediction decision of a 41 | larger fraction of the input samples. The **expected fraction of the 42 | samples** they contribute to can thus be used as an estimate of the 43 | **relative importance of the features**. In treeple, the fraction of 44 | samples a feature contributes to is combined with the decrease in impurity 45 | from splitting them to create a normalized estimate of the predictive power 46 | of that feature. This is essentially the same as how it is done in scikit-learn.
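As a concrete illustration, here is a minimal sketch of reading these
estimates off a fitted oblique forest (the synthetic dataset and the
hyperparameter values below are arbitrary placeholders, not a recommended
configuration)::

    from sklearn.datasets import make_classification

    from treeple import ObliqueRandomForestClassifier

    X, y = make_classification(n_samples=200, n_features=10, random_state=0)
    clf = ObliqueRandomForestClassifier(n_estimators=50, random_state=0)
    clf.fit(X, y)
    # ``feature_importances_`` has shape (n_features,) and sums to 1.0
    print(clf.feature_importances_)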
47 | 48 | By **averaging** the estimates of predictive ability over several randomized 49 | trees, one can **reduce the variance** of such an estimate and use it 50 | for feature selection. This is known as the mean decrease in impurity, or MDI. 51 | Refer to [L2014]_ for more information on MDI and feature importance 52 | evaluation with Random Forests. We implement the approach taken in :footcite:`Li2023manifold` 53 | and :footcite:`TomitaSPORF2020`. 54 | 55 | .. warning:: 56 | 57 | The impurity-based feature importances computed on tree-based models suffer 58 | from two flaws that can lead to misleading conclusions. First, they are 59 | computed on statistics derived from the training dataset and therefore **do 60 | not necessarily inform us on which features are most important to make good 61 | predictions on a held-out dataset**. Secondly, **they favor high cardinality 62 | features**, that is, features with many unique values. 63 | :ref:`sklearn:permutation_importance` is an alternative to impurity-based feature 64 | importance that does not suffer from these flaws. These two methods of 65 | obtaining feature importance are explored in: 66 | :ref:`sklearn:sphx_glr_auto_examples_inspection_plot_permutation_importance.py`. 67 | 68 | In practice those estimates are stored as an attribute named 69 | ``feature_importances_`` on the fitted model. This is an array with shape 70 | ``(n_features,)`` whose values are positive and sum to 1.0. The higher 71 | the value, the more important is the contribution of the matching feature 72 | to the prediction function. 73 | 74 | .. topic:: References 75 | 76 | .. footbibliography:: 77 | 78 | .. [L2014] Louppe, G. :arxiv:`"Understanding Random Forests: From Theory to 79 | Practice" <1407.7502>`, 80 | PhD Thesis, U. of Liege, 2014. 81 | -------------------------------------------------------------------------------- /treeple/tree/manifold/_morf_splitter.pxd: -------------------------------------------------------------------------------- 1 | # distutils: language = c++ 2 | 3 | # Authors: Adam Li 4 | # Chester Huynh 5 | # Parth Vora 6 | # 7 | # License: BSD 3 clause 8 | 9 | # See _oblique_splitter.pyx for details. 10 | 11 | import numpy as np 12 | 13 | from libcpp.vector cimport vector 14 | 15 | from ..._lib.sklearn.tree._splitter cimport SplitRecord 16 | from ..._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int8_t, intp_t, uint8_t, uint32_t 17 | from .._oblique_splitter cimport BestObliqueSplitter, ObliqueSplitRecord 18 | 19 | # https://github.com/cython/cython/blob/master/Cython/Includes/libcpp/algorithm.pxd 20 | # shows how to include standard library functions in Cython 21 | # This includes the discrete_distribution C++ class, which can be used 22 | # to generate samples from a discrete distribution with non-uniform probabilities. 23 | # cdef extern from "<random>" namespace "std" nogil: 24 | # cdef cppclass discrete_distribution[T] 25 | # ctypedef T int_type 26 | # ctypedef G generator_type 27 | # discrete_distribution(T first, T last) except + 28 | # operator()(&G) except + 29 | 30 | cdef class PatchSplitter(BestObliqueSplitter): 31 | # The PatchSplitter creates candidate feature values by sampling 2D patches from 32 | # an input data vector. The input data is vectorized, so ``data_dims`` is 33 | # used to determine the vectorized indices corresponding to 34 | # (x, y) coordinates in the original un-vectorized data.
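    # Illustrative note (editorial addition, not part of the build): for
    # row-major (C-ordered) vectorized data with ``data_dims = (H, W)``, the
    # column of X corresponding to pixel (r, c) is ``r * W + c``, which is how
    # a sampled patch's (row, col) offsets map back to feature indices.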
35 | cdef public intp_t ndim # The number of dimensions of the input data 36 | 37 | cdef const intp_t[:] data_dims # The dimensions of the input data 38 | cdef const intp_t[:] min_patch_dims # The minimum size of the patch to sample in each dimension 39 | cdef const intp_t[:] max_patch_dims # The maximum size of the patch to sample in each dimension 40 | cdef const uint8_t[:] dim_contiguous # A boolean array indicating whether each dimension is contiguous 41 | 42 | # TODO: check if this works and is necessary for discontiguous data 43 | # cdef intp_t[:] stride_offsets # The stride offsets for each dimension 44 | cdef bint _discontiguous 45 | 46 | cdef bytes boundary # how to sample the patch with boundary in mind 47 | cdef const float32_t[:, :] feature_weight # Whether or not to normalize each column of X when adding in a patch 48 | 49 | cdef intp_t[::1] _index_data_buffer 50 | cdef intp_t[::1] _index_patch_buffer 51 | cdef intp_t[:] patch_sampled_size # A buffer to store the dimensions of the sampled patch 52 | cdef intp_t[:] unraveled_patch_point # A buffer to store the unraveled patch point 53 | 54 | # All oblique splitters (i.e. non-axis aligned splitters) require a 55 | # function to sample a projection matrix that is applied to the feature matrix 56 | # to quickly obtain the sampled projections for candidate splits. 57 | cdef (intp_t, intp_t) sample_top_left_seed( 58 | self 59 | ) noexcept nogil 60 | 61 | cdef void sample_proj_mat( 62 | self, 63 | vector[vector[float32_t]]& proj_mat_weights, 64 | vector[vector[intp_t]]& proj_mat_indices 65 | ) noexcept nogil 66 | 67 | 68 | # cdef class UserKernelSplitter(PatchSplitter): 69 | # """A class to hold user-specified kernels.""" 70 | # cdef vector[float32_t[:, ::1]] kernel_dictionary # A list of C-contiguous 2D kernels 71 | 72 | 73 | cdef class GaussianKernelSplitter(PatchSplitter): 74 | """A class to hold Gaussian kernels. 75 | 76 | Overrides the weights that are generated to be sampled from a Gaussian distribution. 
77 | See: https://www.tutorialspoint.com/gaussian-filter-generation-in-cplusplus 78 | See: https://gist.github.com/thomasaarholt/267ec4fff40ca9dff1106490ea3b7567 79 | """ 80 | 81 | cdef void sample_proj_mat( 82 | self, 83 | vector[vector[float32_t]]& proj_mat_weights, 84 | vector[vector[intp_t]]& proj_mat_indices 85 | ) noexcept nogil 86 | -------------------------------------------------------------------------------- /treeple/stats/tests/test_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from numpy.testing import assert_array_equal 4 | 5 | from treeple import HonestForestClassifier 6 | from treeple.stats import ( 7 | PermutationHonestForestClassifier, 8 | build_cv_forest, 9 | build_permutation_forest, 10 | ) 11 | 12 | seed = 12345 13 | rng = np.random.default_rng(seed) 14 | 15 | 16 | @pytest.mark.parametrize("bootstrap, max_samples", [(True, 1.6), (False, None)]) 17 | def test_build_cv_honest_forest(bootstrap, max_samples): 18 | n_estimators = 100 19 | est = HonestForestClassifier( 20 | n_estimators=n_estimators, 21 | random_state=0, 22 | bootstrap=bootstrap, 23 | max_samples=max_samples, 24 | honest_fraction=0.5, 25 | stratify=True, 26 | ) 27 | X = rng.normal(0, 1, (100, 2)) 28 | X[:50] *= -1 29 | y = np.array([0, 1] * 50) 30 | samples = np.arange(len(y)) 31 | 32 | est_list, proba_list, train_idx_list, test_idx_list = build_cv_forest( 33 | est, 34 | X, 35 | y, 36 | return_indices=True, 37 | seed=seed, 38 | cv=3, 39 | ) 40 | 41 | assert isinstance(est_list, list) 42 | assert isinstance(proba_list, list) 43 | 44 | for est, proba, train_idx, test_idx in zip(est_list, proba_list, train_idx_list, test_idx_list): 45 | assert len(train_idx) + len(test_idx) == len(samples) 46 | structure_samples = est.structure_indices_ 47 | leaf_samples = est.honest_indices_ 48 | 49 | if not bootstrap: 50 | oob_samples = [[] for _ in range(est.n_estimators)] 51 | else: 52 | oob_samples = est.oob_samples_ 53 | 54 | # compared to oob samples, now the train samples are composed of the entire dataset 55 | # seen over the entire forest.
The test dataset is completely disjoint 56 | for tree_idx in range(est.n_estimators): 57 | n_samples_in_tree = len(structure_samples[tree_idx]) + len(leaf_samples[tree_idx]) 58 | assert n_samples_in_tree + len(oob_samples[tree_idx]) == len(train_idx), ( 59 | f"For tree: " 60 | f"{tree_idx} {len(structure_samples[tree_idx])} + " 61 | f"{len(leaf_samples[tree_idx])} + {len(oob_samples[tree_idx])} " 62 | f"!= {len(train_idx)} {len(test_idx)}" 63 | ) 64 | 65 | 66 | def test_build_permutation_forest(): 67 | """Simple test for building a permutation forest.""" 68 | n_estimators = 30 69 | n_samples = 100 70 | n_features = 3 71 | rng = np.random.default_rng(seed) 72 | 73 | # two clusters of uniform samples, offset by 10 74 | _X = rng.uniform(size=(n_samples // 2, n_features)) 75 | X2 = _X + 10 76 | X = np.vstack([_X, X2]) 77 | y = np.vstack( 78 | [np.zeros((n_samples // 2, 1)), np.ones((n_samples // 2, 1))] 79 | ) # Binary classification 80 | 81 | clf = HonestForestClassifier( 82 | n_estimators=n_estimators, random_state=seed, n_jobs=-1, honest_fraction=0.5, bootstrap=True 83 | ) 84 | perm_clf = PermutationHonestForestClassifier( 85 | n_estimators=n_estimators, random_state=seed, n_jobs=-1, honest_fraction=0.5, bootstrap=True 86 | ) 87 | with pytest.raises( 88 | RuntimeError, match="Permutation forest must be a PermutationHonestForestClassifier" 89 | ): 90 | build_permutation_forest(clf, clf, X, y, seed=seed) 91 | 92 | forest_result, orig_forest_proba, perm_forest_proba = build_permutation_forest( 93 | clf, perm_clf, X, y, metric="s@98", n_repeats=20, seed=seed 94 | ) 95 | assert forest_result.observe_test_stat > 0.1, f"{forest_result.observe_stat}" 96 | assert forest_result.pvalue <= 0.05, f"{forest_result.pvalue}" 97 | assert_array_equal(orig_forest_proba.shape, perm_forest_proba.shape) 98 | 99 | X = np.vstack([_X, _X]) 100 | forest_result, _, _ = build_permutation_forest( 101 | clf, perm_clf, X, y, metric="s@98", n_repeats=10, seed=seed 102 | ) 103 | assert forest_result.pvalue > 0.05, f"{forest_result.pvalue}" 104 | assert forest_result.observe_test_stat < 0.05, f"{forest_result.observe_test_stat}" 105 | -------------------------------------------------------------------------------- /examples/quantile_predictions/plot_quantile_interpolation_with_RF.py: -------------------------------------------------------------------------------- 1 | """ 2 | ======================================================== 3 | Predicting with different quantile interpolation methods 4 | ======================================================== 5 | 6 | An example comparison of interpolation methods that can be applied during 7 | prediction when the desired quantile lies between two data points. 8 | 9 | This example was heavily inspired by the `quantile-forest <https://github.com/zillow/quantile-forest>`_ 10 | package. See their package `here <https://zillow.github.io/quantile-forest/>`_. 11 | """ 12 | 13 | from collections import defaultdict 14 | 15 | import matplotlib.pyplot as plt 16 | import numpy as np 17 | from sklearn.ensemble import RandomForestRegressor 18 | 19 | # %% 20 | # Generate the data 21 | # ----------------- 22 | # We use five simple data points to illustrate the difference between the intervals that are 23 | # generated using different interpolation methods.
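# (Editorial aside, not part of the original example: a quick self-contained
# look at how the five ``numpy.quantile`` options can disagree whenever the
# requested quantile falls between two data points.)
for method in ["linear", "lower", "higher", "midpoint", "nearest"]:
    # the 0.4 quantile of [1, 2, 3, 4] is 2.2 (linear), 2 (lower), 3 (higher),
    # 2.5 (midpoint), or 2 (nearest)
    print(method, np.quantile([1, 2, 3, 4], 0.4, method=method))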
24 | 25 | X = np.array([[-1, -1], [-1, -1], [-1, -1], [1, 1], [1, 1]]) 26 | y = np.array([-2, -1, 0, 1, 2]) 27 | 28 | # %% 29 | # The interpolation methods 30 | # ------------------------- 31 | # The interpolation methods demonstrated here are ``linear``, ``lower``, 32 | # ``higher``, ``midpoint``, and ``nearest``. They determine how the quantile is computed 33 | # when it lies between two data points ``i`` and ``j`` (``i <= j``). For more details, see :func:`numpy.quantile`. 34 | # The difference between the methods can be illustrated with the following example: 35 | 36 | interpolations = ["linear", "lower", "higher", "midpoint", "nearest"] 37 | colors = ["#006aff", "#ffd237", "#0d4599", "#f2a619", "#a6e5ff"] 38 | quantiles = [0.025, 0.5, 0.975] 39 | 40 | y_medians = [] 41 | y_errs = [] 42 | est = RandomForestRegressor( 43 | n_estimators=1, 44 | random_state=0, 45 | ) 46 | # fit the model 47 | est.fit(X, y) 48 | # get the leaf nodes that each sample fell into 49 | leaf_ids = est.apply(X) 50 | # create a list of dictionaries that map each leaf node to the samples that fell into it, 51 | # one for each tree 52 | node_to_indices = [] 53 | for tree in range(leaf_ids.shape[1]): 54 | d = defaultdict(list) 55 | for sample_id, leaf in enumerate(leaf_ids[:, tree]): 56 | d[leaf].append(sample_id) 57 | node_to_indices.append(d) 58 | # pass the data through the trained tree and 59 | # get the ids of the leaf nodes it falls into 60 | leaf_ids_test = est.apply(X) 61 | # for each sample, collect the indices of the samples that fell into 62 | # the same leaf node for each tree 63 | y_pred_quantile = [] 64 | for sample in range(leaf_ids_test.shape[0]): 65 | li = [ 66 | node_to_indices[tree][leaf_ids_test[sample][tree]] for tree in range(leaf_ids_test.shape[1]) 67 | ] 68 | # merge the list of indices into one 69 | idx = [item for sublist in li for item in sublist] 70 | # get the y value for each corresponding id 71 | y_pred_quantile.append(y[idx]) 72 | 73 | for interpolation in interpolations: 74 | # get the quantile predictions for each predicted sample 75 | y_pred = [ 76 | np.array( 77 | [ 78 | np.quantile(y_pred_quantile[i], quantile, method=interpolation) 79 | for i in range(len(y_pred_quantile)) 80 | ] 81 | ) 82 | for quantile in quantiles 83 | ] 84 | y_medians.append(y_pred[1]) 85 | y_errs.append( 86 | np.concatenate( 87 | ( 88 | [y_pred[1] - y_pred[0]], 89 | [y_pred[2] - y_pred[1]], 90 | ), 91 | axis=0, 92 | ) 93 | ) 94 | 95 | sc = plt.scatter(np.arange(len(y)) - 0.35, y, color="k", zorder=10) 96 | ebs = [] 97 | for i, (median, y_err) in enumerate(zip(y_medians, y_errs)): 98 | ebs.append( 99 | plt.errorbar( 100 | np.arange(len(y)) + (0.15 * (i + 1)) - 0.35, 101 | median, 102 | yerr=y_err, 103 | color=colors[i], 104 | ecolor=colors[i], 105 | fmt="o", 106 | ) 107 | ) 108 | plt.xlim([-0.75, len(y) - 0.25]) 109 | plt.xticks(np.arange(len(y)), X.tolist()) 110 | plt.xlabel("Samples (Feature Values)") 111 | plt.ylabel("Actual and Predicted Values") 112 | plt.legend([sc] + ebs, ["actual"] + interpolations, loc=2) 113 | plt.show() 114 | -------------------------------------------------------------------------------- /treeple/tests/test_extensions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from numpy.testing import assert_array_equal 4 | from sklearn.datasets import make_classification 5 | 6 | from treeple import ( 7 | ExtraObliqueRandomForestClassifier, 8 | ExtraObliqueRandomForestRegressor, 9 | HonestForestClassifier, 10 | ObliqueRandomForestClassifier, 11 | ObliqueRandomForestRegressor, 12 |
PatchObliqueRandomForestClassifier, 13 | PatchObliqueRandomForestRegressor, 14 | ) 15 | 16 | 17 | @pytest.mark.parametrize("n_classes", [2, 3]) 18 | @pytest.mark.parametrize( 19 | "Forest", 20 | [ 21 | HonestForestClassifier, 22 | ExtraObliqueRandomForestClassifier, 23 | ObliqueRandomForestClassifier, 24 | PatchObliqueRandomForestClassifier, 25 | ], 26 | ) 27 | def test_predict_proba_per_tree(Forest, n_classes): 28 | # Build a classification dataset and check that ``predict_proba_per_tree`` 29 | # returns per-tree posteriors of the expected shape for each forest variant 30 | X, y = make_classification( 31 | n_samples=100, n_features=50, n_informative=20, n_classes=n_classes, random_state=0 32 | ) 33 | 34 | # Call the method being tested 35 | if Forest == HonestForestClassifier: 36 | est = Forest(n_estimators=10, bootstrap=True, random_state=0, honest_prior="empirical") 37 | else: 38 | est = Forest(n_estimators=10, bootstrap=True, random_state=0) 39 | est.fit(X, y) 40 | proba_per_tree = est.predict_proba_per_tree(X) 41 | 42 | # Perform assertions to check the correctness of the output 43 | assert proba_per_tree.shape[0] == est.n_estimators 44 | assert proba_per_tree.shape[1] == X.shape[0] 45 | assert proba_per_tree.shape[2] == est.n_classes_ 46 | assert not np.isnan(proba_per_tree).any() 47 | 48 | proba_per_tree = est.predict_proba_per_tree(X, est.oob_samples_) 49 | # Perform assertions to check the correctness of the output 50 | assert proba_per_tree.shape[0] == est.n_estimators 51 | assert proba_per_tree.shape[1] == X.shape[0] 52 | assert proba_per_tree.shape[2] == est.n_classes_ 53 | assert np.isnan(proba_per_tree).any() 54 | 55 | 56 | @pytest.mark.parametrize( 57 | "Forest", 58 | [ 59 | HonestForestClassifier, 60 | ExtraObliqueRandomForestClassifier, 61 | ObliqueRandomForestClassifier, 62 | PatchObliqueRandomForestClassifier, 63 | ObliqueRandomForestRegressor, 64 | PatchObliqueRandomForestRegressor, 65 | ExtraObliqueRandomForestRegressor, 66 | ], 67 | ) 68 | @pytest.mark.parametrize("bootstrap", [True, False]) 69 | @pytest.mark.parametrize("random_state", [None, 0]) 70 | def test_forest_has_deterministic_sampling_for_oob_structure_and_leaves( 71 | Forest, bootstrap, random_state 72 | ): 73 | """Test that forest models can produce the oob and inbag samples deterministically. 74 | 75 | When bootstrap is True, the OOB samples should be disjoint from the in-bag samples. 76 | When bootstrap is False, there are no OOB samples.
77 | """ 78 | rng = np.random.default_rng(0) 79 | 80 | n_estimators = 5 81 | est = Forest( 82 | n_estimators=n_estimators, 83 | random_state=random_state, 84 | bootstrap=bootstrap, 85 | ) 86 | X = rng.normal(0, 1, (100, 2)) 87 | X[:50] *= -1 88 | y = [0, 1] * 50 89 | samples = np.arange(len(y)) 90 | 91 | est.fit(X, y) 92 | 93 | inbag_samples = est.estimators_samples_ 94 | oob_samples = [ 95 | [idx for idx in samples if idx not in inbag_samples[jdx]] for jdx in range(n_estimators) 96 | ] 97 | if not bootstrap: 98 | assert all(oob_list_ == [] for oob_list_ in oob_samples) 99 | 100 | with pytest.raises(RuntimeError, match="Cannot extract out-of-bag samples"): 101 | est.oob_samples_ 102 | else: 103 | oob_samples_ = est.oob_samples_ 104 | for itree in range(n_estimators): 105 | assert len(oob_samples[itree]) > 1, oob_samples[itree] 106 | assert set(inbag_samples[itree]).intersection(set(oob_samples_[itree])) == set() 107 | assert set(inbag_samples[itree]).union(set(oob_samples_[itree])) == set(samples) 108 | assert_array_equal(oob_samples_[itree], oob_samples[itree]) 109 | -------------------------------------------------------------------------------- /treeple/stats/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | 4 | import numpy as np 5 | import pytest 6 | import scipy.sparse as sp 7 | from numpy.testing import assert_array_equal 8 | 9 | import treeple.stats.utils as utils 10 | from treeple import HonestForestClassifier 11 | from treeple.stats.utils import get_per_tree_oob_samples 12 | 13 | seed = 1234 14 | rng = np.random.default_rng(seed) 15 | 16 | 17 | @pytest.mark.parametrize("bootstrap", [True, False]) 18 | def test_get_per_tree_oob_samples(bootstrap): 19 | n_estimators = 5 20 | est = HonestForestClassifier(n_estimators=n_estimators, random_state=0, bootstrap=bootstrap) 21 | 22 | X = rng.normal(0, 1, (100, 2)) 23 | X[:50] *= -1 24 | y = [0, 1] * 50 25 | samples = np.arange(len(y)) 26 | est.fit(X, y) 27 | 28 | if bootstrap: 29 | inbag_samples = est.estimators_samples_ 30 | oob_samples = [ 31 | [idx for idx in samples if idx not in inbag_samples[jdx]] for jdx in range(n_estimators) 32 | ] 33 | oob_samples_ = get_per_tree_oob_samples(est) 34 | for itree in range(n_estimators): 35 | assert len(oob_samples[itree]) > 1 36 | assert_array_equal(oob_samples_[itree], oob_samples[itree]) 37 | else: 38 | with pytest.raises(RuntimeError, match="Cannot extract out-of-bag samples"): 39 | get_per_tree_oob_samples(est) 40 | 41 | 42 | @pytest.mark.parametrize("use_bottleneck", [True, False]) 43 | def test_non_nan_samples(use_bottleneck: bool): 44 | if use_bottleneck and utils.DISABLE_BN_ENV_VAR in os.environ: 45 | del os.environ[utils.DISABLE_BN_ENV_VAR] 46 | importlib.reload(utils) 47 | else: 48 | os.environ[utils.DISABLE_BN_ENV_VAR] = "1" 49 | importlib.reload(utils) 50 | 51 | posterior_array = np.array( 52 | [ 53 | # tree 1 54 | [ 55 | [0, 1], 56 | [np.nan, np.nan], 57 | [np.nan, np.nan], 58 | ], 59 | # tree 2 60 | [ 61 | [0, 1], 62 | [np.nan, np.nan], 63 | [1, 0], 64 | ], 65 | ] 66 | ) # [2, 3, 2] 67 | 68 | expected = np.array([0, 2]) 69 | actual = utils._non_nan_samples(posterior_array) 70 | np.testing.assert_array_equal(expected, actual) 71 | 72 | 73 | @pytest.mark.parametrize("use_bottleneck", [True, False]) 74 | def test_nanmean_f(use_bottleneck: bool): 75 | if use_bottleneck and utils.DISABLE_BN_ENV_VAR in os.environ: 76 | del os.environ[utils.DISABLE_BN_ENV_VAR] 77 | importlib.reload(utils) 78 | else: 79 
| os.environ[utils.DISABLE_BN_ENV_VAR] = "1" 80 | importlib.reload(utils) 81 | 82 | posterior_array = np.array( 83 | [ 84 | [1, 2, np.nan], 85 | [3, 4, np.nan], 86 | ] 87 | ) 88 | 89 | expected = np.array([1.5, 3.5]) 90 | actual = utils.nanmean_f(posterior_array, axis=1) 91 | np.testing.assert_array_equal(expected, actual) 92 | 93 | 94 | @pytest.mark.parametrize( 95 | ("forest_indices", "expected"), 96 | [ 97 | (np.arange(3), np.array([0.375, 0.75, 0.25])), 98 | (np.arange(3) + 2, np.array([0.10, 0.05, 0.25])), 99 | (np.arange(3) + 3, np.array([0.10, 0.45, np.nan])), 100 | ], 101 | ) 102 | def test_get_forest_preds_sparse( 103 | forest_indices, 104 | expected, 105 | ): 106 | 107 | all_y_pred = sp.csc_matrix( 108 | np.array( 109 | [ 110 | [0.50, 0.00, 0.00], 111 | [0.25, 0.75, 0.00], 112 | [0.00, 0.00, 0.25], 113 | [0.10, 0.00, 0.00], 114 | [0.00, 0.05, 0.00], 115 | [0.00, 0.85, 0.00], 116 | ] 117 | ) 118 | ) 119 | 120 | all_y_indicator = sp.csc_matrix( 121 | np.array( 122 | [ 123 | [1, 0, 0], 124 | [1, 1, 0], 125 | [0, 0, 1], 126 | [1, 0, 0], 127 | [0, 1, 0], 128 | [0, 1, 0], 129 | ] 130 | ) 131 | ) 132 | 133 | np.testing.assert_array_equal( 134 | utils._get_forest_preds_sparse(all_y_pred, all_y_indicator, forest_indices), 135 | expected, 136 | ) 137 | -------------------------------------------------------------------------------- /treeple/tests/test_unsupervised_forest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from sklearn import datasets 4 | from sklearn.cluster import AgglomerativeClustering 5 | from sklearn.datasets import make_blobs 6 | from sklearn.metrics import adjusted_rand_score 7 | from sklearn.utils.estimator_checks import parametrize_with_checks 8 | 9 | from treeple.ensemble import UnsupervisedObliqueRandomForest, UnsupervisedRandomForest 10 | 11 | CLUSTER_CRITERIONS = ("twomeans", "fastbic") 12 | 13 | FOREST_CLUSTERS = { 14 | "UnsupervisedRandomForest": UnsupervisedRandomForest, 15 | "UnsupervisedObliqueRandomForest": UnsupervisedObliqueRandomForest, 16 | } 17 | 18 | # load iris dataset 19 | iris = datasets.load_iris() 20 | rng = np.random.RandomState(1) 21 | perm = rng.permutation(iris.target.size) 22 | iris.data = iris.data[perm] 23 | iris.target = iris.target[perm] 24 | 25 | 26 | @parametrize_with_checks( 27 | [ 28 | UnsupervisedRandomForest(random_state=12345, n_estimators=50), 29 | UnsupervisedObliqueRandomForest(random_state=12345, n_estimators=50), 30 | ] 31 | ) 32 | def test_sklearn_compatible_estimator(estimator, check): 33 | if check.func.__name__ in [ 34 | # Cannot apply agglomerative clustering on < 2 samples 35 | "check_methods_subset_invariance", 36 | # sample weights do not necessarily imply a sample is not used in clustering 37 | "check_sample_weight_equivalence", 38 | "check_sample_weight_equivalence_on_dense_data", 39 | "check_sample_weight_equivalence_on_sparse_data", 40 | # sample order is not preserved in predict 41 | "check_methods_sample_order_invariance", 42 | ]: 43 | pytest.skip() 44 | check(estimator) 45 | 46 | 47 | @pytest.mark.parametrize("name, forest", FOREST_CLUSTERS.items()) 48 | @pytest.mark.parametrize("criterion", CLUSTER_CRITERIONS) 49 | def test_check_simulation(name, forest, criterion): 50 | n_samples = 200 51 | n_classes = 2 52 | 53 | # set the expected clustering score for each forest type and criterion 54 | if name == "UnsupervisedRandomForest": 55 | n_features = 5 56 | if criterion == "twomeans": 57 | expected_score = 0.05 58 | elif criterion == "fastbic": 59 | expected_score = 0.35 60 | else: 61 | n_features = 20
 62 | 63 | # in the forest setting, we can overfit the training dataset perfectly 64 | expected_score = 1.0 65 | X, y = make_blobs( 66 | n_samples=n_samples, centers=n_classes, n_features=n_features, random_state=12345 67 | ) 68 | 69 | clf = forest(criterion=criterion, random_state=12345) 70 | clf.fit(X) 71 | sim_mat = clf.compute_similarity_matrix(X) 72 | 73 | # all ones along the diagonal 74 | assert np.array_equal(sim_mat.diagonal(), np.ones(n_samples)) 75 | 76 | cluster = AgglomerativeClustering(n_clusters=n_classes).fit(sim_mat) 77 | predict_labels = cluster.fit_predict(sim_mat) 78 | score = adjusted_rand_score(y, predict_labels) 79 | 80 | # XXX: This should be > 0.9 according to the UReRF. However, that could be because they used 81 | # the oblique projections by default 82 | assert ( 83 | score >= expected_score 84 | ), f"{name}-blobs failed with criterion {criterion} and score = {score}" 85 | 86 | 87 | @pytest.mark.parametrize("name, forest", FOREST_CLUSTERS.items()) 88 | @pytest.mark.parametrize("criterion", CLUSTER_CRITERIONS) 89 | def test_check_iris(name, forest, criterion): 90 | # Check consistency on dataset iris. 91 | n_classes = 3 92 | est = forest(criterion=criterion, random_state=12345) 93 | est.fit(iris.data, iris.target) 94 | sim_mat = est.compute_similarity_matrix(iris.data) 95 | 96 | if criterion == "twomeans": 97 | if "oblique" in name.lower(): 98 | expected_score = 0.21 99 | else: 100 | expected_score = 0.2 101 | elif criterion == "fastbic": 102 | if "oblique" in name.lower(): 103 | expected_score = 0.55 104 | else: 105 | expected_score = 0.3 106 | 107 | cluster = AgglomerativeClustering(n_clusters=n_classes).fit(sim_mat) 108 | predict_labels = cluster.fit_predict(sim_mat) 109 | score = adjusted_rand_score(iris.target, predict_labels) 110 | 111 | # Two-means and fastBIC criteria perform similarly here 112 | assert ( 113 | score > expected_score 114 | ), f"{name}-iris failed with criterion {criterion} and score = {score}" 115 | -------------------------------------------------------------------------------- /treeple/meson.build: -------------------------------------------------------------------------------- 1 | # Platform detection 2 | is_windows = host_machine.system() == 'windows' 3 | is_mingw = is_windows and cc.get_id() == 'gcc' 4 | 5 | c_args = [] 6 | cython_c_args = [] 7 | if is_windows 8 | # For mingw-w64, link statically against the UCRT. 9 | gcc_link_args = ['-lucrt', '-static'] 10 | if is_mingw 11 | add_project_link_arguments(gcc_link_args, language: ['c', 'cpp']) 12 | # Force gcc to float64 long doubles for compatibility with MSVC 13 | # builds, for C only. 14 | add_project_arguments('-mlong-double-64', language: 'c') 15 | # Make fprintf("%zd") work (see https://github.com/rgommers/scipy/issues/118) 16 | add_project_arguments('-D__USE_MINGW_ANSI_STDIO=1', language: ['c', 'cpp']) 17 | # Manually add the MS_WIN64 macro when not using MSVC. 18 | # https://bugs.python.org/issue28267 19 | bitness = run_command( 20 | '_build_utils/gcc_build_bitness.py', 21 | check: true 22 | ).stdout().strip() 23 | if bitness == '64' 24 | add_project_arguments('-DMS_WIN64', language: ['c', 'cpp']) 25 | endif 26 | # Silence warnings emitted by PyOS_snprintf for (%zd), see 27 | # https://github.com/rgommers/scipy/issues/118.
28 | # Use as c_args for extensions containing Cython code 29 | c_args += ['-Wno-format-extra-args', '-Wno-format'] 30 | endif 31 | endif 32 | 33 | openmp_dep = dependency('OpenMP', language: 'c', required: false) 34 | 35 | if not openmp_dep.found() 36 | warning( 37 | ''' 38 | *********** 39 | * WARNING * 40 | *********** 41 | 42 | It seems that treeple cannot be built with OpenMP. 43 | 44 | - Make sure you have followed the installation instructions: 45 | 46 | https://scikit-learn.org/dev/developers/advanced_installation.html 47 | 48 | - If your compiler supports OpenMP but you still see this 49 | message, please submit a bug report at: 50 | 51 | https://github.com/treeple/treeple/issues 52 | 53 | - The build will continue with OpenMP-based parallelism 54 | disabled. Note however that some estimators will run in 55 | sequential mode instead of leveraging thread-based 56 | parallelism. 57 | 58 | *** 59 | ''') 60 | endif 61 | 62 | # NumPy include directory - needed in all submodules 63 | incdir_numpy = meson.get_external_property('numpy-include-dir', 'not-given') 64 | if incdir_numpy == 'not-given' 65 | incdir_numpy = run_command(py, 66 | [ 67 | '-c', 68 | ''' 69 | import os 70 | import numpy as np 71 | try: 72 | incdir = os.path.relpath(np.get_include()) 73 | except Exception: 74 | incdir = np.get_include() 75 | print(incdir) 76 | ''' 77 | ], 78 | check: true 79 | ).stdout().strip() 80 | endif 81 | 82 | inc_np = include_directories(incdir_numpy) 83 | # Don't use the deprecated NumPy C API. Define this to a fixed version instead of 84 | # NPY_API_VERSION in order not to break compilation for released SciPy versions 85 | # when NumPy introduces a new deprecation. 86 | numpy_no_deprecated_api = ['-DNPY_NO_DEPRECATED_API=NPY_1_9_API_VERSION'] 87 | np_dep = declare_dependency(include_directories: inc_np, compile_args: numpy_no_deprecated_api) 88 | 89 | cc = meson.get_compiler('c') 90 | 91 | # Don't use the deprecated NumPy C API. Define this to a fixed version instead of 92 | # NPY_API_VERSION in order not to break compilation for released versions 93 | # when NumPy introduces a new deprecation. Use in a meson.build file:: 94 | # 95 | # py.extension_module('_name', 96 | # 'source_fname', 97 | # numpy_nodepr_api) 98 | 99 | # TODO XXX: ENABLE WHEN DEBUGGING 100 | boundscheck = 'False' 101 | 102 | scikit_learn_cython_args = [ 103 | '-X language_level=3', '-X boundscheck=' + boundscheck, '-X wraparound=False', 104 | '-X initializedcheck=False', '-X nonecheck=False', '-X cdivision=True', 105 | '-X profile=False', 106 | '-X embedsignature=True', 107 | # Needed for cython imports across subpackages, e.g. 
cluster pyx that 108 | # cimports metrics pxd 109 | '--include-dir', meson.global_build_root(), 110 | ] 111 | cython_c_args += scikit_learn_cython_args 112 | 113 | python_sources = [ 114 | '__init__.py', 115 | 'neighbors.py', 116 | 'conftest.py', 117 | ] 118 | 119 | py.install_sources( 120 | python_sources, 121 | subdir: 'treeple' 122 | ) 123 | 124 | subdir('_lib') 125 | subdir('ensemble') 126 | subdir('experimental') 127 | subdir('stats') 128 | subdir('tests') 129 | subdir('tree') 130 | subdir('datasets') 131 | -------------------------------------------------------------------------------- /treeple/experimental/tests/test_sdf.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from sklearn import datasets 4 | from sklearn.metrics import accuracy_score, r2_score 5 | from sklearn.utils.estimator_checks import parametrize_with_checks 6 | 7 | from treeple.experimental import StreamDecisionForest 8 | 9 | CLF_CRITERIONS = ("gini", "entropy") 10 | 11 | # also load the iris dataset 12 | # and randomly permute it 13 | iris = datasets.load_iris() 14 | rng = np.random.RandomState(1) 15 | perm = rng.permutation(iris.target.size) 16 | iris.data = iris.data[perm] 17 | iris.target = iris.target[perm] 18 | 19 | 20 | def test_toy_accuracy(): 21 | clf = StreamDecisionForest(n_estimators=10) 22 | X = np.ones((20, 4)) 23 | X[10:] *= -1 24 | y = [0] * 10 + [1] * 10 25 | clf = clf.fit(X, y) 26 | np.testing.assert_array_equal(clf.predict(X), y) 27 | 28 | 29 | def test_first_fit(): 30 | clf = StreamDecisionForest(n_estimators=10) 31 | with pytest.raises( 32 | ValueError, match="classes must be passed on the first call to partial_fit." 33 | ): 34 | clf.partial_fit(iris.data, iris.target) 35 | 36 | 37 | @pytest.mark.parametrize("criterion", ["gini", "entropy"]) 38 | @pytest.mark.parametrize("max_features", [None, 2]) 39 | def test_iris(criterion, max_features): 40 | # Check consistency on dataset iris. 41 | clf = StreamDecisionForest( 42 | criterion=criterion, 43 | random_state=0, 44 | max_features=max_features, 45 | n_estimators=10, 46 | ) 47 | 48 | clf.partial_fit(iris.data, iris.target, classes=np.unique(iris.target)) 49 | score = accuracy_score(clf.predict(iris.data), iris.target) 50 | 51 | assert score > 0.5 and score <= 1.0, "Failed with {0}, criterion = {1} and score = {2}".format( 52 | "SDF", criterion, score 53 | ) 54 | 55 | score = accuracy_score(clf.predict(iris.data), clf.predict_proba(iris.data).argmax(1)) 56 | assert score == 1.0, "Failed with {0}, criterion = {1} and score = {2}".format( 57 | "SDF", criterion, score 58 | ) 59 | 60 | clf.partial_fit(iris.data, iris.target) 61 | score = accuracy_score(clf.predict(iris.data), iris.target) 62 | 63 | assert ( 64 | score > 0.5 and score <= 1.0 65 | ), "Failed partial_fit with {0}, criterion = {1} and score = {2}".format( 66 | "SDF", criterion, score 67 | ) 68 | 69 | score = accuracy_score(clf.predict(iris.data), clf.predict_proba(iris.data).argmax(1)) 70 | assert score == 1.0, "Failed partial_fit with {0}, criterion = {1} and score = {2}".format( 71 | "SDF", criterion, score 72 | ) 73 | 74 | 75 | @pytest.mark.parametrize("criterion", ["gini", "entropy"]) 76 | @pytest.mark.parametrize("max_features", [None, 2]) 77 | def test_iris_multi(criterion, max_features): 78 | # Check consistency on dataset iris. 
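    # (editorial note) build a second, disjoint set of labels so that ``y``
    # below becomes a two-column, multi-output target of shape (n_samples, 2)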
79 | clf = StreamDecisionForest( 80 | criterion=criterion, 81 | random_state=0, 82 | max_features=max_features, 83 | n_estimators=10, 84 | ) 85 | 86 | second_y = np.concatenate([(np.ones(50) * 3), (np.ones(50) * 4), (np.ones(50) * 5)]) 87 | 88 | X = iris.data 89 | y = np.stack((iris.target, second_y[perm])).T 90 | 91 | clf.fit(X, y) 92 | score = r2_score(clf.predict(X), y) 93 | assert score > 0.9 and score <= 1.0, "Failed with {0}, criterion = {1} and score = {2}".format( 94 | "SDF", criterion, score 95 | ) 96 | 97 | 98 | def test_max_samples(): 99 | max_samples_list = [8, 0.5, None] 100 | depths = [] 101 | X = rng.normal(0, 1, (100, 2)) 102 | X[:50] *= -1 103 | y = [0, 1] * 50 104 | for ms in max_samples_list: 105 | uf = StreamDecisionForest(n_estimators=2, random_state=0, max_samples=ms, bootstrap=True) 106 | uf = uf.fit(X, y) 107 | depths.append(uf.estimators_[0].get_depth()) 108 | 109 | assert all(np.diff(depths) > 0) 110 | 111 | 112 | @parametrize_with_checks([StreamDecisionForest(n_estimators=10, random_state=0)]) 113 | def test_sklearn_compatible_estimator(estimator, check): 114 | # 1. check_class_weight_classifiers is not supported since it requires sample weight 115 | # XXX: can include this "generalization" in the future if it's useful 116 | if check.func.__name__ in [ 117 | "check_class_weight_classifiers", 118 | "check_sample_weight_equivalence", 119 | "check_sample_weight_equivalence_on_dense_data", 120 | "check_sample_weight_equivalence_on_sparse_data", 121 | ]: 122 | pytest.skip() 123 | check(estimator) 124 | -------------------------------------------------------------------------------- /examples/sparse_oblique_trees/plot_oblique_random_forest.py: -------------------------------------------------------------------------------- 1 | """ 2 | =============================================================================== 3 | Plot oblique forest and axis-aligned random forest predictions on cc18 datasets 4 | =============================================================================== 5 | 6 | A performance comparison between oblique forest and standard axis- 7 | aligned random forest using three datasets from OpenML benchmarking suites. 8 | 9 | Two of these datasets, namely the 10 | `WDBC <https://www.openml.org/search?type=data&sort=runs&id=1510>`_ 11 | and `Phishing Website <https://www.openml.org/search?type=data&sort=runs&id=4534>`_ 12 | datasets, consist of 31 features, where the former dataset is entirely numeric 13 | and the latter dataset is entirely nominal. The third dataset, dubbed 14 | `cnae-9 <https://www.openml.org/search?type=data&status=active&id=1468>`_, is a 15 | numeric dataset that has a notably large feature space of 857 features. As you 16 | will notice, of these three datasets, the oblique forest outperforms the axis-aligned 17 | random forest on cnae-9 by utilizing its sparse random projection mechanism. All datasets 18 | are subsampled due to computational constraints. 19 | 20 | For an example of using extra-oblique trees/forests in practice on data, see the following 21 | example :ref:`sphx_glr_auto_examples_sparse_oblique_trees_plot_extra_oblique_random_forest.py`.
22 | """ 23 | 24 | from datetime import datetime 25 | 26 | import matplotlib.pyplot as plt 27 | import pandas as pd 28 | import seaborn as sns 29 | from sklearn.datasets import fetch_openml 30 | from sklearn.ensemble import RandomForestClassifier 31 | from sklearn.model_selection import RepeatedKFold, cross_validate 32 | 33 | from treeple import ObliqueRandomForestClassifier 34 | 35 | random_state = 123456 36 | t0 = datetime.now() 37 | data_ids = [4534, 1510, 1468] # openml dataset id 38 | df = pd.DataFrame() 39 | 40 | 41 | def load_cc18(data_id): 42 | df = fetch_openml(data_id=data_id, as_frame=True, parser="pandas") 43 | 44 | # extract the dataset name 45 | d_name = df.details["name"] 46 | 47 | # Subsampling large datasets 48 | if data_id == 1468: 49 | n = 100 50 | else: 51 | n = int(df.frame.shape[0] * 0.8) 52 | 53 | df = df.frame.sample(n, random_state=random_state) 54 | X, y = df.iloc[:, :-1], df.iloc[:, -1] 55 | 56 | return X, y, d_name 57 | 58 | 59 | def get_scores(X, y, d_name, n_cv=5, n_repeats=1, **kwargs): 60 | clfs = [RandomForestClassifier(**kwargs), ObliqueRandomForestClassifier(**kwargs)] 61 | 62 | tmp = [] 63 | 64 | for i, clf in enumerate(clfs): 65 | cv = RepeatedKFold(n_splits=n_cv, n_repeats=n_repeats, random_state=kwargs["random_state"]) 66 | test_score = cross_validate(estimator=clf, X=X, y=y, cv=cv, scoring="accuracy") 67 | 68 | tmp.append( 69 | [ 70 | d_name, 71 | ["RF", "OF"][i], 72 | test_score["test_score"], 73 | test_score["test_score"].mean(), 74 | ] 75 | ) 76 | 77 | df = pd.DataFrame( 78 | tmp, columns=["dataset", "model", "score", "mean"] 79 | ) # dtype=[('model',object), ('score',float), ('mean',float)]) 80 | df = df.explode("score") 81 | df["score"] = df["score"].astype(float) 82 | df.reset_index(inplace=True, drop=True) 83 | 84 | return df 85 | 86 | 87 | params = { 88 | "max_features": None, 89 | "n_estimators": 50, 90 | "max_depth": None, 91 | "random_state": random_state, 92 | "n_cv": 2, 93 | "n_repeats": 1, 94 | } 95 | 96 | for data_id in data_ids: 97 | X, y, d_name = load_cc18(data_id=data_id) 98 | print(f"Loading [{d_name}] dataset..") 99 | tmp = get_scores(X=X, y=y, d_name=d_name, **params) 100 | df = pd.concat([df, tmp]) 101 | 102 | print(f"It took {(datetime.now()-t0).seconds} seconds to run the script") 103 | 104 | # Draw a comparison plot 105 | d_names = df.dataset.unique() 106 | N = d_names.shape[0] 107 | 108 | fig, ax = plt.subplots(1, N) 109 | fig.set_size_inches(6 * N, 6) 110 | 111 | for i, name in enumerate(d_names): 112 | sns.stripplot( 113 | data=df.query(f'dataset == "{name}"'), 114 | x="model", 115 | y="score", 116 | ax=ax[i], 117 | dodge=True, 118 | ) 119 | sns.boxplot( 120 | data=df.query(f'dataset == "{name}"'), 121 | x="model", 122 | y="score", 123 | ax=ax[i], 124 | color="white", 125 | ) 126 | ax[i].set_title(name) 127 | if i != 0: 128 | ax[i].set_ylabel("") 129 | ax[i].set_xlabel("") 130 | -------------------------------------------------------------------------------- /examples/treeple/treeple_tutorial_1_1a_SA98.py: -------------------------------------------------------------------------------- 1 | """ 2 | ================ 3 | Calculating S@98 4 | ================ 5 | """ 6 | 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | import seaborn as sns 10 | from sklearn.metrics import roc_curve 11 | 12 | from treeple.datasets import make_trunk_classification 13 | from treeple.ensemble import HonestForestClassifier 14 | from treeple.stats import build_oob_forest 15 | 16 | sns.set(color_codes=True, style="white", 
context="talk", font_scale=1.5) 17 | PALETTE = sns.color_palette("Set1") 18 | sns.set_palette(PALETTE[1:5] + PALETTE[6:], n_colors=9) 19 | sns.set_style("white", {"axes.edgecolor": "#dddddd"}) 20 | 21 | # %% 22 | # S@98 23 | # ---- 24 | # 25 | # Sensitivity at 98% specificity (*S@98*) measures, namely, the true 26 | # positive rate (*TPR*) when the false positive rate (*FPR*) is at 98%. 27 | # 28 | # .. math:: S@r = \mathbb{P}[\eta(X) > T_r \mid Y=1] 29 | # 30 | # With a binary class simulation as an example, this tutorial will show 31 | # how to use ``treeple`` to calculate the statistic. 32 | 33 | # %% 34 | # Create a simulation with two gaussians 35 | # -------------------------------------- 36 | 37 | 38 | # create a binary class simulation with two gaussians 39 | # 500 samples for each class, class zero is standard 40 | # gaussian, and class one has a mean at one 41 | X, y = make_trunk_classification( 42 | n_samples=1000, 43 | n_dim=1, 44 | mu_0=0, 45 | mu_1=1, 46 | n_informative=1, 47 | seed=1, 48 | ) 49 | 50 | 51 | fig, ax = plt.subplots(figsize=(6, 6)) 52 | fig.tight_layout() 53 | ax.tick_params(labelsize=15) 54 | 55 | # histogram plot the samples 56 | ax.hist(X[:500], bins=50, alpha=0.6, color=PALETTE[1], label="negative") 57 | ax.hist(X[500:], bins=50, alpha=0.3, color=PALETTE[0], label="positive") 58 | ax.set_xlabel("Variable One", fontsize=15) 59 | ax.set_ylabel("Likelihood", fontsize=15) 60 | plt.legend(frameon=False, fontsize=15) 61 | plt.show() 62 | 63 | # %% 64 | # Fit the model 65 | # ------------- 66 | 67 | 68 | # initialize the forest with 100 trees 69 | est = HonestForestClassifier( 70 | n_estimators=100, 71 | max_samples=1.6, 72 | max_features=0.3, 73 | bootstrap=True, 74 | stratify=True, 75 | random_state=1, 76 | ) 77 | 78 | # fit the model and obtain the tree posteriors 79 | _, observe_proba = build_oob_forest(est, X, y) 80 | 81 | # generate forest posteriors for the two classes 82 | observe_proba = np.nanmean(observe_proba, axis=0) 83 | 84 | 85 | fig, ax = plt.subplots(figsize=(6, 6)) 86 | fig.tight_layout() 87 | ax.tick_params(labelsize=15) 88 | 89 | # histogram plot the posterior probabilities for class one 90 | ax.hist(observe_proba[:500][:, 1], bins=50, alpha=0.6, color=PALETTE[1], label="negative") 91 | ax.hist(observe_proba[500:][:, 1], bins=50, alpha=0.3, color=PALETTE[0], label="positive") 92 | ax.set_ylabel("# of Samples", fontsize=15) 93 | ax.set_xlabel("Class One Posterior", fontsize=15) 94 | plt.legend(frameon=False, fontsize=15) 95 | plt.show() 96 | 97 | # %% 98 | # Calculate the statistic 99 | # ----------------------- 100 | 101 | 102 | def Calculate_SA(y_true, y_pred_proba, max_fpr=0.02) -> float: 103 | """Calculate the sensitivity at a specific specificity""" 104 | # check the shape of true labels 105 | if y_true.squeeze().ndim != 1: 106 | raise ValueError(f"y_true must be 1d, not {y_true.shape}") 107 | 108 | # find the positive class and calculate fpr and tpr 109 | if 0 in y_true or -1 in y_true: 110 | fpr, tpr, thresholds = roc_curve( 111 | y_true, y_pred_proba[:, 1], pos_label=1, drop_intermediate=False 112 | ) 113 | else: 114 | fpr, tpr, thresholds = roc_curve( 115 | y_true, y_pred_proba[:, 1], pos_label=2, drop_intermediate=False 116 | ) 117 | sa98 = max([tpr for (fpr, tpr) in zip(fpr, tpr) if fpr <= max_fpr]) 118 | 119 | fig, ax = plt.subplots(figsize=(6, 6)) 120 | fig.tight_layout() 121 | ax.tick_params(labelsize=15) 122 | ax.set_xlim([-0.005, 1.005]) 123 | ax.set_ylim([-0.005, 1.005]) 124 | ax.set_xlabel("False Positive Rate", fontsize=15) 125 
| ax.set_ylabel("True Positive Rate", fontsize=15) 126 | 127 | ax.plot(fpr, tpr, label="ROC curve", color=PALETTE[1]) 128 | 129 | spec = int((1 - max_fpr) * 100) 130 | ax.axvline( 131 | x=max_fpr, 132 | color=PALETTE[0], 133 | ymin=0, 134 | ymax=sa98, 135 | label="S@" + str(spec) + " = " + str(round(sa98, 2)), 136 | linestyle="--", 137 | ) 138 | ax.axhline(y=sa98, xmin=0, xmax=max_fpr, color="r", linestyle="--") 139 | ax.legend(frameon=False, fontsize=15) 140 | 141 | return sa98 142 | 143 | 144 | sa98 = Calculate_SA(y, observe_proba, max_fpr=0.02) 145 | print("S@98 =", round(sa98, 2)) 146 | # sphinx_gallery_thumbnail_number = -1 147 | -------------------------------------------------------------------------------- /examples/calibration/plot_honest_tree.py: -------------------------------------------------------------------------------- 1 | """ 2 | =========================================== 3 | Comparison of Decision Tree and Honest Tree 4 | =========================================== 5 | 6 | This example compares the :class:`treeple.tree.HonestTreeClassifier` from the 7 | ``treeple`` library with the :class:`sklearn.tree.DecisionTreeClassifier` 8 | from scikit-learn on the Iris dataset. 9 | 10 | Both classifiers are fitted on the same dataset and their decision trees 11 | are plotted side by side. 12 | """ 13 | 14 | import matplotlib.pyplot as plt 15 | from sklearn import config_context 16 | from sklearn.datasets import load_iris 17 | from sklearn.model_selection import train_test_split 18 | from sklearn.tree import DecisionTreeClassifier, plot_tree 19 | 20 | from treeple.tree import HonestTreeClassifier 21 | 22 | # Load the iris dataset 23 | iris = load_iris() 24 | X, y = iris.data, iris.target 25 | X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=0) 26 | 27 | # Initialize classifiers 28 | max_features = 0.3 29 | 30 | dishonest_clf = HonestTreeClassifier( 31 | honest_method=None, 32 | max_features=max_features, 33 | random_state=0, 34 | honest_prior="ignore", 35 | ) 36 | honest_noprune_clf = HonestTreeClassifier( 37 | honest_method="apply", 38 | max_features=max_features, 39 | random_state=0, 40 | honest_prior="ignore", 41 | ) 42 | honest_clf = HonestTreeClassifier(honest_method="prune", max_features=max_features, random_state=0) 43 | sklearn_clf = DecisionTreeClassifier(max_features=max_features, random_state=0) 44 | 45 | # Fit classifiers 46 | dishonest_clf.fit(X_train, y_train) 47 | honest_noprune_clf.fit(X_train, y_train) 48 | honest_clf.fit(X_train, y_train) 49 | sklearn_clf.fit(X_train, y_train) 50 | 51 | # Plotting the trees 52 | fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(15, 5)) 53 | 54 | # .. note:: We skip parameter validation because internally the `plot_tree` 55 | # function checks if the estimator is a DecisionTreeClassifier 56 | # instance from scikit-learn, but the ``HonestTreeClassifier`` is 57 | # a subclass of a forked version of the DecisionTreeClassifier. 
 58 | 59 | # Plot the pruned HonestTreeClassifier 60 | ax = axes[2] 61 | with config_context(skip_parameter_validation=True): 62 | plot_tree(honest_clf, filled=True, ax=ax) 63 | ax.set_title("HonestTreeClassifier") 64 | 65 | # Plot the unpruned HonestTreeClassifier 66 | ax = axes[1] 67 | with config_context(skip_parameter_validation=True): 68 | plot_tree(honest_noprune_clf, filled=False, ax=ax) 69 | ax.set_title("HonestTreeClassifier (No pruning)") 70 | 71 | # Plot the dishonest HonestTreeClassifier 72 | ax = axes[0] 73 | with config_context(skip_parameter_validation=True): 74 | plot_tree(dishonest_clf, filled=False, ax=ax) 75 | ax.set_title("HonestTreeClassifier (Dishonest)") 76 | 77 | 78 | # Plot scikit-learn DecisionTreeClassifier tree 79 | plot_tree(sklearn_clf, filled=True, ax=axes[3]) 80 | axes[3].set_title("DecisionTreeClassifier") 81 | 82 | plt.show() 83 | 84 | # %% 85 | # Discussion 86 | # ---------- 87 | # The HonestTreeClassifier is a variant of the DecisionTreeClassifier that 88 | # provides honest inference. The honest inference is achieved by splitting the 89 | # dataset into two parts: the training set and the validation set. The training 90 | # set is used to build the tree, while the validation set is used to fit the 91 | # leaf nodes for posterior prediction. This results in calibrated posteriors 92 | # (see :ref:`sphx_glr_auto_examples_calibration_plot_overlapping_gaussians.py`). 93 | # 94 | # Compared to the ``honest_method='apply'`` setting, the ``honest_method='prune'`` 95 | # setting builds a tree that will not contain empty leaves, and also leverages 96 | # the validation set to check split conditions. Thus we see that the pruned 97 | # honest tree is significantly smaller than the regular decision tree. 98 | 99 | # %% 100 | # Evaluate predictions of the trees 101 | # --------------------------------- 102 | # When we do not prune, note that the honest tree will have empty leaves 103 | # that predict the prior. In this case, ``honest_prior='ignore'`` is used 104 | # to ignore these leaves when computing the posteriors, which will result 105 | # in a posterior that is ``np.nan``. 106 | 107 | # this is the same as a decision tree classifier that is trained on less data 108 | print("\nDishonest posteriors: ", dishonest_clf.predict_proba(X_val)) 109 | 110 | # this is the honest tree without pruning, whose empty leaves yield ``np.nan`` posteriors 111 | print("\nHonest tree without pruning: ", honest_noprune_clf.predict_proba(X_val)) 112 | 113 | # this is the honest tree that is pruned 114 | print("\nHonest tree with pruning: ", honest_clf.predict_proba(X_val)) 115 | 116 | # this is a regular decision tree classifier from sklearn 117 | print("\nDTC: ", sklearn_clf.predict_proba(X_val)) 118 | --------------------------------------------------------------------------------