├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── NOTICE ├── README.md ├── bayesian_decision_tree ├── __init__.py ├── _version.py ├── base.py ├── base_hyperplane.py ├── base_perpendicular.py ├── classification.py ├── hyperplane_optimization.py ├── regression.py └── utils.py ├── conda.recipe ├── conda_build_config.yaml └── meta.yaml ├── examples ├── __init__.py ├── demo_classification_hyperplane.py ├── demo_classification_perpendicular.py ├── demo_classification_trading.py ├── demo_regression_hyperplane.py ├── demo_regression_perpendicular.py └── helper.py ├── mypy.ini ├── setup.cfg ├── setup.py ├── tests ├── __init__.py └── unit │ ├── __init__.py │ ├── helper.py │ ├── test_classification.py │ ├── test_regression.py │ └── test_utils.py └── versioneer.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # OSX useful to ignore 7 | *.DS_Store 8 | .AppleDouble 9 | .LSOverride 10 | 11 | # Thumbnails 12 | ._* 13 | 14 | # Files that might appear in the root of a volume 15 | .DocumentRevisions-V100 16 | .fseventsd 17 | .Spotlight-V100 18 | .TemporaryItems 19 | .Trashes 20 | .VolumeIcon.icns 21 | .com.apple.timemachine.donotpresent 22 | 23 | # Directories potentially created on remote AFP share 24 | .AppleDB 25 | .AppleDesktop 26 | Network Trash Folder 27 | Temporary Items 28 | .apdisk 29 | 30 | # C extensions 31 | *.so 32 | 33 | # Distribution / packaging 34 | .Python 35 | env/ 36 | build/ 37 | develop-eggs/ 38 | dist/ 39 | downloads/ 40 | eggs/ 41 | .eggs/ 42 | lib/ 43 | lib64/ 44 | parts/ 45 | sdist/ 46 | var/ 47 | *.egg-info/ 48 | .installed.cfg 49 | *.egg 50 | 51 | # PyInstaller 52 | # Usually these files are written by a python script from a template 53 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 54 | *.manifest 55 | *.spec 56 | 57 | # Installer logs 58 | pip-log.txt 59 | pip-delete-this-directory.txt 60 | 61 | # Unit test / coverage reports 62 | htmlcov/ 63 | .tox/ 64 | .coverage 65 | .coverage.* 66 | .cache 67 | nosetests.xml 68 | coverage.xml 69 | *,cover 70 | .hypothesis/ 71 | 72 | # Translations 73 | *.mo 74 | *.pot 75 | 76 | # Django stuff: 77 | *.log 78 | 79 | # Sphinx documentation 80 | docs/_build/ 81 | 82 | # IntelliJ Idea family of suites 83 | .idea 84 | *.iml 85 | ## File-based project format: 86 | *.ipr 87 | *.iws 88 | ## mpeltonen/sbt-idea plugin 89 | .idea_modules/ 90 | 91 | # PyBuilder 92 | target/ 93 | 94 | # Cookiecutter 95 | output/ 96 | python_boilerplate/ 97 | 98 | .idea 99 | target 100 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/) 5 | and this project adheres to [Semantic Versioning](https://semver.org/). 
6 | 7 | ## [0.7] - 2020-06-21 8 | ### Changed 9 | - Improve prediction performance 10 | - Improve training performance and memory use 11 | 12 | ## [0.6] - 2020-05-28 13 | ### Added 14 | - New method `_get_raw_leaf_data()` to access leaf internals 15 | 16 | ## [0.5] - 2020-05-26 17 | ### Changed 18 | - Removed references to `SparseDataFrame` (deprecated) 19 | 20 | ### Fixed 21 | - Bug in regression code 22 | 23 | ## [0.4] - 2020-03-09 24 | ### Changed 25 | - Better hyperplane tree plotting 26 | 27 | ### Fixed 28 | - Bug in hyperplane trees (tried to access uninitialized field) 29 | 30 | ## [0.3] - 2020-02-26 31 | ### Added 32 | - Improved scikit-learn compatibility further 33 | 34 | ### Fixed 35 | - Bug in `model.feature_importance()` computation 36 | 37 | ## [0.2] - 2019-09-02 38 | ### Added 39 | - Experimental support for arbitrarily-oriented hyperplane splits rather than axis-perpendicular ones only 40 | - Experimental support for sparse DataFrames and sparse matrices (scipy.sparse) for fitting and prediction 41 | - Added `model.feature_importance()` for feature selection 42 | - All models are now compatible with scikit-learn 43 | 44 | ### Changed 45 | - Lots of small changes here and there 46 | 47 | ## [0.1] - 2019-02-06 48 | ### Added 49 | - First release 50 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship.
For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2019 UBS Limited 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
-------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Bayesian Tree 2 | Copyright 2018-2019 UBS AG 3 | 4 | This product includes software developed at 5 | UBS AG (https://www.ubs.com) 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A Bayesian Decision Tree Algorithm 2 | This is an implementation of the paper: [A Bayesian Decision Tree Algorithm](https://arxiv.org/abs/1901.03214) by Nuti et al. 3 | 4 | ## Feature Support 5 | 6 | This package implements: 7 | * Classification (binary and multiclass) 8 | * Regression 9 | * Both models are available in two versions respectively: 10 | * **Perpendicular Trees**: 11 | The classic decision/regression tree structure with splits along a single 12 | feature dimension (i.e., _perpendicular_ to a feature dimension axis), 13 | analogous to e.g. the scikit-learn 14 | [decision](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html) 15 | and 16 | [regression](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html) 17 | trees. 18 | 19 | The models are called 20 | [`PerpendicularClassificationTree`](bayesian_decision_tree/classification.py) 21 | and 22 | [`PerpendicularRegressionTree`](bayesian_decision_tree/regression.py). 23 | 24 | * **Hyperplane Trees**: 25 | Decision/regression trees using _arbitrarily-oriented hyperplanes_. These models 26 | are more flexible than perpendicular trees as they cover a much larger search 27 | space to naturally make use of correlations between features. 28 | 29 | All else equal, hyperplane trees typically lead to shallower trees with fewer 30 | leaf nodes compared to their perpendicular counterparts because they can employ 31 | more than just a single feature dimension per split. This can lead to less 32 | overfitting and better generalization performance, but no such guarantees exist 33 | because hyperplane trees are still being constructed in a greedy manner. 34 | 35 | Note that hyperplane trees take much longer to train and need to be trained 36 | stochastically using global optimizers due to the exponentially large search 37 | space. 38 | 39 | The models are called 40 | [`HyperplaneClassificationTree`](bayesian_decision_tree/classification.py) 41 | and 42 | [`HyperplaneRegressionTree`](bayesian_decision_tree/regression.py). 43 | 44 | ## Installation 45 | 46 | To install you can either use _conda_ or _pip_: 47 | 48 | #### Conda 49 | ``` 50 | git clone https://github.com/UBS-IB/bayesian_tree 51 | cd bayesian_tree 52 | conda build conda.recipe 53 | conda install --use-local bayesian_decision_tree 54 | ``` 55 | 56 | #### PIP 57 | ``` 58 | git clone https://github.com/UBS-IB/bayesian_tree 59 | cd bayesian_tree 60 | pip install -e . 61 | ``` 62 | 63 | ## Usage 64 | 65 | We include some examples for various uses in the [examples](examples) directory. 66 | The models are fully compatible with scikit-learn, so you can use them for e.g. 67 | cross-validation or performance evaluation using scikit-learn functions. 
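For orientation, here is a minimal quick-start sketch in the usual scikit-learn style. The constructor arguments shown (`partition_prior`, `prior`, `delta`, `prune`) mirror the fields of the `BaseTree` base class further below; the exact public signatures and sensible prior choices are what the scripts in [examples](examples) demonstrate, so treat the values here as illustrative placeholders rather than recommendations.

```python
# Hypothetical quick-start sketch; see examples/ for the maintained demos.
# The constructor arguments below are assumed from the BaseTree fields and
# may not match the exact public signatures.
import numpy as np
from sklearn.model_selection import train_test_split

from bayesian_decision_tree.classification import PerpendicularClassificationTree

# toy binary classification problem
np.random.seed(42)
X = np.random.uniform(size=(1000, 2))
y = (X[:, 0] + X[:, 1] > 1.0).astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

model = PerpendicularClassificationTree(
    partition_prior=0.9,         # prior probability of splitting a node further
    prior=np.array([1.0, 1.0]),  # Beta/Dirichlet prior over the class counts
    delta=0,                     # see the paper for the role of delta
    prune=True)                  # collapse splits whose children predict the same class
model.fit(X_train, y_train)

print(model)                     # prints the fitted tree structure
print('Depth:   ', model.get_depth())
print('Leaves:  ', model.get_n_leaves())
print('Accuracy:', (model.predict(X_test) == y_test).mean())
```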
68 | 69 | ## TODO 70 | - Add parallelization option (dask) 71 | -------------------------------------------------------------------------------- /bayesian_decision_tree/__init__.py: -------------------------------------------------------------------------------- 1 | from ._version import get_versions 2 | __version__ = get_versions()['version'] 3 | del get_versions 4 | -------------------------------------------------------------------------------- /bayesian_decision_tree/_version.py: -------------------------------------------------------------------------------- 1 | 2 | # This file helps to compute a version number in source trees obtained from 3 | # git-archive tarball (such as those provided by githubs download-from-tag 4 | # feature). Distribution tarballs (built by setup.py sdist) and build 5 | # directories (produced by setup.py build) will contain a much shorter file 6 | # that just contains the computed version number. 7 | 8 | # This file is released into the public domain. Generated by 9 | # versioneer-0.18 (https://github.com/warner/python-versioneer) 10 | 11 | """Git implementation of _version.py.""" 12 | 13 | import errno 14 | import os 15 | import re 16 | import subprocess 17 | import sys 18 | 19 | 20 | def get_keywords(): 21 | """Get the keywords needed to look up the version information.""" 22 | # these strings will be replaced by git during git-archive. 23 | # setup.py/versioneer.py will grep for the variable names, so they must 24 | # each be defined on a line of their own. _version.py will just call 25 | # get_keywords(). 26 | git_refnames = "$Format:%d$" 27 | git_full = "$Format:%H$" 28 | git_date = "$Format:%ci$" 29 | keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} 30 | return keywords 31 | 32 | 33 | class VersioneerConfig: 34 | """Container for Versioneer configuration parameters.""" 35 | 36 | 37 | def get_config(): 38 | """Create, populate and return the VersioneerConfig() object.""" 39 | # these strings are filled in when 'setup.py versioneer' creates 40 | # _version.py 41 | cfg = VersioneerConfig() 42 | cfg.VCS = "git" 43 | cfg.style = "" 44 | cfg.tag_prefix = "" 45 | cfg.parentdir_prefix = "hadoop-utils-" 46 | cfg.versionfile_source = "hadoop_utils/_version.py" 47 | cfg.verbose = False 48 | return cfg 49 | 50 | 51 | class NotThisMethod(Exception): 52 | """Exception raised if a method is not valid for the current scenario.""" 53 | 54 | 55 | LONG_VERSION_PY = {} 56 | HANDLERS = {} 57 | 58 | 59 | def register_vcs_handler(vcs, method): # decorator 60 | """Decorator to mark a method as the handler for a particular VCS.""" 61 | def decorate(f): 62 | """Store f in HANDLERS[vcs][method].""" 63 | if vcs not in HANDLERS: 64 | HANDLERS[vcs] = {} 65 | HANDLERS[vcs][method] = f 66 | return f 67 | return decorate 68 | 69 | 70 | def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, 71 | env=None): 72 | """Call the given command(s).""" 73 | assert isinstance(commands, list) 74 | p = None 75 | for c in commands: 76 | try: 77 | dispcmd = str([c] + args) 78 | # remember shell=False, so use git.cmd on windows, not just git 79 | p = subprocess.Popen([c] + args, cwd=cwd, env=env, 80 | stdout=subprocess.PIPE, 81 | stderr=(subprocess.PIPE if hide_stderr 82 | else None)) 83 | break 84 | except EnvironmentError: 85 | e = sys.exc_info()[1] 86 | if e.errno == errno.ENOENT: 87 | continue 88 | if verbose: 89 | print("unable to run %s" % dispcmd) 90 | print(e) 91 | return None, None 92 | else: 93 | if verbose: 94 | print("unable to find command, 
tried %s" % (commands,)) 95 | return None, None 96 | stdout = p.communicate()[0].strip() 97 | if sys.version_info[0] >= 3: 98 | stdout = stdout.decode() 99 | if p.returncode != 0: 100 | if verbose: 101 | print("unable to run %s (error)" % dispcmd) 102 | print("stdout was %s" % stdout) 103 | return None, p.returncode 104 | return stdout, p.returncode 105 | 106 | 107 | def versions_from_parentdir(parentdir_prefix, root, verbose): 108 | """Try to determine the version from the parent directory name. 109 | 110 | Source tarballs conventionally unpack into a directory that includes both 111 | the project name and a version string. We will also support searching up 112 | two directory levels for an appropriately named parent directory 113 | """ 114 | rootdirs = [] 115 | 116 | for i in range(3): 117 | dirname = os.path.basename(root) 118 | if dirname.startswith(parentdir_prefix): 119 | return {"version": dirname[len(parentdir_prefix):], 120 | "full-revisionid": None, 121 | "dirty": False, "error": None, "date": None} 122 | else: 123 | rootdirs.append(root) 124 | root = os.path.dirname(root) # up a level 125 | 126 | if verbose: 127 | print("Tried directories %s but none started with prefix %s" % 128 | (str(rootdirs), parentdir_prefix)) 129 | raise NotThisMethod("rootdir doesn't start with parentdir_prefix") 130 | 131 | 132 | @register_vcs_handler("git", "get_keywords") 133 | def git_get_keywords(versionfile_abs): 134 | """Extract version information from the given file.""" 135 | # the code embedded in _version.py can just fetch the value of these 136 | # keywords. When used from setup.py, we don't want to import _version.py, 137 | # so we do it with a regexp instead. This function is not used from 138 | # _version.py. 139 | keywords = {} 140 | try: 141 | f = open(versionfile_abs, "r") 142 | for line in f.readlines(): 143 | if line.strip().startswith("git_refnames ="): 144 | mo = re.search(r'=\s*"(.*)"', line) 145 | if mo: 146 | keywords["refnames"] = mo.group(1) 147 | if line.strip().startswith("git_full ="): 148 | mo = re.search(r'=\s*"(.*)"', line) 149 | if mo: 150 | keywords["full"] = mo.group(1) 151 | if line.strip().startswith("git_date ="): 152 | mo = re.search(r'=\s*"(.*)"', line) 153 | if mo: 154 | keywords["date"] = mo.group(1) 155 | f.close() 156 | except EnvironmentError: 157 | pass 158 | return keywords 159 | 160 | 161 | @register_vcs_handler("git", "keywords") 162 | def git_versions_from_keywords(keywords, tag_prefix, verbose): 163 | """Get version information from git keywords.""" 164 | if not keywords: 165 | raise NotThisMethod("no keywords at all, weird") 166 | date = keywords.get("date") 167 | if date is not None: 168 | # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant 169 | # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 170 | # -like" string, which we must then edit to make compliant), because 171 | # it's been around since git-1.5.3, and it's too difficult to 172 | # discover which version we're using, or to work around using an 173 | # older one. 174 | date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) 175 | refnames = keywords["refnames"].strip() 176 | if refnames.startswith("$Format"): 177 | if verbose: 178 | print("keywords are unexpanded, not using") 179 | raise NotThisMethod("unexpanded keywords, not a git-archive tarball") 180 | refs = set([r.strip() for r in refnames.strip("()").split(",")]) 181 | # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of 182 | # just "foo-1.0". 
If we see a "tag: " prefix, prefer those. 183 | TAG = "tag: " 184 | tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) 185 | if not tags: 186 | # Either we're using git < 1.8.3, or there really are no tags. We use 187 | # a heuristic: assume all version tags have a digit. The old git %d 188 | # expansion behaves like git log --decorate=short and strips out the 189 | # refs/heads/ and refs/tags/ prefixes that would let us distinguish 190 | # between branches and tags. By ignoring refnames without digits, we 191 | # filter out many common branch names like "release" and 192 | # "stabilization", as well as "HEAD" and "master". 193 | tags = set([r for r in refs if re.search(r'\d', r)]) 194 | if verbose: 195 | print("discarding '%s', no digits" % ",".join(refs - tags)) 196 | if verbose: 197 | print("likely tags: %s" % ",".join(sorted(tags))) 198 | for ref in sorted(tags): 199 | # sorting will prefer e.g. "2.0" over "2.0rc1" 200 | if ref.startswith(tag_prefix): 201 | r = ref[len(tag_prefix):] 202 | if verbose: 203 | print("picking %s" % r) 204 | return {"version": r, 205 | "full-revisionid": keywords["full"].strip(), 206 | "dirty": False, "error": None, 207 | "date": date} 208 | # no suitable tags, so version is "0+unknown", but full hex is still there 209 | if verbose: 210 | print("no suitable tags, using unknown + full revision id") 211 | return {"version": "0+unknown", 212 | "full-revisionid": keywords["full"].strip(), 213 | "dirty": False, "error": "no suitable tags", "date": None} 214 | 215 | 216 | @register_vcs_handler("git", "pieces_from_vcs") 217 | def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): 218 | """Get version from 'git describe' in the root of the source tree. 219 | 220 | This only gets called if the git-archive 'subst' keywords were *not* 221 | expanded, and _version.py hasn't already been rewritten with a short 222 | version string, meaning we're inside a checked out source tree. 223 | """ 224 | GITS = ["git"] 225 | if sys.platform == "win32": 226 | GITS = ["git.cmd", "git.exe"] 227 | 228 | out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, 229 | hide_stderr=True) 230 | if rc != 0: 231 | if verbose: 232 | print("Directory %s not under git control" % root) 233 | raise NotThisMethod("'git rev-parse --git-dir' returned error") 234 | 235 | # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] 236 | # if there isn't one, this yields HEX[-dirty] (no NUM) 237 | describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", 238 | "--always", "--long", 239 | "--match", "%s*" % tag_prefix], 240 | cwd=root) 241 | # --long was added in git-1.5.5 242 | if describe_out is None: 243 | raise NotThisMethod("'git describe' failed") 244 | describe_out = describe_out.strip() 245 | full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) 246 | if full_out is None: 247 | raise NotThisMethod("'git rev-parse' failed") 248 | full_out = full_out.strip() 249 | 250 | pieces = {} 251 | pieces["long"] = full_out 252 | pieces["short"] = full_out[:7] # maybe improved later 253 | pieces["error"] = None 254 | 255 | # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] 256 | # TAG might have hyphens. 
257 | git_describe = describe_out 258 | 259 | # look for -dirty suffix 260 | dirty = git_describe.endswith("-dirty") 261 | pieces["dirty"] = dirty 262 | if dirty: 263 | git_describe = git_describe[:git_describe.rindex("-dirty")] 264 | 265 | # now we have TAG-NUM-gHEX or HEX 266 | 267 | if "-" in git_describe: 268 | # TAG-NUM-gHEX 269 | mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) 270 | if not mo: 271 | # unparseable. Maybe git-describe is misbehaving? 272 | pieces["error"] = ("unable to parse git-describe output: '%s'" 273 | % describe_out) 274 | return pieces 275 | 276 | # tag 277 | full_tag = mo.group(1) 278 | if not full_tag.startswith(tag_prefix): 279 | if verbose: 280 | fmt = "tag '%s' doesn't start with prefix '%s'" 281 | print(fmt % (full_tag, tag_prefix)) 282 | pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" 283 | % (full_tag, tag_prefix)) 284 | return pieces 285 | pieces["closest-tag"] = full_tag[len(tag_prefix):] 286 | 287 | # distance: number of commits since tag 288 | pieces["distance"] = int(mo.group(2)) 289 | 290 | # commit: short hex revision ID 291 | pieces["short"] = mo.group(3) 292 | 293 | else: 294 | # HEX: no tags 295 | pieces["closest-tag"] = None 296 | count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], 297 | cwd=root) 298 | pieces["distance"] = int(count_out) # total number of commits 299 | 300 | # commit date: see ISO-8601 comment in git_versions_from_keywords() 301 | date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], 302 | cwd=root)[0].strip() 303 | pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) 304 | 305 | return pieces 306 | 307 | 308 | def plus_or_dot(pieces): 309 | """Return a + if we don't already have one, else return a .""" 310 | if "+" in pieces.get("closest-tag", ""): 311 | return "." 312 | return "+" 313 | 314 | 315 | def render_pep440(pieces): 316 | """Build up version string, with post-release "local version identifier". 317 | 318 | Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you 319 | get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty 320 | 321 | Exceptions: 322 | 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] 323 | """ 324 | if pieces["closest-tag"]: 325 | rendered = pieces["closest-tag"] 326 | if pieces["distance"] or pieces["dirty"]: 327 | rendered += plus_or_dot(pieces) 328 | rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) 329 | if pieces["dirty"]: 330 | rendered += ".dirty" 331 | else: 332 | # exception #1 333 | rendered = "0+untagged.%d.g%s" % (pieces["distance"], 334 | pieces["short"]) 335 | if pieces["dirty"]: 336 | rendered += ".dirty" 337 | return rendered 338 | 339 | 340 | def render_pep440_pre(pieces): 341 | """TAG[.post.devDISTANCE] -- No -dirty. 342 | 343 | Exceptions: 344 | 1: no tags. 0.post.devDISTANCE 345 | """ 346 | if pieces["closest-tag"]: 347 | rendered = pieces["closest-tag"] 348 | if pieces["distance"]: 349 | rendered += ".post.dev%d" % pieces["distance"] 350 | else: 351 | # exception #1 352 | rendered = "0.post.dev%d" % pieces["distance"] 353 | return rendered 354 | 355 | 356 | def render_pep440_post(pieces): 357 | """TAG[.postDISTANCE[.dev0]+gHEX] . 358 | 359 | The ".dev0" means dirty. Note that .dev0 sorts backwards 360 | (a dirty tree will appear "older" than the corresponding clean one), 361 | but you shouldn't be releasing software with -dirty anyways. 362 | 363 | Exceptions: 364 | 1: no tags. 
0.postDISTANCE[.dev0] 365 | """ 366 | if pieces["closest-tag"]: 367 | rendered = pieces["closest-tag"] 368 | if pieces["distance"] or pieces["dirty"]: 369 | rendered += ".post%d" % pieces["distance"] 370 | if pieces["dirty"]: 371 | rendered += ".dev0" 372 | rendered += plus_or_dot(pieces) 373 | rendered += "g%s" % pieces["short"] 374 | else: 375 | # exception #1 376 | rendered = "0.post%d" % pieces["distance"] 377 | if pieces["dirty"]: 378 | rendered += ".dev0" 379 | rendered += "+g%s" % pieces["short"] 380 | return rendered 381 | 382 | 383 | def render_pep440_old(pieces): 384 | """TAG[.postDISTANCE[.dev0]] . 385 | 386 | The ".dev0" means dirty. 387 | 388 | Eexceptions: 389 | 1: no tags. 0.postDISTANCE[.dev0] 390 | """ 391 | if pieces["closest-tag"]: 392 | rendered = pieces["closest-tag"] 393 | if pieces["distance"] or pieces["dirty"]: 394 | rendered += ".post%d" % pieces["distance"] 395 | if pieces["dirty"]: 396 | rendered += ".dev0" 397 | else: 398 | # exception #1 399 | rendered = "0.post%d" % pieces["distance"] 400 | if pieces["dirty"]: 401 | rendered += ".dev0" 402 | return rendered 403 | 404 | 405 | def render_git_describe(pieces): 406 | """TAG[-DISTANCE-gHEX][-dirty]. 407 | 408 | Like 'git describe --tags --dirty --always'. 409 | 410 | Exceptions: 411 | 1: no tags. HEX[-dirty] (note: no 'g' prefix) 412 | """ 413 | if pieces["closest-tag"]: 414 | rendered = pieces["closest-tag"] 415 | if pieces["distance"]: 416 | rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) 417 | else: 418 | # exception #1 419 | rendered = pieces["short"] 420 | if pieces["dirty"]: 421 | rendered += "-dirty" 422 | return rendered 423 | 424 | 425 | def render_git_describe_long(pieces): 426 | """TAG-DISTANCE-gHEX[-dirty]. 427 | 428 | Like 'git describe --tags --dirty --always -long'. 429 | The distance/hash is unconditional. 430 | 431 | Exceptions: 432 | 1: no tags. HEX[-dirty] (note: no 'g' prefix) 433 | """ 434 | if pieces["closest-tag"]: 435 | rendered = pieces["closest-tag"] 436 | rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) 437 | else: 438 | # exception #1 439 | rendered = pieces["short"] 440 | if pieces["dirty"]: 441 | rendered += "-dirty" 442 | return rendered 443 | 444 | 445 | def render(pieces, style): 446 | """Render the given version pieces into the requested style.""" 447 | if pieces["error"]: 448 | return {"version": "unknown", 449 | "full-revisionid": pieces.get("long"), 450 | "dirty": None, 451 | "error": pieces["error"], 452 | "date": None} 453 | 454 | if not style or style == "default": 455 | style = "pep440" # the default 456 | 457 | if style == "pep440": 458 | rendered = render_pep440(pieces) 459 | elif style == "pep440-pre": 460 | rendered = render_pep440_pre(pieces) 461 | elif style == "pep440-post": 462 | rendered = render_pep440_post(pieces) 463 | elif style == "pep440-old": 464 | rendered = render_pep440_old(pieces) 465 | elif style == "git-describe": 466 | rendered = render_git_describe(pieces) 467 | elif style == "git-describe-long": 468 | rendered = render_git_describe_long(pieces) 469 | else: 470 | raise ValueError("unknown style '%s'" % style) 471 | 472 | return {"version": rendered, "full-revisionid": pieces["long"], 473 | "dirty": pieces["dirty"], "error": None, 474 | "date": pieces.get("date")} 475 | 476 | 477 | def get_versions(): 478 | """Get version information or return default if unable to do so.""" 479 | # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have 480 | # __file__, we can work backwards from there to the root. 
Some 481 | # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which 482 | # case we can only use expanded keywords. 483 | 484 | cfg = get_config() 485 | verbose = cfg.verbose 486 | 487 | try: 488 | return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, 489 | verbose) 490 | except NotThisMethod: 491 | pass 492 | 493 | try: 494 | root = os.path.realpath(__file__) 495 | # versionfile_source is the relative path from the top of the source 496 | # tree (where the .git directory might live) to this file. Invert 497 | # this to find the root from __file__. 498 | for i in cfg.versionfile_source.split('/'): 499 | root = os.path.dirname(root) 500 | except NameError: 501 | return {"version": "0+unknown", "full-revisionid": None, 502 | "dirty": None, 503 | "error": "unable to find root of source tree", 504 | "date": None} 505 | 506 | try: 507 | pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) 508 | return render(pieces, cfg.style) 509 | except NotThisMethod: 510 | pass 511 | 512 | try: 513 | if cfg.parentdir_prefix: 514 | return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) 515 | except NotThisMethod: 516 | pass 517 | 518 | return {"version": "0+unknown", "full-revisionid": None, 519 | "dirty": None, 520 | "error": "unable to compute version", "date": None} 521 | -------------------------------------------------------------------------------- /bayesian_decision_tree/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from scipy.sparse import csr_matrix, csc_matrix 6 | from sklearn.base import BaseEstimator 7 | 8 | 9 | class BaseTree(ABC, BaseEstimator): 10 | """ 11 | Abstract base class of all Bayesian decision tree models (classification and regression). Performs all 12 | high-level fitting and prediction tasks and outsources the medium- and low-level work to subclasses. 13 | 14 | Implementation note: This class hierarchy is diamond-shaped: The four concrete model classes each 15 | inherit from two superclasses which in turn inherit from this class. 16 | """ 17 | 18 | def __init__(self, partition_prior, prior, delta, prune, child_type, is_regression, split_precision, level): 19 | self.partition_prior = partition_prior 20 | self.prior = prior 21 | self.delta = delta 22 | self.prune = prune 23 | self.child_type = child_type 24 | self.is_regression = is_regression 25 | self.split_precision = split_precision 26 | self.level = level 27 | 28 | def fit(self, X, y, verbose=False, feature_names=None): 29 | """ 30 | Trains this classification or regression tree using the training set (X, y). 31 | 32 | Parameters 33 | ---------- 34 | X : array-like, scipy.sparse.csc_matrix, scipy.sparse.csr_matrix or pandas.DataFrame, shape = [n_samples, n_features] 35 | The training input samples. 36 | 37 | y : array-like, shape = [n_samples] or [n_samples, n_outputs] 38 | The target values. In case of binary classification only the 39 | integers 0 and 1 are permitted. In case of multiclass classification 40 | only the integers 0, 1, ..., {n_classes-1} are permitted. In case of 41 | regression all finite float values are permitted. 42 | 43 | verbose : bool, default=False 44 | Prints fitting progress. 45 | 46 | feature_names: array-like, shape = [n_features] 47 | An optional sequence of feature names. If not provided then 'x0', 'x1', ... is used 48 | if X is a matrix, or the column headers if X is a DataFrame.
49 | 50 | References 51 | ---------- 52 | 53 | .. [1] https://arxiv.org/abs/1901.03214 54 | """ 55 | 56 | # validation and input transformation 57 | if isinstance(y, list): 58 | y = np.array(y) 59 | 60 | y = y.squeeze() 61 | y = self._ensure_float64(y) 62 | self._check_target(y) 63 | 64 | X, feature_names = self._normalize_data_and_feature_names(X, feature_names) 65 | if X.shape[0] != len(y): 66 | raise ValueError('Invalid shapes: X={}, y={}'.format(X.shape, y.shape)) 67 | 68 | # fit 69 | self._fit(X, y, verbose, feature_names, 'root') 70 | 71 | if self.prune: 72 | self._prune() 73 | 74 | return self 75 | 76 | def predict(self, X): 77 | """Predict class or regression value for X. 78 | 79 | For a classification model, the predicted class for each sample in X is 80 | returned. For a regression model, the predicted value based on X is 81 | returned. 82 | 83 | Parameters 84 | ---------- 85 | X : array-like, scipy.sparse.csc_matrix, scipy.sparse.csr_matrix or pandas.DataFrame, shape = [n_samples, n_features] 86 | The input samples. 87 | 88 | Returns 89 | ------- 90 | y : array of shape = [n_samples] 91 | The predicted classes, or the predict values. 92 | """ 93 | 94 | # input transformation and checks 95 | X, _ = self._normalize_data_and_feature_names(X) 96 | self._ensure_is_fitted(X) 97 | 98 | indices = np.arange(X.shape[0]) 99 | y = np.zeros(X.shape[0]) 100 | self._predict(X, indices, True, y) 101 | 102 | return y 103 | 104 | def feature_importance(self): 105 | """ 106 | Compute and return feature importance of this tree after having fitted it to data. Feature 107 | importance for a given feature dimension is defined as the sum of all increases in the 108 | marginal data log-likelihood across splits of that dimension. Finally, the feature 109 | importance vector is normalized to sum to 1. 110 | 111 | Returns 112 | ------- 113 | feature_importance: array of floats 114 | The feature importance. 
115 | """ 116 | 117 | self._ensure_is_fitted() 118 | 119 | feature_importance = np.zeros(self.n_dim_) 120 | self._update_feature_importance(feature_importance) 121 | feature_importance /= feature_importance.sum() 122 | 123 | return feature_importance 124 | 125 | def _predict(self, X, indices, predict_class, y): 126 | if self.is_leaf(): 127 | prediction = self._get_raw_leaf_data_internal() if predict_class is None \ 128 | else self._predict_leaf() if predict_class \ 129 | else self._compute_posterior_mean() 130 | for i in indices: 131 | y[i] = prediction 132 | else: 133 | dense = isinstance(X, np.ndarray) 134 | if not dense and isinstance(X, csr_matrix): 135 | # column accesses coming up, so convert to CSC sparse matrix format 136 | X = csc_matrix(X) 137 | 138 | # query both children, let them predict their side, and then re-assemble 139 | indices1, indices2 = self._compute_child1_and_child2_indices(X, indices, dense) 140 | 141 | if len(indices1) > 0: 142 | self.child1_._predict(X, indices[indices1], predict_class, y) 143 | 144 | if len(indices2) > 0: 145 | self.child2_._predict(X, indices[indices2], predict_class, y) 146 | 147 | def _prune(self): 148 | if self.is_leaf(): 149 | return 150 | 151 | depth_start = self.get_depth() 152 | n_leaves_start = self.get_n_leaves() 153 | 154 | if self.child1_.is_leaf() and self.child2_.is_leaf(): 155 | if self.child1_._predict_leaf() == self.child2_._predict_leaf(): 156 | # same prediction (class if classification, value if regression) -> no need to split 157 | self._erase_split_info_base() 158 | self._erase_split_info() 159 | else: 160 | self.child1_._prune() 161 | self.child2_._prune() 162 | 163 | if depth_start != self.get_depth() or n_leaves_start != self.get_n_leaves(): 164 | # we did some pruning somewhere down this sub-tree -> prune again 165 | self._prune() 166 | 167 | def _get_raw_leaf_data(self, X): 168 | """Returns the raw predicted leaf data. 169 | 170 | For both classification and regression models, the following data is returned for each row of X: 171 | [[prior], [posterior]]. This method directly accesses implementation details and should therefore 172 | be used with caution. 173 | 174 | Parameters 175 | ---------- 176 | X : array-like, scipy.sparse.csc_matrix, scipy.sparse.csr_matrix or pandas.DataFrame, shape = [n_samples, n_features] 177 | The input samples. 178 | 179 | Returns 180 | ------- 181 | y : array of shape = [n_samples * 2 * n_classes] for classification problems 182 | or [n_samples * 2 * 4] for regression problems. 
183 | """ 184 | 185 | # input transformation and checks 186 | X, _ = self._normalize_data_and_feature_names(X) 187 | self._ensure_is_fitted(X) 188 | 189 | indices = np.arange(X.shape[0]) 190 | raw_leaf_data = [None] * X.shape[0] 191 | self._predict(X, indices, None, raw_leaf_data) 192 | 193 | return np.array(raw_leaf_data) 194 | 195 | @abstractmethod 196 | def _update_feature_importance(self, feature_importance): 197 | raise NotImplementedError 198 | 199 | @staticmethod 200 | def _normalize_data_and_feature_names(X, feature_names=None): 201 | if isinstance(X, pd.DataFrame): 202 | if feature_names is None: 203 | feature_names = X.columns 204 | 205 | X = X.values 206 | else: 207 | if isinstance(X, list): 208 | X = np.array(X) 209 | elif np.isscalar(X): 210 | X = np.array([X]) 211 | 212 | if X.ndim == 1: 213 | X = np.expand_dims(X, 0) 214 | 215 | if feature_names is None: 216 | feature_names = ['x{}'.format(i) for i in range(X.shape[1])] 217 | 218 | X = BaseTree._ensure_float64(X) 219 | 220 | if X.ndim != 2: 221 | raise ValueError('X should have 2 dimensions but has {}'.format(X.ndim)) 222 | 223 | return X, feature_names 224 | 225 | @staticmethod 226 | def _ensure_float64(data): 227 | if data.dtype in ( 228 | np.int8, np.int16, np.int32, np.int64, 229 | np.uint8, np.uint16, np.uint32, np.uint64, 230 | np.float32, np.float64): 231 | return data 232 | 233 | # convert to np.float64 for performance reasons (matrices with floats but of type object are very slow) 234 | X_float = data.astype(np.float64) 235 | if not np.all(data == X_float): 236 | raise ValueError('Cannot convert data matrix to np.float64 without loss of precision. Please check your data.') 237 | 238 | return X_float 239 | 240 | def _ensure_is_fitted(self, X=None): 241 | if not self.is_fitted(): 242 | raise ValueError('Cannot predict on an untrained model; call .fit() first') 243 | 244 | if X is not None and X.shape[1] != self.n_dim_: 245 | raise ValueError('Bad input dimensions: Expected {}, got {}'.format(self.n_dim_, X.shape[1])) 246 | 247 | def is_fitted(self): 248 | return hasattr(self, 'posterior_') 249 | 250 | def get_depth(self): 251 | """Computes and returns the tree depth. 252 | 253 | Returns 254 | ------- 255 | depth : int 256 | The tree depth. 257 | """ 258 | 259 | return self._update_depth(0) 260 | 261 | def get_n_leaves(self): 262 | """Computes and returns the total number of leaves of this tree. 263 | 264 | Returns 265 | ------- 266 | n_leaves : int 267 | The number of leaves. 
268 | """ 269 | 270 | return self._update_n_leaves(0) 271 | 272 | def _update_depth(self, depth): 273 | if self.is_leaf(): 274 | return max(depth, self.level) 275 | else: 276 | if self.child1_ is not None: 277 | depth = self.child1_._update_depth(depth) 278 | depth = self.child2_._update_depth(depth) 279 | 280 | return depth 281 | 282 | def _update_n_leaves(self, n_leaves): 283 | if self.is_leaf(): 284 | return n_leaves+1 285 | else: 286 | if self.child1_ is not None: 287 | n_leaves = self.child1_._update_n_leaves(n_leaves) 288 | n_leaves = self.child2_._update_n_leaves(n_leaves) 289 | 290 | return n_leaves 291 | 292 | def _erase_split_info_base(self): 293 | self.child1_ = None 294 | self.child2_ = None 295 | self.log_p_data_no_split_ = None 296 | self.best_log_p_data_split_ = None 297 | 298 | @abstractmethod 299 | def _get_prior(self, n_data, n_dim): 300 | raise NotImplementedError 301 | 302 | @abstractmethod 303 | def _erase_split_info(self): 304 | raise NotImplementedError 305 | 306 | @abstractmethod 307 | def is_leaf(self): 308 | raise NotImplementedError 309 | 310 | @abstractmethod 311 | def _check_target(self, y): 312 | raise NotImplementedError 313 | 314 | @abstractmethod 315 | def _compute_log_p_data_no_split(self, y, prior): 316 | raise NotImplementedError 317 | 318 | @abstractmethod 319 | def _compute_log_p_data_split(self, y, prior, n_dim, split_indices): 320 | raise NotImplementedError 321 | 322 | @abstractmethod 323 | def _compute_posterior(self, y, prior, delta=1): 324 | raise NotImplementedError 325 | 326 | @abstractmethod 327 | def _compute_posterior_mean(self): 328 | raise NotImplementedError 329 | 330 | @abstractmethod 331 | def _predict_leaf(self): 332 | raise NotImplementedError 333 | 334 | @abstractmethod 335 | def _get_raw_leaf_data_internal(self): 336 | raise NotImplementedError 337 | 338 | @abstractmethod 339 | def _fit(self, X, y, verbose, feature_names, side_name): 340 | raise NotImplementedError 341 | 342 | def __repr__(self): 343 | return self.__str__() 344 | 345 | @abstractmethod 346 | def _compute_child1_and_child2_indices(self, X, indices, dense): 347 | raise NotImplementedError 348 | -------------------------------------------------------------------------------- /bayesian_decision_tree/base_hyperplane.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from abc import ABC 3 | from scipy.optimize._differentialevolution import DifferentialEvolutionSolver 4 | from scipy.sparse import csc_matrix, csr_matrix 5 | 6 | from bayesian_decision_tree.base import BaseTree 7 | from bayesian_decision_tree.hyperplane_optimization import HyperplaneOptimizationFunction, ScipyOptimizer 8 | 9 | 10 | class BaseHyperplaneTree(BaseTree, ABC): 11 | """ 12 | Abstract base class of all Bayesian decision tree models using arbitrarily-oriented hyperplane splits 13 | (classification and regression). Performs medium-level fitting and prediction tasks and outsources 14 | the low-level work to subclasses. 
15 | """ 16 | 17 | def __init__(self, partition_prior, prior, delta, prune, child_type, is_regression, optimizer, split_precision, level): 18 | BaseTree.__init__(self, partition_prior, prior, delta, prune, child_type, is_regression, split_precision, level) 19 | 20 | self.optimizer = optimizer 21 | 22 | def _fit(self, X, y, verbose, feature_names, side_name): 23 | n_data = X.shape[0] 24 | n_dim = X.shape[1] 25 | prior = self._get_prior(n_data, n_dim) 26 | 27 | if verbose: 28 | name = 'level {} {}'.format(self.level, side_name) 29 | print('Training {} with {:10} data points'.format(name, n_data)) 30 | 31 | dense = isinstance(X, np.ndarray) 32 | if not dense and isinstance(X, csr_matrix): 33 | # column accesses coming up, so convert to CSC sparse matrix format 34 | X = csc_matrix(X) 35 | 36 | log_p_data_no_split = self._compute_log_p_data_no_split(y, prior) 37 | 38 | optimizer = self.optimizer 39 | if optimizer is None: 40 | # default to 'Differential Evolution' which works well and is reasonably fast 41 | optimizer = ScipyOptimizer(DifferentialEvolutionSolver, 666) 42 | 43 | # the function to optimize (depends on X and y, hence we need to instantiate it for every data set anew) 44 | optimization_function = HyperplaneOptimizationFunction( 45 | X, 46 | y, 47 | prior, 48 | self._compute_log_p_data_split, 49 | log_p_data_no_split, 50 | optimizer.search_space_is_unit_hypercube, 51 | self.split_precision) 52 | 53 | # create and run optimizer 54 | optimizer.solve(optimization_function) 55 | 56 | self.optimization_function = optimization_function 57 | 58 | # retrieve best hyperplane split from optimization function 59 | self._erase_split_info_base() 60 | self._erase_split_info() 61 | if optimization_function.best_hyperplane_normal is not None: 62 | # split data and target to recursively train children 63 | projections = X @ optimization_function.best_hyperplane_normal \ 64 | - np.dot(optimization_function.best_hyperplane_normal, optimization_function.best_hyperplane_origin) 65 | indices1 = np.where(projections < 0)[0] 66 | indices2 = np.where(projections >= 0)[0] 67 | 68 | if len(indices1) > 0 and len(indices2) > 0: 69 | """ 70 | Note: The reason why indices1 or indices2 could be empty is that the optimizer might find a 71 | 'split' that puts all data one one side and nothing on the other side, and that 'split' has 72 | a higher log probability than 'log_p_data_no_split' because of the partition prior 73 | overwhelming the data likelihoods (which are of course identical between the 'all data' and 74 | the 'everything on one side split' scenarios)s. 
75 | """ 76 | X1 = X[indices1] 77 | X2 = X[indices2] 78 | y1 = y[indices1] 79 | y2 = y[indices2] 80 | 81 | n_data1 = X1.shape[0] 82 | n_data2 = X2.shape[0] 83 | 84 | # compute posteriors of children and priors for further splitting 85 | prior_child1 = self._compute_posterior(y1, prior, delta=0) 86 | prior_child2 = self._compute_posterior(y2, prior, delta=0) 87 | 88 | # store split info, create children and continue training them if there's data left to split 89 | self.best_hyperplane_normal_ = optimization_function.best_hyperplane_normal 90 | self.best_hyperplane_origin_ = optimization_function.best_hyperplane_origin 91 | 92 | self.log_p_data_no_split_ = optimization_function.log_p_data_no_split 93 | self.best_log_p_data_split_ = optimization_function.best_log_p_data_split 94 | 95 | self.child1_ = self.child_type(self.partition_prior, prior_child1, self.delta, 96 | self.prune, optimizer, self.split_precision, self.level+1) 97 | self.child2_ = self.child_type(self.partition_prior, prior_child2, self.delta, 98 | self.prune, optimizer, self.split_precision, self.level+1) 99 | self.child1_._erase_split_info_base() 100 | self.child2_._erase_split_info_base() 101 | self.child1_._erase_split_info() 102 | self.child2_._erase_split_info() 103 | 104 | # fit children if there is more than one data point (i.e., there is 105 | # something to split) and if the targets differ (no point otherwise) 106 | if n_data1 > 1 and len(np.unique(y1)) > 1: 107 | self.child1_._fit(X1, y1, verbose, feature_names, 'back ') 108 | else: 109 | self.child1_.posterior_ = self._compute_posterior(y1, prior) 110 | self.child1_.n_data_ = n_data1 111 | 112 | if n_data2 > 1 and len(np.unique(y2)) > 1: 113 | self.child2_._fit(X2, y2, verbose, feature_names, 'front') 114 | else: 115 | self.child2_.posterior_ = self._compute_posterior(y2, prior) 116 | self.child2_.n_data_ = n_data2 117 | 118 | # compute posterior 119 | self.n_dim_ = X.shape[1] 120 | self.n_data_ = n_data 121 | self.posterior_ = self._compute_posterior(y, prior) 122 | 123 | def _compute_child1_and_child2_indices(self, X, indices, dense): 124 | projections = X[indices] @ self.best_hyperplane_normal_ - np.dot(self.best_hyperplane_normal_, self.best_hyperplane_origin_) 125 | indices1 = np.where(projections < 0)[0] 126 | indices2 = np.where(projections >= 0)[0] 127 | 128 | return indices1, indices2 129 | 130 | def is_leaf(self): 131 | self._ensure_is_fitted() 132 | return self.best_hyperplane_normal_ is None 133 | 134 | def feature_importance(self): 135 | self._ensure_is_fitted() 136 | 137 | feature_importance = np.zeros(self.n_dim_) 138 | self._update_feature_importance(feature_importance) 139 | feature_importance /= feature_importance.sum() 140 | 141 | return feature_importance 142 | 143 | def _update_feature_importance(self, feature_importance): 144 | if self.is_leaf(): 145 | return 146 | else: 147 | log_p_gain = self.best_log_p_data_split_ - self.log_p_data_no_split_ 148 | hyperplane_normal = self.best_hyperplane_normal_ 149 | 150 | # the more the normal vector is oriented along a given dimension's axis the more 151 | # important that dimension is, so weight log_p_gain with hyperplane_normal[i_dim] 152 | # (its absolute value in fact because the sign of the direction is irrelevant) 153 | feature_importance += log_p_gain * np.abs(hyperplane_normal) 154 | if self.child1_ is not None: 155 | self.child1_._update_feature_importance(feature_importance) 156 | self.child2_._update_feature_importance(feature_importance) 157 | 158 | def _erase_split_info(self): 159 | 
self.best_hyperplane_normal_ = None 160 | self.best_hyperplane_origin_ = None 161 | 162 | def __str__(self): 163 | if not self.is_fitted(): 164 | return 'Unfitted model' 165 | 166 | return self._str([], '\u251C', '\u2514', '\u2502', '\u2265', None) 167 | 168 | def _str(self, anchor, VERT_RIGHT, DOWN_RIGHT, BAR, GEQ, is_back_child): 169 | anchor_str = ''.join(' ' + a for a in anchor) 170 | s = '' 171 | if is_back_child is not None: 172 | s += anchor_str + ' {:5s}: '.format('back' if is_back_child else 'front') 173 | 174 | if self.is_leaf(): 175 | s += 'y={}, n={}'.format(self._predict_leaf(), self.n_data_) 176 | if not self.is_regression: 177 | s += ', p(y)={}'.format(self._compute_posterior_mean()) 178 | else: 179 | s += 'HP(origin={}, normal={})'.format(self.best_hyperplane_origin_, self.best_hyperplane_normal_) 180 | 181 | # 'back' child (the child that is on the side of the hyperplane opposite to the normal vector, or projection < 0) 182 | s += '\n' 183 | anchor_child1 = [VERT_RIGHT] if len(anchor) == 0 else (anchor[:-1] + [(BAR if is_back_child else ' '), VERT_RIGHT]) 184 | s += self.child1_._str(anchor_child1, VERT_RIGHT, DOWN_RIGHT, BAR, GEQ, True) 185 | 186 | # 'front' child (the child that is on same side of the hyperplane as the normal vector, or projection >= 0) 187 | s += '\n' 188 | anchor_child2 = [DOWN_RIGHT] if len(anchor) == 0 else (anchor[:-1] + [(BAR if is_back_child else ' '), DOWN_RIGHT]) 189 | s += self.child2_._str(anchor_child2, VERT_RIGHT, DOWN_RIGHT, BAR, GEQ, False) 190 | return s 191 | -------------------------------------------------------------------------------- /bayesian_decision_tree/base_perpendicular.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from abc import ABC 3 | from scipy.sparse import csr_matrix, csc_matrix 4 | 5 | from bayesian_decision_tree.base import BaseTree 6 | 7 | 8 | class BasePerpendicularTree(BaseTree, ABC): 9 | """ 10 | Abstract base class of all Bayesian tree models using splits perpendicular to a single feature axis 11 | (classification and regression). Performs medium-level fitting and prediction tasks and outsources 12 | the low-level work to subclasses. 13 | """ 14 | 15 | def __init__(self, partition_prior, prior, delta, prune, child_type, is_regression, split_precision, level): 16 | BaseTree.__init__(self, partition_prior, prior, delta, prune, child_type, is_regression, split_precision, level) 17 | 18 | def prediction_paths(self, X): 19 | """Returns the prediction paths for X. 20 | 21 | Parameters 22 | ---------- 23 | X : array-like, scipy.sparse.csc_matrix, scipy.sparse.csr_matrix or pandas.DataFrame, 24 | shape = [n_samples, n_features] 25 | 26 | The input samples. 
27 | 28 | Returns 29 | ------- 30 | prediction_paths : array-like, shape = [n_samples, 4] 31 | 32 | The prediction paths, each row containing the following fields: 33 | split dimension, split feature name, split value, True if greater than the split value and False otherwise 34 | """ 35 | 36 | # input transformation and checks 37 | X, _ = self._normalize_data_and_feature_names(X) 38 | self._ensure_is_fitted(X) 39 | 40 | paths = [[] for i in range(X.shape[0])] 41 | self._update_prediction_paths(X, np.arange(X.shape[0]), paths) 42 | 43 | return paths 44 | 45 | def _update_prediction_paths(self, X, indices, paths): 46 | if not self.is_leaf(): 47 | dense = isinstance(X, np.ndarray) 48 | if not dense and isinstance(X, csr_matrix): 49 | # column accesses coming up, so convert to CSC sparse matrix format 50 | X = csc_matrix(X) 51 | 52 | indices1, indices2 = self._compute_child1_and_child2_indices(X, indices, dense) 53 | 54 | if len(indices1) > 0: 55 | step = (self.split_dimension_, self.split_feature_name_, self.split_value_, False) 56 | for i in indices1: 57 | paths[i].append(step) 58 | 59 | if len(indices2) > 0: 60 | step = (self.split_dimension_, self.split_feature_name_, self.split_value_, True) 61 | for i in indices2: 62 | paths[i].append(step) 63 | 64 | if len(indices1) > 0 and not self.child1_.is_leaf(): 65 | paths1 = [paths[i] for i in indices1] 66 | self.child1_._update_prediction_paths(X, indices1, paths1) 67 | 68 | if len(indices2) > 0 and not self.child2_.is_leaf(): 69 | paths2 = [paths[i] for i in indices2] 70 | self.child2_._update_prediction_paths(X, indices2, paths2) 71 | 72 | @staticmethod 73 | def _create_merged_paths_array(n_rows): 74 | return np.zeros((n_rows, 4)) 75 | 76 | def _fit(self, X, y, verbose, feature_names, side_name, sort_indices_by_dim=None): 77 | n_data = sort_indices_by_dim.shape[1] if sort_indices_by_dim is not None else X.shape[0] 78 | 79 | if verbose: 80 | name = 'level {} {}'.format(self.level, side_name) 81 | print('Training {} with {:10} data points'.format(name, n_data)) 82 | 83 | dense = isinstance(X, np.ndarray) 84 | if not dense and isinstance(X, csr_matrix): 85 | # column accesses coming up, so convert to CSC sparse matrix format 86 | X = csc_matrix(X) 87 | 88 | n_dim = X.shape[1] 89 | 90 | # compute sort indices (only done once at the start) 91 | if sort_indices_by_dim is None: 92 | dtype = np.uint16 if n_data < (1 << 16) else np.uint32 if n_data < (1 << 32) else np.uint64 93 | sort_indices_by_dim = np.zeros(X.shape[::-1], dtype=dtype) 94 | for dim in range(n_dim): 95 | X_dim = X[:, dim] 96 | if not dense: 97 | X_dim = self._to_array(X_dim) 98 | 99 | sort_indices_by_dim[dim] = np.argsort(X_dim) 100 | 101 | # compute data likelihood of not splitting and remember it as the best option so far 102 | prior = self._get_prior(n_data, n_dim) 103 | y_any = y[sort_indices_by_dim[0]] # any dim works as the order doesn't matter 104 | log_p_data_no_split = self._compute_log_p_data_no_split(y_any, prior) 105 | best_log_p_data_split = log_p_data_no_split 106 | 107 | # compute data likelihoods of all possible splits along all data dimensions 108 | best_split_index = -1 109 | best_split_dimension = -1 110 | for dim in range(n_dim): 111 | sort_indices = sort_indices_by_dim[dim] 112 | X_dim_sorted = X[sort_indices, dim] 113 | if not dense: 114 | X_dim_sorted = self._to_array(X_dim_sorted) 115 | 116 | split_indices = 1 + np.where(np.abs(np.diff(X_dim_sorted)) > self.split_precision)[0] # we can only split between *different* data points 117 | if len(split_indices) == 0: 
118 | # no split possible along this dimension 119 | continue 120 | 121 | y_sorted = y[sort_indices] 122 | 123 | # compute data likelihoods of all possible splits along this dimension and find split with highest data likelihood 124 | log_p_data_split = self._compute_log_p_data_split(y_sorted, prior, n_dim, split_indices) 125 | i_max = log_p_data_split.argmax() 126 | if log_p_data_split[i_max] > best_log_p_data_split: 127 | # remember new best split 128 | best_log_p_data_split = log_p_data_split[i_max] 129 | best_split_index = split_indices[i_max] # data index of best split 130 | best_split_dimension = dim 131 | 132 | # did we find a split that has a higher likelihood than the no-split likelihood? 133 | if best_split_index > 0: 134 | # split data and target to recursively train children 135 | indices1 = sort_indices_by_dim[best_split_dimension, :best_split_index] 136 | indices2 = sort_indices_by_dim[best_split_dimension, best_split_index:] 137 | 138 | # compute posteriors of children and priors for further splitting 139 | prior = self._get_prior(n_data, n_dim) 140 | prior_child1 = tuple(self._compute_posterior(y[indices1], prior, self.delta)) if self.delta != 0 else prior 141 | prior_child2 = tuple(self._compute_posterior(y[indices2], prior, self.delta)) if self.delta != 0 else prior 142 | 143 | # store split info, create children and continue training them if there's data left to split 144 | self.split_dimension_ = best_split_dimension 145 | self.split_feature_name_ = feature_names[best_split_dimension] 146 | self.split_value_ = 0.5 * ( 147 | X[indices1[-1], best_split_dimension] 148 | + X[indices2[0], best_split_dimension] 149 | ) 150 | self.log_p_data_no_split_ = log_p_data_no_split 151 | self.best_log_p_data_split_ = best_log_p_data_split 152 | 153 | self.child1_ = self.child_type(self.partition_prior, prior_child1, self.delta, 154 | self.prune, self.split_precision, self.level+1) 155 | self.child2_ = self.child_type(self.partition_prior, prior_child2, self.delta, 156 | self.prune, self.split_precision, self.level+1) 157 | self.child1_._erase_split_info_base() 158 | self.child2_._erase_split_info_base() 159 | self.child1_._erase_split_info() 160 | self.child2_._erase_split_info() 161 | 162 | # fit children if there is more than one data point (i.e., there is 163 | # something to split) and if the targets differ (no point otherwise) 164 | sort_indices_by_dim_1 = sort_indices_by_dim[np.isin(sort_indices_by_dim, indices1)].reshape(n_dim, -1) 165 | sort_indices_by_dim_2 = sort_indices_by_dim[np.isin(sort_indices_by_dim, indices2)].reshape(n_dim, -1) 166 | del sort_indices_by_dim # help GC maybe? 
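            # Note (illustrative, hypothetical values): the boolean-mask subsetting above keeps,
            # per dimension, only the indices that belong to each child while preserving their
            # sorted order. For instance, with sort_indices_by_dim = [[2, 0, 1, 3], [3, 1, 0, 2]]
            # and indices1 = [2, 0], the mask yields sort_indices_by_dim_1 = [[2, 0], [0, 2]]:
            # row 0 is still sorted by feature 0 and row 1 by feature 1, restricted to child 1.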
167 | n_data1 = sort_indices_by_dim_1.shape[1] 168 | n_data2 = sort_indices_by_dim_2.shape[1] 169 | y1 = y[indices1] 170 | if n_data1 > 1 and len(np.unique(y1)) > 1: 171 | self.child1_._fit(X, y, verbose, feature_names, 'LHS', sort_indices_by_dim_1) 172 | else: 173 | self.child1_.posterior_ = self._compute_posterior(y1, prior) 174 | self.child1_.n_data_ = n_data1 175 | 176 | y2 = y[indices2] 177 | if n_data2 > 1 and len(np.unique(y2)) > 1: 178 | self.child2_._fit(X, y, verbose, feature_names, 'RHS', sort_indices_by_dim_2) 179 | else: 180 | self.child2_.posterior_ = self._compute_posterior(y2, prior) 181 | self.child2_.n_data_ = n_data2 182 | else: 183 | self._erase_split_info_base() 184 | self._erase_split_info() 185 | 186 | # compute posterior 187 | self.n_dim_ = n_dim 188 | self.n_data_ = n_data 189 | self.posterior_ = self._compute_posterior(y_any, prior) # any dim works as the order doesn't matter 190 | 191 | def _compute_child1_and_child2_indices(self, X, indices, dense): 192 | X_split = X[indices, self.split_dimension_] 193 | if not dense: 194 | X_split = self._to_array(X_split) 195 | 196 | indices1 = np.where(X_split < self.split_value_)[0] 197 | indices2 = np.where(X_split >= self.split_value_)[0] 198 | 199 | return indices1, indices2 200 | 201 | def is_leaf(self): 202 | self._ensure_is_fitted() 203 | return self.split_value_ is None 204 | 205 | def _update_feature_importance(self, feature_importance): 206 | if self.is_leaf(): 207 | return 208 | else: 209 | log_p_gain = self.best_log_p_data_split_ - self.log_p_data_no_split_ 210 | feature_importance[self.split_dimension_] += log_p_gain 211 | if self.child1_ is not None: 212 | self.child1_._update_feature_importance(feature_importance) 213 | self.child2_._update_feature_importance(feature_importance) 214 | 215 | def _erase_split_info(self): 216 | self.split_dimension_ = -1 217 | self.split_value_ = None 218 | self.split_feature_name_ = None 219 | 220 | @staticmethod 221 | def _to_array(sparse_array): 222 | array = sparse_array.toarray() 223 | return array[0] if array.shape == (1, 1) else array.squeeze() 224 | 225 | def __str__(self): 226 | if not self.is_fitted(): 227 | return 'Unfitted model' 228 | 229 | return self._str([], self.split_value_, '\u251C', '\u2514', '\u2502', '\u2265', None) 230 | 231 | def _str(self, anchor, parent_split_value, VERT_RIGHT, DOWN_RIGHT, BAR, GEQ, is_left_child): 232 | anchor_str = ''.join(' ' + a for a in anchor) 233 | s = '' 234 | if is_left_child is not None: 235 | s += anchor_str + ' {}{}: '.format('<' if is_left_child else GEQ, parent_split_value) 236 | 237 | if self.is_leaf(): 238 | s += 'y={}, n={}'.format(self._predict_leaf(), self.n_data_) 239 | if not self.is_regression: 240 | s += ', p(y)={}'.format(self._compute_posterior_mean()) 241 | else: 242 | s += '{}={}'.format(self.split_feature_name_, self.split_value_) 243 | 244 | s += '\n' 245 | anchor_child1 = [VERT_RIGHT] if len(anchor) == 0 else (anchor[:-1] + [(BAR if is_left_child else ' '), VERT_RIGHT]) 246 | s += self.child1_._str(anchor_child1, self.split_value_, VERT_RIGHT, DOWN_RIGHT, BAR, GEQ, True) 247 | 248 | s += '\n' 249 | anchor_child2 = [DOWN_RIGHT] if len(anchor) == 0 else (anchor[:-1] + [(BAR if is_left_child else ' '), DOWN_RIGHT]) 250 | s += self.child2_._str(anchor_child2, self.split_value_, VERT_RIGHT, DOWN_RIGHT, BAR, GEQ, False) 251 | return s 252 | -------------------------------------------------------------------------------- /bayesian_decision_tree/classification.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | This module declares the Bayesian classification tree models: 3 | * PerpendicularClassificationTree 4 | * HyperplaneClassificationTree 5 | """ 6 | import numpy as np 7 | from abc import ABC 8 | from sklearn.base import ClassifierMixin 9 | 10 | from bayesian_decision_tree.base import BaseTree 11 | from bayesian_decision_tree.base_hyperplane import BaseHyperplaneTree 12 | from bayesian_decision_tree.base_perpendicular import BasePerpendicularTree 13 | from bayesian_decision_tree.utils import multivariate_betaln 14 | 15 | 16 | class BaseClassificationTree(BaseTree, ABC, ClassifierMixin): 17 | """ 18 | Abstract base class of all Bayesian classification trees (perpendicular and hyperplane). Performs 19 | medium-level fitting and prediction tasks and outsources the low-level work to subclasses. 20 | """ 21 | 22 | def __init__(self, partition_prior, prior, delta, prune, child_type, split_precision, level=0): 23 | BaseTree.__init__(self, partition_prior, prior, delta, prune, child_type, False, split_precision, level) 24 | 25 | def predict_proba(self, X): 26 | """Predict class probabilities of the input samples X. 27 | 28 | Parameters 29 | ---------- 30 | X : array-like, scipy.sparse.csc_matrix, scipy.sparse.csr_matrix or pandas.DataFrame, shape = [n_samples, n_features] 31 | The input samples. 32 | 33 | Returns 34 | ------- 35 | p : array of shape = [n_samples, n_classes] 36 | The class probabilities of the input samples. 37 | """ 38 | 39 | # input transformation and checks 40 | X, _ = self._normalize_data_and_feature_names(X) 41 | self._ensure_is_fitted(X) 42 | 43 | y_proba = [None] * X.shape[0] 44 | self._predict(X, np.arange(X.shape[0]), False, y_proba) 45 | 46 | return np.array(y_proba) 47 | 48 | def _check_target(self, y): 49 | if y.ndim != 1: 50 | raise ValueError('y should have 1 dimension but has {}'.format(y.ndim)) 51 | 52 | n_classes = len(self.prior) 53 | if not np.all(np.unique(y) == np.arange(0, n_classes)): 54 | raise ValueError('Expected target values 0..{} but found {}..{}'.format(n_classes - 1, y.min(), y.max())) 55 | 56 | def _get_prior(self, n_data, n_dim): 57 | if self.prior is not None: 58 | return self.prior 59 | else: 60 | prior_pseudo_observation_count = max(1, n_data//100) 61 | return prior_pseudo_observation_count * np.ones(n_dim) 62 | 63 | def _compute_log_p_data_no_split(self, y, prior): 64 | posterior = self._compute_posterior(y, prior) 65 | 66 | log_p_prior = np.log(1-self.partition_prior**(1+self.level)) 67 | log_p_data = multivariate_betaln(posterior) - multivariate_betaln(prior) 68 | 69 | return log_p_prior + log_p_data 70 | 71 | def _compute_log_p_data_split(self, y, prior, n_dim, split_indices): 72 | n_classes = len(prior) 73 | k1 = np.empty(n_classes, dtype=object) 74 | k2 = np.empty(n_classes, dtype=object) 75 | for i in range(n_classes): 76 | k1_and_total = (y == i).cumsum() 77 | total = k1_and_total[-1] 78 | k1[i] = k1_and_total[split_indices-1] 79 | k2[i] = total - k1[i] 80 | 81 | n_splits = len(split_indices) 82 | log_p_prior = np.log(self.partition_prior**(1+self.level) / (n_splits * n_dim)) 83 | 84 | betaln_prior = multivariate_betaln(prior) 85 | log_p_data1 = self._compute_log_p_data(k1, prior, betaln_prior) 86 | log_p_data2 = self._compute_log_p_data(k2, prior, betaln_prior) 87 | 88 | return log_p_prior + log_p_data1 + log_p_data2 89 | 90 | def _compute_posterior(self, y, prior, delta=1): 91 | if delta == 0: 92 | return prior 93 | 94 | # see 
https://en.wikipedia.org/wiki/Conjugate_prior#Discrete_distributions 95 | y_reshaped = np.broadcast_to(y, (len(prior), len(y))) 96 | classes = np.arange(len(prior)).reshape(-1, 1) 97 | k = np.sum(y_reshaped == classes, axis=1) 98 | posterior = prior + delta*k 99 | 100 | return posterior 101 | 102 | def _compute_posterior_mean(self): 103 | return self.posterior_ / np.sum(self.posterior_) 104 | 105 | def _compute_log_p_data(self, k, prior, betaln_prior): 106 | # see https://www.cs.ubc.ca/~murphyk/Teaching/CS340-Fall06/reading/bernoulli.pdf, equation (42) 107 | # which can be expressed as a fraction of beta functions 108 | return multivariate_betaln(prior+k) - betaln_prior 109 | 110 | def _predict_leaf(self): 111 | # predict class 112 | return np.argmax(self.posterior_) 113 | 114 | def _get_raw_leaf_data_internal(self): 115 | # prior and posterior raw data 116 | return np.array([self.prior, self.posterior_]) 117 | 118 | 119 | class PerpendicularClassificationTree(BasePerpendicularTree, BaseClassificationTree): 120 | """ 121 | Bayesian binary or multiclass classification tree. Uses a Dirichlet prior (a 122 | multivariate generalization of the Beta prior for more than 2 variables). 123 | 124 | Parameters 125 | ---------- 126 | partition_prior : float, must be > 0.0 and < 1.0, typical value: 0.9 127 | The prior probability of splitting a node's data into two children. 128 | 129 | Small values tend to reduce the tree depth, leading to less expressiveness 130 | but also to less overfitting. 131 | 132 | Large values tend to increase the tree depth and thus lead to the tree 133 | better fitting the data, which can lead to overfitting. 134 | 135 | prior : array_like, shape = [number of classes] 136 | The hyperparameters [alpha_0, alpha_1, ..., alpha_{N-1}] of the Dirichlet 137 | conjugate prior, see [1] and [2]. All alpha_i must be positive, where 138 | alpha_i represents the number of prior pseudo-observations of class i. 139 | 140 | Small values for alpha_i represent a weak prior which leads to the 141 | training data dominating the posterior. This can lead to overfitting. 142 | 143 | Large values for alpha_i represent a strong prior and thus put less weight 144 | on the data. This can be used for regularization. 145 | 146 | delta : float, default=0.0 147 | Determines the strengthening of the prior as the tree grows deeper, 148 | see [1]. Must be a value between 0.0 and 1.0. 149 | 150 | prune : boolean, default=False 151 | Prunes the tree after fitting if `True` by removing all splits that don't add information, 152 | i.e., where the predictions of both children are identical. It's usually sensible to set 153 | this to `True` in the classification case if you're only interested in class predictions 154 | (`predict(X)`), but it makes sense to set it to `False` if you're looking for class 155 | probabilities (`predict_proba(X)`). It can safely be set to 'True' in the regression case 156 | because it will only merge children if their predictions are identical. 157 | 158 | split_precision : float, default=0.0 159 | Determines the minimum distance between two contiguous points to consider a split. If the distance is below 160 | this threshold, the points are considered to overlap along this direction. 161 | 162 | level : DO NOT SET, ONLY USED BY SUBCLASSES 163 | 164 | See also 165 | -------- 166 | demo_classification_perpendicular.py 167 | PerpendicularRegressionTree 168 | HyperplaneClassificationTree 169 | 170 | References 171 | ---------- 172 | 173 | .. 
[1] https://en.wikipedia.org/wiki/Dirichlet_distribution#Conjugate_to_categorical/multinomial 174 | 175 | .. [2] https://en.wikipedia.org/wiki/Conjugate_prior#Discrete_distributions 176 | 177 | Examples 178 | -------- 179 | See `demo_classification_perpendicular.py`. 180 | """ 181 | 182 | def __init__(self, partition_prior=0.99, prior=None, delta=0, prune=False, split_precision=0.0, level=0): 183 | child_type = PerpendicularClassificationTree 184 | BasePerpendicularTree.__init__(self, partition_prior, prior, delta, prune, child_type, False, split_precision, level) 185 | BaseClassificationTree.__init__(self, partition_prior, prior, delta, prune, child_type, split_precision, level) 186 | 187 | 188 | class HyperplaneClassificationTree(BaseHyperplaneTree, BaseClassificationTree): 189 | """ 190 | Bayesian binary or multiclass classification tree using arbitrarily-oriented 191 | hyperplane splits. Uses a Dirichlet prior (a multivariate generalization 192 | of the Beta prior for more than 2 variables). 193 | 194 | Parameters 195 | ---------- 196 | partition_prior : float, must be > 0.0 and < 1.0, typical value: 0.9 197 | The prior probability of splitting a node's data into two children. 198 | 199 | Small values tend to reduce the tree depth, leading to less expressiveness 200 | but also to less overfitting. 201 | 202 | Large values tend to increase the tree depth and thus lead to the tree 203 | better fitting the data, which can lead to overfitting. 204 | 205 | prior : array_like, shape = [number of classes] 206 | The hyperparameters [alpha_0, alpha_1, ..., alpha_{N-1}] of the Dirichlet 207 | conjugate prior, see [1] and [2]. All alpha_i must be positive, where 208 | alpha_i represents the number of prior pseudo-observations of class i. 209 | 210 | Small values for alpha_i represent a weak prior which leads to the 211 | training data dominating the posterior. This can lead to overfitting. 212 | 213 | Large values for alpha_i represent a strong prior and thus put less weight 214 | on the data. This can be used for regularization. 215 | 216 | delta : float, default=0.0 217 | Determines the strengthening of the prior as the tree grows deeper, 218 | see [1]. Must be a value between 0.0 and 1.0. 219 | 220 | prune : boolean, default=False 221 | Prunes the tree after fitting if `True` by removing all splits that don't add information, 222 | i.e., where the predictions of both children are identical. It's usually sensible to set 223 | this to `True` in the classification case if you're only interested in class predictions 224 | (`predict(X)`), but it makes sense to set it to `False` if you're looking for class 225 | probabilities (`predict_proba(X)`). It can safely be set to 'True' in the regression case 226 | because it will only merge children if their predictions are identical. 227 | 228 | optimizer : object 229 | A global optimization algorithm object that performs optimal hyperparameter 230 | orientation search. The available options are (in the order in which you should 231 | try them): 232 | - ScipyOptimizer: A wrapper around scipy global optimizers. See usages for examples. 
233 |           - SimulatedAnnealingOptimizer: Experimental, but works well with n_scan=20, n_keep=10, spread_factor=0.95
234 |           - RandomHyperplaneOptimizer: Experimental, mediocre performance
235 |           - RandomTwoPointOptimizer: Experimental, mediocre performance
236 |           - GradientDescentOptimizer: Experimental, mediocre performance
237 | 
238 |     split_precision : float, default=0.0
239 |         Determines the minimum distance between two contiguous points to consider a split. If the distance is below
240 |         this threshold, the points are considered to overlap along this direction.
241 | 
242 |     level : DO NOT SET, ONLY USED BY SUBCLASSES
243 | 
244 |     See also
245 |     --------
246 |     demo_classification_hyperplane.py
247 |     HyperplaneRegressionTree
248 |     PerpendicularClassificationTree
249 | 
250 |     References
251 |     ----------
252 | 
253 |     .. [1] https://en.wikipedia.org/wiki/Dirichlet_distribution#Conjugate_to_categorical/multinomial
254 | 
255 |     .. [2] https://en.wikipedia.org/wiki/Conjugate_prior#Discrete_distributions
256 | 
257 |     Examples
258 |     --------
259 |     See `demo_classification_hyperplane.py`.
260 |     """
261 | 
262 |     def __init__(self, partition_prior=0.99, prior=None, delta=0, prune=False, optimizer=None, split_precision=0.0, level=0):
263 |         child_type = HyperplaneClassificationTree
264 |         BaseHyperplaneTree.__init__(self, partition_prior, prior, delta, prune, child_type, False, optimizer, split_precision, level)
265 |         BaseClassificationTree.__init__(self, partition_prior, prior, delta, prune, child_type, split_precision, level)
266 | 
-------------------------------------------------------------------------------- /bayesian_decision_tree/hyperplane_optimization.py: -------------------------------------------------------------------------------- 1 | import numpy as np
2 | from abc import ABC, abstractmethod
3 | from numpy.random import RandomState
4 | from scipy.sparse import csr_matrix, csc_matrix
5 | 
6 | from bayesian_decision_tree.utils import r2_series_generator, hypercube_to_hypersphere_surface
7 | 
8 | 
9 | class HyperplaneOptimizationFunction:
10 |     """
11 |     The function to optimize for hyperplane trees. This is a function of `n_dim` variables representing
12 |     the normal vector of a hyperplane in `n_dim` dimensions. Given such a hyperplane normal the function
13 |     computes the optimum split location (i.e., the origin of the hyperplane) in the data such that the
14 |     data likelihood is maximized.
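    A rough usage sketch (illustrative only; the tree models construct and drive this
    object internally, and `candidate_normals` below stands for a hypothetical set of
    normals proposed by an optimizer):

        for normal in candidate_normals:
            neg_log_p = optimization_function.compute(normal)  # lower is better

        # the best split found so far is accumulated on the object itself
        best_normal = optimization_function.best_hyperplane_normal
        best_origin = optimization_function.best_hyperplane_origin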
15 |     """
16 | 
17 |     def __init__(self, X, y, prior, compute_log_p_data_split, log_p_data_no_split, search_space_is_unit_hypercube, split_precision):
18 |         self.X = X
19 |         self.y = y
20 |         self.prior = prior
21 |         self.compute_log_p_data_split = compute_log_p_data_split
22 |         self.log_p_data_no_split = log_p_data_no_split
23 |         self.search_space_is_unit_hypercube = search_space_is_unit_hypercube
24 |         self.split_precision = split_precision
25 | 
26 |         # results of the optimization - to be set later during the actual optimization
27 |         self.function_evaluations = 0
28 |         self.best_log_p_data_split = log_p_data_no_split
29 |         self.best_cumulative_distances = 0
30 |         self.best_hyperplane_normal = None
31 |         self.best_hyperplane_origin = None
32 | 
33 |     def compute(self, hyperplane_normal):
34 |         self.function_evaluations += 1
35 | 
36 |         if self.search_space_is_unit_hypercube:
37 |             hyperplane_normal = hypercube_to_hypersphere_surface(hyperplane_normal, half_hypersphere=True)
38 | 
39 |         # catch some special cases and normalize to unit length
40 |         hyperplane_normal = np.nan_to_num(hyperplane_normal)
41 |         if np.all(hyperplane_normal == 0):
42 |             hyperplane_normal[0] = 1
43 | 
44 |         hyperplane_normal /= np.linalg.norm(hyperplane_normal)
45 | 
46 |         dense = isinstance(self.X, np.ndarray)
47 |         if not dense and isinstance(self.X, csr_matrix):
48 |             self.X = csc_matrix(self.X)
49 | 
50 |         # compute distance of all points to the hyperplane: https://mathinsight.org/distance_point_plane
51 |         projections = self.X @ hyperplane_normal  # up to an additive constant which doesn't matter to distance ordering
52 |         sort_indices = np.argsort(projections)
53 |         split_indices = 1 + np.where(np.abs(np.diff(projections[sort_indices])) > self.split_precision)[0]  # we can only split between *different* data points (in projection-sorted order)
54 |         if len(split_indices) == 0:
55 |             # no split possible along this dimension
56 |             return -self.log_p_data_no_split
57 | 
58 |         y_sorted = self.y[sort_indices]
59 | 
60 |         # compute data likelihoods of all possible splits along this projection and find split with highest data likelihood
61 |         n_dim = self.X.shape[1]
62 |         log_p_data_split = self.compute_log_p_data_split(y_sorted, self.prior, n_dim, split_indices)
63 |         i_max = log_p_data_split.argmax()
64 |         if log_p_data_split[i_max] >= self.best_log_p_data_split:
65 |             best_split_index = split_indices[i_max]
66 |             p1 = self.X[sort_indices[best_split_index-1]]
67 |             p2 = self.X[sort_indices[best_split_index]]
68 |             if not dense:
69 |                 p1 = p1.toarray()[0]
70 |                 p2 = p2.toarray()[0]
71 | 
72 |             hyperplane_origin = 0.5 * (p1 + p2)  # middle between the points that are being split
73 |             projections_with_origin = projections - np.dot(hyperplane_normal, hyperplane_origin)
74 |             cumulative_distances = np.sum(np.abs(projections_with_origin))
75 | 
76 |             if log_p_data_split[i_max] > self.best_log_p_data_split:
77 |                 is_log_p_better_or_same_but_with_better_distance = True
78 |             else:
79 |                 # accept new split with same log(p) only if it increases the cumulative distance of all points to the hyperplane
80 |                 is_log_p_better_or_same_but_with_better_distance = cumulative_distances > self.best_cumulative_distances
81 | 
82 |             if is_log_p_better_or_same_but_with_better_distance:
83 |                 self.best_log_p_data_split = log_p_data_split[i_max]
84 |                 self.best_cumulative_distances = cumulative_distances
85 |                 self.best_hyperplane_normal = hyperplane_normal
86 |                 self.best_hyperplane_origin = hyperplane_origin
87 | 
88 |         return -log_p_data_split[i_max]
89 | 
90 | 
91 | class StrMixin:
92 |     """Auto-generate `__str__()` and `__repr__()` from attributes."""
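    # Illustrative example (hypothetical class): an instance of a subclass `Foo` with
    # attributes n_mc=100 and seed=42 prints as 'Foo[n_mc=100, seed=42]'.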
93 | def __str__(self): 94 | attributes = ['{}={}'.format(k, v) for k, v in self.__dict__.items()] 95 | return '{}[{}]'.format(type(self).__name__, ', '.join(attributes)) 96 | 97 | def __repr__(self): 98 | return self.__str__() 99 | 100 | 101 | class HyperplaneOptimizer(ABC, StrMixin): 102 | """ 103 | Abstract base class of all hyperplane optimizers. 104 | """ 105 | 106 | def __init__(self, search_space_is_unit_hypercube): 107 | self.search_space_is_unit_hypercube = search_space_is_unit_hypercube 108 | 109 | @abstractmethod 110 | def solve(self, optimization_function): 111 | raise NotImplementedError 112 | 113 | 114 | class ScipyOptimizer(HyperplaneOptimizer): 115 | """An optimizer using one of the scipy global optimizers, see [1]. 116 | 117 | References 118 | ---------- 119 | .. [1] https://docs.scipy.org/doc/scipy/reference/optimize.html#global-optimization 120 | """ 121 | 122 | def __init__(self, solver_type, seed, **extra_solver_kwargs): 123 | super().__init__(search_space_is_unit_hypercube=True) 124 | 125 | self.solver_type = solver_type 126 | self.seed = seed 127 | self.extra_solver_kwargs = extra_solver_kwargs 128 | 129 | def solve(self, optimization_function): 130 | # bounds for scipy optimizers: unit hypercube (will be mapped to 131 | # (half) hypersphere uniformly later on) 132 | X = optimization_function.X 133 | n_dim = X.shape[1] 134 | unit_hypercube_bounds = np.vstack((np.zeros(n_dim-1), np.ones(n_dim-1))).T 135 | 136 | solver = self.solver_type( 137 | func=optimization_function.compute, 138 | bounds=unit_hypercube_bounds, 139 | seed=self.seed, 140 | **self.extra_solver_kwargs) 141 | 142 | solver.solve() 143 | 144 | 145 | class RandomTwoPointOptimizer(HyperplaneOptimizer): 146 | """ 147 | An optimizer randomly choosing two points of different classes to construct 148 | a bisecting hyperplane (experimental). 
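    A minimal usage sketch (parameter values are arbitrary examples; note that this
    optimizer only applies to classification because it needs class labels to pick
    the point pairs):

        optimizer = RandomTwoPointOptimizer(n_mc=1000, seed=42)
        model = HyperplaneClassificationTree(optimizer=optimizer)
        model.fit(X, y)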
149 | TODO: Complete 150 | """ 151 | 152 | def __init__(self, n_mc, seed): 153 | super().__init__(search_space_is_unit_hypercube=False) 154 | 155 | self.n_mc = n_mc 156 | self.seed = seed 157 | 158 | def solve(self, optimization_function): 159 | rand = RandomState(self.seed) 160 | 161 | X = optimization_function.X 162 | y = optimization_function.y 163 | 164 | if np.any(np.round(y) != y): 165 | raise TypeError('Cannot use {} for regression problems as there are no classes to pick points from'.format( 166 | RandomTwoPointOptimizer.__name__)) 167 | 168 | dense = isinstance(X, np.ndarray) 169 | 170 | if len(set(y)) <= 1: 171 | # can't pick two points of different classes if there aren't at least two classes 172 | return 173 | 174 | # find indices of each class 175 | n_classes = int(y.max()) + 1 176 | class_indices = [np.where(y == i)[0] for i in range(n_classes)] 177 | 178 | # evaluate 'n_mc' hyperplane normals passing through two random points form different classes 179 | for i in range(self.n_mc): 180 | indices1 = [] 181 | indices2 = [] 182 | 183 | while len(indices1) == 0 or len(indices2) == 0: 184 | class1 = rand.randint(0, n_classes) 185 | indices1 = class_indices[class1] 186 | 187 | class2 = class1 188 | while class2 == class1: 189 | class2 = rand.randint(0, n_classes) 190 | 191 | indices2 = class_indices[class2] 192 | 193 | p1 = X[indices1[rand.randint(0, len(indices1))]] 194 | p2 = X[indices2[rand.randint(0, len(indices2))]] 195 | if not dense: 196 | p1 = p1.toarray()[0] 197 | p2 = p2.toarray()[0] 198 | 199 | normal = p2-p1 200 | if normal[0] < 0: 201 | normal *= -1 # make sure the first coordinate is positive to match the scipy search space 202 | 203 | optimization_function.compute(normal) 204 | 205 | 206 | class RandomHyperplaneOptimizer(HyperplaneOptimizer): 207 | """ 208 | An optimizer generating hyperplanes with random orientation 209 | in space (experimental). 
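    Candidate normals are drawn from an isotropic standard normal distribution; after
    normalization to unit length (performed in `HyperplaneOptimizationFunction.compute`)
    this yields uniformly distributed orientations on the hypersphere.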
210 | TODO: Complete 211 | """ 212 | 213 | def __init__(self, n_mc, seed): 214 | super().__init__(search_space_is_unit_hypercube=False) 215 | 216 | self.n_mc = n_mc 217 | self.seed = seed 218 | 219 | def solve(self, optimization_function): 220 | rand = RandomState(self.seed) 221 | 222 | X = optimization_function.X 223 | n_dim = X.shape[1] 224 | 225 | for i in range(self.n_mc): 226 | hyperplane_normal = rand.normal(0, 1, n_dim) 227 | optimization_function.compute(hyperplane_normal) 228 | 229 | 230 | class QuasiRandomHyperplaneOptimizer(HyperplaneOptimizer): 231 | """ 232 | An optimizer generating hyperplanes with quasi-random orientation 233 | in space, see 234 | http://extremelearning.com.au/unreasonable-effectiveness-of-quasirandom-sequences/ 235 | """ 236 | 237 | def __init__(self, n): 238 | super().__init__(search_space_is_unit_hypercube=True) 239 | 240 | self.n = n 241 | 242 | def solve(self, optimization_function): 243 | X = optimization_function.X 244 | n_dim = X.shape[1] 245 | n_dim_surface = n_dim-1 246 | 247 | # quasi-random R2 sequence 248 | r2gen = r2_series_generator(n_dim_surface) 249 | for i in range(self.n): 250 | uniform = next(r2gen) 251 | optimization_function.compute(uniform) 252 | 253 | 254 | class OptunaOptimizer(HyperplaneOptimizer): 255 | def __init__(self, n_trials, seed): 256 | super().__init__(search_space_is_unit_hypercube=True) 257 | 258 | self.n_trials = n_trials 259 | self.seed = seed 260 | 261 | def solve(self, optimization_function): 262 | from optuna import create_study 263 | from optuna.logging import set_verbosity 264 | from optuna.samplers import TPESampler 265 | 266 | study = create_study(direction='minimize', sampler=TPESampler(self.seed)) 267 | n_dim = optimization_function.X.shape[1] 268 | n_dim_surface = n_dim-1 269 | 270 | def objective(trial): 271 | uniform = np.zeros(n_dim_surface) 272 | for i in range(n_dim_surface): 273 | uniform[i] = trial.suggest_uniform(f'uniform[{i}]', 0, 1) 274 | 275 | return optimization_function.compute(uniform) 276 | 277 | set_verbosity(0) 278 | study.optimize(objective, n_trials=self.n_trials) 279 | 280 | 281 | class SimulatedAnnealingOptimizer(HyperplaneOptimizer): 282 | """ 283 | A simple simulated annealing optimizer (experimental). 
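    A minimal usage sketch (parameter values follow the suggestion given in the tree
    model docstrings; the seed is arbitrary):

        optimizer = SimulatedAnnealingOptimizer(n_scan=20, n_keep=10, spread_factor=0.95, seed=666)
        model = HyperplaneRegressionTree(optimizer=optimizer)
        model.fit(X, y)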
284 | TODO: Complete 285 | """ 286 | 287 | def __init__(self, n_scan, n_keep, spread_factor, seed): 288 | super().__init__(search_space_is_unit_hypercube=True) 289 | 290 | self.n_scan = n_scan 291 | self.n_keep = n_keep 292 | self.spread_factor = spread_factor 293 | self.seed = seed 294 | 295 | def solve(self, optimization_function): 296 | rand = RandomState(self.seed) 297 | 298 | X = optimization_function.X 299 | n_dim = X.shape[1]-1 300 | 301 | candidates = {} 302 | 303 | no_improvements = 0 304 | best_value = np.inf 305 | 306 | f = 1 307 | while no_improvements < 50: 308 | if len(candidates) == 0: 309 | # first run 310 | for i in range(self.n_scan): 311 | candidate = rand.uniform(0, 1, n_dim) 312 | value = optimization_function.compute(candidate) 313 | candidates[value] = candidate 314 | else: 315 | # evolution 316 | vectors = list(candidates.values()) 317 | ranges = [np.max([v[i] for v in vectors]) - np.min([v[i] for v in vectors]) for i in range(n_dim)] 318 | 319 | values_sorted = sorted(candidates.keys()) 320 | best_value = values_sorted[0] 321 | for i in range(self.n_keep): 322 | i_candidate = i*len(values_sorted)//self.n_keep 323 | candidate = candidates[values_sorted[i_candidate]] 324 | # perturbation = ranges * rand.uniform(-1, 1, len(ranges)) 325 | perturbation = f * rand.uniform(-1, 1, len(ranges)) 326 | new_candidate = candidate + perturbation 327 | new_candidate = np.clip(new_candidate, 0, 1) 328 | value = optimization_function.compute(new_candidate) 329 | candidates[value] = new_candidate 330 | 331 | f *= self.spread_factor 332 | 333 | # only keep the best candidates 334 | values_sorted = sorted(candidates.keys()) 335 | values_sorted = values_sorted[:self.n_keep] 336 | if values_sorted[0] < best_value: 337 | no_improvements = 0 338 | else: 339 | no_improvements += 1 340 | 341 | candidates = {v: candidates[v] for v in values_sorted} 342 | 343 | 344 | class GradientDescentOptimizer(HyperplaneOptimizer): 345 | """ 346 | A simple gradient descent optimizer (experimental). 
347 | TODO: Complete 348 | """ 349 | 350 | def __init__(self, n_init, n_keep): 351 | super().__init__(search_space_is_unit_hypercube=True) 352 | 353 | self.n_init = n_init 354 | self.n_keep = n_keep 355 | 356 | def solve(self, optimization_function): 357 | X = optimization_function.X 358 | n_dim = X.shape[1]-1 359 | 360 | rand = RandomState(666) 361 | 362 | candidates = {} 363 | 364 | no_improvements = 0 365 | best_value = np.inf 366 | 367 | start_delta = 1e-6 368 | while no_improvements < 3: 369 | if len(candidates) == 0: 370 | # first run 371 | for i in range(self.n_init): 372 | candidate = rand.uniform(0, 1, n_dim) 373 | value = optimization_function.compute(candidate) 374 | candidates[value] = candidate 375 | else: 376 | # compute numerical gradient for each of the best vectors 377 | values_sorted = sorted(candidates.keys()) 378 | best_value = values_sorted[0] 379 | for i in range(self.n_keep): 380 | i_candidate = i*len(values_sorted)//self.n_keep 381 | value = values_sorted[i_candidate] 382 | candidate = candidates[value] 383 | 384 | gradient = np.zeros(n_dim) 385 | delta = start_delta 386 | 387 | while True: 388 | delta_too_small = False 389 | 390 | for i_dim in range(n_dim): 391 | new_candidate = candidate.copy() 392 | new_candidate[i_dim] += delta 393 | if new_candidate[i_dim] > 1: 394 | delta *= -1 395 | new_candidate[i_dim] = candidate[i_dim] + delta 396 | 397 | new_value = optimization_function.compute(new_candidate) 398 | gradient[i_dim] = (new_value - value) / delta 399 | delta = np.abs(delta) 400 | if gradient[i_dim] == 0: 401 | delta_too_small = True 402 | break 403 | 404 | if delta_too_small: 405 | delta *= 10 406 | if delta >= 1: 407 | # can't compute gradient, so give up 408 | break 409 | else: 410 | break 411 | 412 | if delta_too_small: 413 | continue 414 | 415 | start_delta = delta / 10 416 | 417 | # add gradient to vector 418 | lambda_ = 1e-6 419 | best_new_candidate = candidate 420 | best_new_value = value 421 | while True: 422 | new_candidate = candidate - lambda_ * gradient 423 | new_candidate = np.clip(new_candidate, 0, 1) 424 | new_value = optimization_function.compute(new_candidate) 425 | if new_value < best_new_value: 426 | lambda_ *= 2 427 | best_new_candidate = new_candidate 428 | best_new_value = new_value 429 | else: 430 | break 431 | 432 | candidates[best_new_value] = best_new_candidate 433 | 434 | # only keep the best candidates 435 | values_sorted = sorted(candidates.keys()) 436 | values_sorted = values_sorted[:self.n_keep] 437 | if values_sorted[0] < best_value: 438 | no_improvements = 0 439 | else: 440 | no_improvements += 1 441 | 442 | candidates = {v: candidates[v] for v in values_sorted} 443 | -------------------------------------------------------------------------------- /bayesian_decision_tree/regression.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module declares the Bayesian regression tree models: 3 | * PerpendicularRegressionTree 4 | * HyperplaneRegressionTree 5 | """ 6 | import numpy as np 7 | from abc import ABC 8 | from scipy.special import gammaln 9 | from sklearn.base import RegressorMixin 10 | 11 | from bayesian_decision_tree.base import BaseTree 12 | from bayesian_decision_tree.base_hyperplane import BaseHyperplaneTree 13 | from bayesian_decision_tree.base_perpendicular import BasePerpendicularTree 14 | 15 | 16 | class BaseRegressionTree(BaseTree, ABC, RegressorMixin): 17 | """ 18 | Abstract base class of all Bayesian regression trees (perpendicular and hyperplane). 
Performs 19 | medium-level fitting and prediction tasks and outsources the low-level work to subclasses. 20 | """ 21 | 22 | def __init__(self, partition_prior, prior, delta, prune, child_type, split_precision, level=0): 23 | BaseTree.__init__(self, partition_prior, prior, delta, prune, child_type, True, split_precision, level) 24 | 25 | def _check_target(self, y): 26 | if y.ndim != 1: 27 | raise ValueError('y should have 1 dimension but has {}'.format(y.ndim)) 28 | 29 | def _compute_log_p_data_no_split(self, y, prior): 30 | y_sum = y.sum() 31 | y_squared_sum = (y ** 2).sum() 32 | n = len(y) 33 | mu_post, kappa_post, alpha_post, beta_post = self._compute_posterior_internal(prior, n, y_sum, y_squared_sum) 34 | log_p_prior = np.log(1 - self.partition_prior**(1 + self.level)) 35 | log_p_data = self._compute_log_p_data(prior, alpha_post, beta_post, kappa_post, n) 36 | 37 | return log_p_prior + log_p_data 38 | 39 | def _compute_log_p_data_split(self, y, prior, n_dim, split_indices): 40 | n = len(y) 41 | n1 = np.arange(1, n) 42 | n2 = n - n1 43 | y_sum1 = y.cumsum()[:-1] 44 | y_sum2 = y.sum() - y_sum1 45 | y_squared_sum1 = (y[:-1] ** 2).cumsum() 46 | y_squared_sum2 = (y ** 2).sum() - y_squared_sum1 47 | 48 | if len(split_indices) != len(y)-1: 49 | # we are *not* splitting between all data points -> indexing necessary 50 | split_indices_minus_1 = split_indices - 1 51 | 52 | n1 = n1[split_indices_minus_1] 53 | n2 = n2[split_indices_minus_1] 54 | y_sum1 = y_sum1[split_indices_minus_1] 55 | y_sum2 = y_sum2[split_indices_minus_1] 56 | y_squared_sum1 = y_squared_sum1[split_indices_minus_1] 57 | y_squared_sum2 = y_squared_sum2[split_indices_minus_1] 58 | 59 | mu1, kappa1, alpha1, beta1 = self._compute_posterior_internal(prior, n1, y_sum1, y_squared_sum1) 60 | mu2, kappa2, alpha2, beta2 = self._compute_posterior_internal(prior, n2, y_sum2, y_squared_sum2) 61 | 62 | n_splits = len(split_indices) 63 | log_p_prior = np.log(self.partition_prior**(1+self.level) / (n_splits * n_dim)) 64 | 65 | log_p_data1 = self._compute_log_p_data(prior, alpha1, beta1, kappa1, n1) 66 | log_p_data2 = self._compute_log_p_data(prior, alpha2, beta2, kappa2, n2) 67 | 68 | return log_p_prior + log_p_data1 + log_p_data2 69 | 70 | def _get_prior(self, n_data, n_dim): 71 | if self.prior is not None: 72 | return self.prior 73 | else: 74 | # TODO: use actual data to compute mu and tau 75 | prior_pseudo_observation_count = max(1, n_data//100) 76 | mu = 0 77 | tau = 1 78 | kappa = prior_pseudo_observation_count 79 | alpha = prior_pseudo_observation_count/2 80 | beta = alpha/tau 81 | return np.array([mu, kappa, alpha, beta]) 82 | 83 | def _compute_posterior(self, y, prior, delta=1): 84 | if delta == 0: 85 | return prior 86 | 87 | n = len(y) 88 | y_sum = y.sum() 89 | y_squared_sum = (y ** 2).sum() 90 | 91 | return self._compute_posterior_internal(prior, n, y_sum, y_squared_sum, delta) 92 | 93 | def _compute_posterior_internal(self, prior, n, y_sum, y_squared_sum, delta=1): 94 | mu, kappa, alpha, beta = prior 95 | 96 | # see https://www.cs.ubc.ca/~murphyk/Papers/bayesGauss.pdf, equations (86) - (89) 97 | n_delta = n*delta 98 | kappa_post = kappa + n_delta 99 | mu_post = (kappa * mu + n_delta * y_sum / n) / kappa_post 100 | alpha_post = alpha + 0.5*n_delta 101 | beta_post = beta + 0.5 * delta * (y_squared_sum - y_sum ** 2 / n) + 0.5 * kappa * n_delta * ( 102 | y_sum / n - mu) ** 2 / (kappa + n) 103 | 104 | return mu_post, kappa_post, alpha_post, beta_post 105 | 106 | def _compute_posterior_mean(self): 107 | return self.posterior_[0] # mu is 
the posterior mean 108 | 109 | def _compute_log_p_data(self, prior, alpha_new, beta_new, kappa_new, n_new): 110 | mu, kappa, alpha, beta = prior 111 | 112 | # see https://www.cs.ubc.ca/~murphyk/Papers/bayesGauss.pdf, equation (95) 113 | return (gammaln(alpha_new) - gammaln(alpha) 114 | + alpha*np.log(beta) - alpha_new*np.log(beta_new) 115 | + 0.5*np.log(kappa/kappa_new) 116 | - 0.5*n_new*np.log(2*np.pi)) 117 | 118 | def _predict_leaf(self): 119 | # predict posterior mean 120 | return self._compute_posterior_mean() 121 | 122 | def _get_raw_leaf_data_internal(self): 123 | # prior and posterior raw data 124 | return np.array([self.prior, self.posterior_]) 125 | 126 | 127 | class PerpendicularRegressionTree(BasePerpendicularTree, BaseRegressionTree): 128 | """ 129 | Bayesian regression tree using axes-normal splits ("perpendicular"). 130 | Uses a Normal-gamma(mu, kappa, alpha, beta) prior assuming unknown mean and unknown variance. 131 | 132 | Parameters 133 | ---------- 134 | partition_prior : float, must be > 0.0 and < 1.0, typical value: 0.9 135 | The prior probability of splitting a node's data into two children. 136 | 137 | Small values tend to reduce the tree depth, leading to less expressiveness 138 | but also to less overfitting. 139 | 140 | Large values tend to increase the tree depth and thus lead to the tree 141 | better fitting the data, which can lead to overfitting. 142 | 143 | prior : array_like, shape = [4] 144 | The prior hyperparameters [mu, kappa, alpha, beta] of the Normal-gamma 145 | distribution (see also [1], [2], [3]): 146 | 147 | - mu: prior pseudo-observation sample mean 148 | - kappa: prior pseudo-observation count used to compute mu 149 | - alpha: (prior pseudo-observation count used to compute sample variance)/2 150 | - beta: alpha * (prior pseudo-observation sample variance) 151 | 152 | It is usually easier to compute these hyperparameters off more intuitive 153 | base quantities, see examples section. 154 | 155 | delta : float, default=0.0 156 | Determines the strengthening of the prior as the tree grows deeper, 157 | see [1]. Must be a value between 0.0 and 1.0. 158 | 159 | split_precision : float, default=0.0 160 | Determines the minimum distance between two contiguous points to consider a split. If the distance is below 161 | this threshold, the points are considered to overlap along this direction. 162 | 163 | level : DO NOT SET, ONLY USED BY SUBCLASSES 164 | 165 | See also 166 | -------- 167 | demo_regression_perpendicular.py 168 | PerpendicularClassificationTree 169 | HyperplaneRegressionTree 170 | 171 | References 172 | ---------- 173 | 174 | .. [1] https://en.wikipedia.org/wiki/Normal-gamma_distribution 175 | 176 | .. [2] https://en.wikipedia.org/wiki/Normal-gamma_distribution#Interpretation_of_parameters 177 | 178 | .. [3] https://en.wikipedia.org/wiki/Conjugate_prior#Continuous_distributions 179 | 180 | Examples 181 | -------- 182 | It is usually convenient to compute the prior hyperparameters as follows: 183 | 184 | >>> # prior mean; set to the mean of the target 185 | >>> mu = ... 186 | >>> 187 | >>> # prior standard deviation; set to about 0.1 times the standard deviation of the target 188 | >>> sd_prior = ... 189 | >>> 190 | >>> # the number of prior pseudo-observations; set to roughly 1 - 10 % of the number of training samples 191 | >>> prior_pseudo_observations = ... 
192 | >>> 193 | >>> # now compute the prior 194 | >>> kappa = prior_pseudo_observations 195 | >>> alpha = prior_pseudo_observations/2 196 | >>> beta = alpha*sd_prior**2 197 | >>> prior = [mu, kappa, alpha, beta] 198 | 199 | See `demo_regression_perpendicular.py`. 200 | """ 201 | 202 | def __init__(self, partition_prior=0.99, prior=None, delta=0, prune=False, split_precision=0.0, level=0): 203 | child_type = PerpendicularRegressionTree 204 | BasePerpendicularTree.__init__(self, partition_prior, prior, delta, prune, child_type, True, split_precision, level) 205 | BaseRegressionTree.__init__(self, partition_prior, prior, delta, prune, child_type, split_precision, level) 206 | 207 | 208 | class HyperplaneRegressionTree(BaseHyperplaneTree, BaseRegressionTree): 209 | """ 210 | Bayesian regression tree using arbitrarily-oriented hyperplane splits. 211 | Uses a Normal-gamma(mu, kappa, alpha, beta) prior assuming unknown mean and unknown variance. 212 | 213 | Parameters 214 | ---------- 215 | partition_prior : float, must be > 0.0 and < 1.0, typical value: 0.9 216 | The prior probability of splitting a node's data into two children. 217 | 218 | Small values tend to reduce the tree depth, leading to less expressiveness 219 | but also to less overfitting. 220 | 221 | Large values tend to increase the tree depth and thus lead to the tree 222 | better fitting the data, which can lead to overfitting. 223 | 224 | prior : array_like, shape = [4] 225 | The prior hyperparameters [mu, kappa, alpha, beta] of the Normal-gamma 226 | distribution (see also [1], [2], [3]): 227 | 228 | - mu: prior pseudo-observation sample mean 229 | - kappa: prior pseudo-observation count used to compute mu 230 | - alpha: (prior pseudo-observation count used to compute sample variance)/2 231 | - beta: alpha * (prior pseudo-observation sample variance) 232 | 233 | It is usually easier to compute these hyperparameters off more intuitive 234 | base quantities, see examples section. 235 | 236 | delta : float, default=0.0 237 | Determines the strengthening of the prior as the tree grows deeper, 238 | see [1]. Must be a value between 0.0 and 1.0. 239 | 240 | optimizer : object 241 | A global optimization algorithm object that performs optimal hyperparameter 242 | orientation search. The available options are (in the order in which you should 243 | try them): 244 | - ScipyOptimizer: A wrapper around scipy global optimizers. See usages for examples. 245 | - SimulatedAnnealingOptimizer: Experimental, but works well with n_scan=20, n_keep=10, spread_factor=0.95 246 | - RandomHyperplaneOptimizer: Experimental, mediocre performance 247 | - RandomTwoPointOptimizer: Experimental, mediocre performance 248 | - GradientDescentOptimizer: Experimental, mediocre performance 249 | 250 | split_precision : float, default=0.0 251 | Determines the minimum distance between two contiguous points to consider a split. If the distance is below 252 | this threshold, the points are considered to overlap along this direction. 253 | 254 | level : DO NOT SET, ONLY USED BY SUBCLASSES 255 | 256 | See also 257 | -------- 258 | demo_regression_hyperplane.py 259 | HyperplaneClassificationTree 260 | PerpendicularRegressionTree 261 | 262 | References 263 | ---------- 264 | 265 | .. [1] https://en.wikipedia.org/wiki/Normal-gamma_distribution 266 | 267 | .. [2] https://en.wikipedia.org/wiki/Normal-gamma_distribution#Interpretation_of_parameters 268 | 269 | .. 
[3] https://en.wikipedia.org/wiki/Conjugate_prior#Continuous_distributions 270 | 271 | Examples 272 | -------- 273 | It is usually convenient to compute the prior hyperparameters in the same manner as for 274 | the perpendicular case, see PerpendicularRegressionTree. 275 | 276 | See `demo_regression_hyperplane.py`. 277 | """ 278 | 279 | def __init__(self, partition_prior=0.99, prior=None, delta=0, prune=False, optimizer=None, split_precision=0.0, level=0): 280 | child_type = HyperplaneRegressionTree 281 | BaseHyperplaneTree.__init__(self, partition_prior, prior, delta, prune, child_type, True, optimizer, split_precision, level) 282 | BaseRegressionTree.__init__(self, partition_prior, prior, delta, prune, child_type, split_precision, level) 283 | -------------------------------------------------------------------------------- /bayesian_decision_tree/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Generator 2 | 3 | import numpy as np 4 | from scipy.special import betaln, gammaln 5 | 6 | 7 | def multivariate_betaln(alphas): 8 | if len(alphas) == 2: 9 | return betaln(alphas[0], alphas[1]) 10 | else: 11 | # see https://en.wikipedia.org/wiki/Beta_function#Multivariate_beta_function 12 | return np.sum([gammaln(alpha) for alpha in alphas], axis=0) - gammaln(np.sum(alphas)) 13 | 14 | 15 | def r2_series_generator(n_dim: int) -> Generator[np.ndarray, None, None]: 16 | """ 17 | Computes R2 pseudo-random sequence, see 18 | http://extremelearning.com.au/unreasonable-effectiveness-of-quasirandom-sequences/ 19 | 20 | :param n_dim: The number of dimensions of the output 21 | :return: R2 series data points 22 | """ 23 | 24 | if n_dim == 0: 25 | raise ValueError(f'n_dim must be > 0 but was {n_dim}') 26 | 27 | # compute phi 28 | phi = 2 29 | phi_old = phi 30 | while True: 31 | phi = pow(1+phi, 1/(n_dim+1)) 32 | if phi == phi_old: 33 | break 34 | 35 | phi_old = phi 36 | 37 | # compute alpha array 38 | alpha = 1/phi**(1+np.arange(n_dim)) 39 | 40 | # compute R2 sequence 41 | i = 0 42 | while True: 43 | yield (0.5 + alpha * (i+1)) % 1 44 | i += 1 45 | 46 | 47 | def hypercube_to_hypersphere_surface( 48 | hypercube_points: np.ndarray, 49 | half_hypersphere: bool) -> np.ndarray: 50 | """ 51 | Converts uniformly distributed points from a D-dimensional hypercube, [0, 1]^D, 52 | to uniformly distributed points on the the D-dimensional surface of a hyperplane 53 | (embedded in (D+1)-dimensional space), see algorithm 'YPHL' in 54 | https://core.ac.uk/download/pdf/82404670.pdf with 'n' = D and 'd' = 0 (specifying 55 | the hypersphere surface rather than the volume) 56 | 57 | :param hypercube_points: A 2-dimensional array of shape N * D 58 | :param half_hypersphere: If True then map the uniform points to the half-hypersphere; 59 | if False then map to the full hypersphere 60 | :return: 61 | """ 62 | 63 | assert 1 <= hypercube_points.ndim <= 2 64 | assert np.all(hypercube_points >= 0) 65 | assert np.all(hypercube_points <= 1) 66 | 67 | n_dim_surface = hypercube_points.shape[-1] 68 | n_dim_embedding = 1+n_dim_surface 69 | if hypercube_points.ndim == 1: 70 | hypercube_points = hypercube_points.reshape(1, -1) 71 | n_points = 1 72 | else: 73 | n_points = hypercube_points.shape[0] 74 | 75 | surface_points = np.zeros((n_dim_embedding, n_points)) 76 | 77 | hypercube_points = hypercube_points.T # easier if 1st index is the dimension 78 | 79 | if n_dim_embedding % 2 == 0: 80 | # even 81 | phi = np.pi * (hypercube_points[0] - 0.5) if half_hypersphere else 2 * np.pi * 
hypercube_points[0] 82 | surface_points[0] = np.cos(phi) 83 | surface_points[1] = np.sin(phi) 84 | 85 | for i in range(1, n_dim_embedding//2): 86 | u = hypercube_points[2*i-1] 87 | h = u ** (1/(2*i)) 88 | surface_points[:2*i] *= h 89 | 90 | sqrt_rho = np.sqrt(np.maximum(0, 1-np.sum(surface_points[:2*i]**2, axis=0))) 91 | phi = 2*np.pi * hypercube_points[2*i] 92 | surface_points[2*i] = sqrt_rho*np.cos(phi) 93 | surface_points[2*i+1] = sqrt_rho*np.sin(phi) 94 | else: 95 | # odd 96 | if half_hypersphere: 97 | surface_points[0] = 1 98 | next_dim = 1 99 | else: 100 | # see https://mathworld.wolfram.com/SpherePointPicking.html 101 | assert n_dim_embedding >= 3 102 | 103 | phi = np.arccos(2 * hypercube_points[0] - 1) 104 | theta = 2 * np.pi * hypercube_points[1] 105 | surface_points[0] = np.sin(phi) * np.cos(theta) 106 | surface_points[1] = np.sin(phi) * np.sin(theta) 107 | surface_points[2] = np.cos(phi) 108 | next_dim = 2 109 | 110 | # # **old algorithm, flawed** 111 | # # in theory x[0] should be the random sign (+/- 1) which would require another 112 | # # random number, but we don't have that available, so generate pseudo-random 113 | # # bits from two sources: the data itself (even/odd bit count) and a bit from 114 | # # a deterministic quasi-random sequence 115 | # pseudo_random_bits_data = 1 * np.array([np.sum(list(struct.pack('!d', value))) % 2 == 0 for value in hypercube_points.flatten()]) 116 | # pseudo_random_bits_data = pseudo_random_bits_data.reshape(hypercube_points.shape) 117 | # pseudo_random_bits_data = np.sum(pseudo_random_bits_data, axis=0) % 2 == 0 118 | # 119 | # r2gen = r2_series_generator(n_dim=1) 120 | # pseudo_random_bits_gen = np.array([next(r2gen)[0] > 0.5 for i in range(hypercube_points.shape[1])]) 121 | # 122 | # pseudo_random_bits = pseudo_random_bits_data ^ pseudo_random_bits_gen 123 | # surface_points[0] = 2*pseudo_random_bits-1 124 | # next_dim = 1 125 | 126 | for i in range(next_dim, (n_dim_embedding + 1) // 2): 127 | u = hypercube_points[2 * i - 2] 128 | h = u ** (1 / (2 * i - 1)) 129 | surface_points[:2 * i - 1] *= h 130 | 131 | sqrt_rho = np.sqrt(np.maximum(0, 1 - np.sum(surface_points[:2 * i - 1] ** 2, axis=0))) 132 | phi = 2 * np.pi * hypercube_points[2 * i - 1] 133 | surface_points[2 * i - 1] = sqrt_rho * np.cos(phi) 134 | surface_points[2 * i] = sqrt_rho * np.sin(phi) 135 | 136 | surface_points = surface_points.squeeze().T 137 | surface_points = (surface_points.T / np.linalg.norm(surface_points, axis=-1)).T # correct numerical round-off errors 138 | 139 | return surface_points 140 | -------------------------------------------------------------------------------- /conda.recipe/conda_build_config.yaml: -------------------------------------------------------------------------------- 1 | python: 2 | - 3.6 3 | - 3.7.1 4 | -------------------------------------------------------------------------------- /conda.recipe/meta.yaml: -------------------------------------------------------------------------------- 1 | 2 | {% set data = load_setup_py_data() %} 3 | 4 | package: 5 | name: bayesian_decision_tree 6 | 7 | version: {{ data['version'] }} 8 | 9 | source: 10 | path: .. 11 | 12 | build: 13 | # If the installation is complex, or different between Unix and Windows, use 14 | # separate bld.bat and build.sh files instead of this key. Add the line 15 | # "skip: True # [py<35]" (for example) to limit to Python 3.5 and newer, or 16 | # "skip: True # [not win]" to limit to Windows. 
17 | script: python setup.py install --single-version-externally-managed --record=record.txt 18 | # if not platform dependent uncomment 19 | # uncomment for entry points to be generated 20 | # entry_points: 21 | # - hadoop_utils = hadoop_utils.cli:cli 22 | 23 | requirements: 24 | build: 25 | - python 26 | - setuptools 27 | run: 28 | - python 29 | {% for dep in data['install_requires'] %} 30 | - {{ dep.lower() }} 31 | {% endfor %} 32 | {# raw is for ignoring templating with cookiecutter, leaving it for use with conda-build #} 33 | 34 | test: 35 | source_files: 36 | - tests 37 | requires: 38 | - pytest 39 | - pytest-cov 40 | - pytest-flake8 41 | # - pytest-mypy 42 | - teamcity-messages 43 | commands: 44 | - pytest 45 | 46 | about: 47 | home: https://github.com/AA42557-QUAD-DS/bayesian_tree 48 | summary: Short description 49 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UBS-IB/bayesian_tree/718aecc68e7ea527380b8e299b4f7d69e86f7400/examples/__init__.py -------------------------------------------------------------------------------- /examples/demo_classification_hyperplane.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import accuracy_score 3 | 4 | from bayesian_decision_tree.classification import HyperplaneClassificationTree 5 | from bayesian_decision_tree.hyperplane_optimization import SimulatedAnnealingOptimizer 6 | from examples import helper 7 | 8 | # demo script for classification (binary or multiclass) using arbitrarily oriented hyperplanes 9 | if __name__ == '__main__': 10 | # proxies (in case you're running this behind a firewall) 11 | args = helper.parse_args() 12 | proxies = { 13 | 'http': args.http_proxy, 14 | 'https': args.https_proxy 15 | } 16 | 17 | # data set: uncomment one of the following sections 18 | 19 | # artificial 4-class data somewhat similar to the Ripley data 20 | # n_train = 500 21 | # n_test = 2000 22 | # x0 = [1, 3, 2, 4] 23 | # x1 = [1, 1, 3, 3] 24 | # sd = 0.7 25 | # X_train = np.zeros((n_train, 2)) 26 | # y_train = np.zeros((n_train, 1)) 27 | # X_test = np.zeros((n_test, 2)) 28 | # y_test = np.zeros((n_test, 1)) 29 | # np.random.seed(666) 30 | # for i in range(4): 31 | # X_train[i * n_train//4:(i + 1) * n_train//4, 0] = np.random.normal(x0[i], sd, n_train//4) 32 | # X_train[i * n_train//4:(i + 1) * n_train//4, 1] = np.random.normal(x1[i], sd, n_train//4) 33 | # y_train[i * n_train//4:(i + 1) * n_train//4] = i 34 | # 35 | # X_test[i * n_test//4:(i + 1) * n_test//4, 0] = np.random.normal(x0[i], sd, n_test//4) 36 | # X_test[i * n_test//4:(i + 1) * n_test//4, 1] = np.random.normal(x1[i], sd, n_test//4) 37 | # y_test[i * n_test//4:(i + 1) * n_test//4] = i 38 | # train = np.hstack((X_train, y_train)) 39 | # test = np.hstack((X_test, y_test)) 40 | 41 | np.random.seed(5) 42 | 43 | n = 10000 44 | X_train = np.random.uniform(0, 4, (n, 2)) 45 | y_train = np.zeros((n, 1)) 46 | y_train[(X_train[:, 0] >= 1) & (X_train[:, 0] < 2) & (X_train[:, 1] <= 3)] = 1 47 | y_train[(X_train[:, 0] >= 2) & (X_train[:, 0] < 3) & (X_train[:, 1] <= 1)] = 1 48 | y_train[(X_train[:, 0] >= 3)] = 1 49 | 50 | angle = 30*np.pi/180 51 | X_train_rot = X_train.copy() 52 | X_train_rot[:, 0] = np.cos(angle)*X_train[:, 0] + np.sin(angle)*X_train[:, 1] 53 | X_train_rot[:, 1] = -np.sin(angle)*X_train[:, 0] + np.cos(angle)*X_train[:, 1] 54 | X_train = 
X_train_rot 55 | 56 | train = np.hstack((X_train, y_train)) 57 | test = train 58 | 59 | # or, alternatively, load a UCI dataset 60 | # train, test = helper.load_ripley(proxies) 61 | 62 | n_classes = len(np.unique(train[:, -1])) 63 | 64 | if train is test: 65 | # perform a 50:50 train:test split if no test data is given 66 | train = train[0::2] 67 | test = test[1::2] 68 | 69 | X_train = train[:, :-1] 70 | y_train = train[:, -1] 71 | X_test = test[:, :-1] 72 | y_test = test[:, -1] 73 | 74 | # prior 75 | prior_pseudo_observations = 100 76 | prior = prior_pseudo_observations * np.ones(n_classes) 77 | 78 | # model 79 | model = HyperplaneClassificationTree( 80 | partition_prior=0.9, 81 | prior=prior, 82 | delta=0, 83 | prune=True, 84 | optimizer=SimulatedAnnealingOptimizer(10, 10, 0.9, 666)) 85 | 86 | model.fit(X_train, y_train) 87 | # train 88 | print(model) 89 | print() 90 | print('Tree depth and number of leaves: {}, {}'.format(model.get_depth(), model.get_n_leaves())) 91 | print('Feature importance:', model.feature_importance()) 92 | 93 | # compute accuracy 94 | y_pred_train = model.predict(X_train) 95 | y_pred_test = model.predict(X_test) 96 | accuracy_train = accuracy_score(y_train, y_pred_train) 97 | accuracy_test = accuracy_score(y_test, y_pred_test) 98 | info_train = 'Train accuracy: {:.4f} %'.format(100 * accuracy_train) 99 | info_test = 'Test accuracy: {:.4f} %'.format(100 * accuracy_test) 100 | print(info_train) 101 | print(info_test) 102 | 103 | # plot if 2D 104 | dimensions = X_train.shape[1] 105 | if dimensions == 2: 106 | helper.plot_2d_hyperplane(model, X_train, y_train, info_train, X_test, y_test, info_test) 107 | -------------------------------------------------------------------------------- /examples/demo_classification_perpendicular.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import accuracy_score 3 | 4 | from bayesian_decision_tree.classification import PerpendicularClassificationTree 5 | from examples import helper 6 | 7 | # demo script for classification (binary or multiclass) using classic, axis-normal splits 8 | if __name__ == '__main__': 9 | # proxies (in case you're running this behind a firewall) 10 | args = helper.parse_args() 11 | proxies = { 12 | 'http': args.http_proxy, 13 | 'https': args.https_proxy 14 | } 15 | 16 | # data set: uncomment one of the following sections 17 | 18 | # artificial 4-class data somewhat similar to the Ripley data 19 | n_train = 500 20 | n_test = 2000 21 | x0 = [1, 3, 2, 4] 22 | x1 = [1, 1, 3, 3] 23 | sd = 0.7 24 | X_train = np.zeros((n_train, 2)) 25 | y_train = np.zeros((n_train, 1)) 26 | X_test = np.zeros((n_test, 2)) 27 | y_test = np.zeros((n_test, 1)) 28 | np.random.seed(666) 29 | for i in range(4): 30 | X_train[i * n_train//4:(i + 1) * n_train//4, 0] = np.random.normal(x0[i], sd, n_train//4) 31 | X_train[i * n_train//4:(i + 1) * n_train//4, 1] = np.random.normal(x1[i], sd, n_train//4) 32 | y_train[i * n_train//4:(i + 1) * n_train//4] = i 33 | 34 | X_test[i * n_test//4:(i + 1) * n_test//4, 0] = np.random.normal(x0[i], sd, n_test//4) 35 | X_test[i * n_test//4:(i + 1) * n_test//4, 1] = np.random.normal(x1[i], sd, n_test//4) 36 | y_test[i * n_test//4:(i + 1) * n_test//4] = i 37 | train = np.hstack((X_train, y_train)) 38 | test = np.hstack((X_test, y_test)) 39 | 40 | # np.random.seed(5) 41 | # 42 | # n = 10000 43 | # X_train = np.random.uniform(0, 4, (n, 2)) 44 | # y_train = np.zeros((n, 1)) 45 | # y_train[(X_train[:, 0] >= 1) & (X_train[:, 0] < 2) & 
(X_train[:, 1] <= 3)] = 1 46 | # y_train[(X_train[:, 0] >= 2) & (X_train[:, 0] < 3) & (X_train[:, 1] <= 1)] = 1 47 | # y_train[(X_train[:, 0] >= 3)] = 1 48 | # 49 | # angle = 30*np.pi/180 50 | # X_train_rot = X_train.copy() 51 | # X_train_rot[:, 0] = np.cos(angle)*X_train[:, 0] + np.sin(angle)*X_train[:, 1] 52 | # X_train_rot[:, 1] = -np.sin(angle)*X_train[:, 0] + np.cos(angle)*X_train[:, 1] 53 | # X_train = X_train_rot 54 | # 55 | # train = np.hstack((X_train, y_train)) 56 | # test = train 57 | 58 | # or, alternatively, load a UCI dataset 59 | # train, test = helper.load_ripley(proxies) 60 | 61 | n_classes = len(np.unique(train[:, -1])) 62 | 63 | if train is test: 64 | # perform a 50:50 train:test split if no test data is given 65 | train = train[0::2] 66 | test = test[1::2] 67 | 68 | X_train = train[:, :-1] 69 | y_train = train[:, -1] 70 | X_test = test[:, :-1] 71 | y_test = test[:, -1] 72 | 73 | # prior 74 | prior_pseudo_observations = 1 75 | prior = prior_pseudo_observations * np.ones(n_classes) 76 | 77 | # model 78 | model = PerpendicularClassificationTree( 79 | partition_prior=0.9, 80 | prior=prior, 81 | delta=0, 82 | prune=True) 83 | 84 | # train 85 | model.fit(X_train, y_train) 86 | print(model) 87 | print() 88 | print('Tree depth and number of leaves: {}, {}'.format(model.get_depth(), model.get_n_leaves())) 89 | print('Feature importance:', model.feature_importance()) 90 | 91 | # compute accuracy 92 | y_pred_train = model.predict(X_train) 93 | y_pred_test = model.predict(X_test) 94 | accuracy_train = accuracy_score(y_train, y_pred_train) 95 | accuracy_test = accuracy_score(y_test, y_pred_test) 96 | info_train = 'Train accuracy: {:.4f} %'.format(100 * accuracy_train) 97 | info_test = 'Test accuracy: {:.4f} %'.format(100 * accuracy_test) 98 | print(info_train) 99 | print(info_test) 100 | 101 | # plot if 1D or 2D 102 | dimensions = X_train.shape[1] 103 | if dimensions == 1: 104 | helper.plot_1d_perpendicular(model, X_train, y_train, info_train, X_test, y_test, info_test) 105 | elif dimensions == 2: 106 | helper.plot_2d_perpendicular(model, X_train, y_train, info_train, X_test, y_test, info_test) 107 | -------------------------------------------------------------------------------- /examples/demo_classification_trading.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from scipy.linalg import expm, inv, eig 4 | from sklearn.metrics import accuracy_score, plot_confusion_matrix 5 | from sklearn.neural_network import MLPClassifier 6 | 7 | from bayesian_decision_tree.classification import PerpendicularClassificationTree 8 | 9 | 10 | def get_covariance(sigma: float, delta: float, theta: np.ndarray) -> np.ndarray: 11 | theta_p = theta + theta.T 12 | return (sigma ** 2.0) * inv(theta_p) * (np.eye(theta.shape[0]) - expm(-theta_p * delta)) 13 | 14 | 15 | def sample_gaussian(n: int, covariance: np.ndarray) -> np.ndarray: 16 | d, v = eig(covariance) 17 | a = np.dot(v, np.diag(np.sqrt(np.real(d)))) 18 | g = np.random.normal(0.0, 1.0, (a.shape[0], n)) 19 | return np.dot(a, g) 20 | 21 | 22 | def sample_mean_reversion(n: int, x0: np.ndarray, mu: np.ndarray, sigma: float, delta: float, 23 | theta: np.ndarray) -> np.ndarray: 24 | if not positive_eigenvalues(theta): 25 | raise AssertionError("Input theta does not have all positive eigenvalues") 26 | covariance = get_covariance(sigma, delta, theta) 27 | if not positive_eigenvalues(covariance): 28 | raise AssertionError("Covariance does not have all positive 
eigenvalues") 29 | gaussian_matrix = sample_gaussian(n, covariance) 30 | sample_paths = np.ndarray(gaussian_matrix.shape) 31 | sample_paths[:, [0]] = x0 32 | exp_theta = expm(-theta * delta) 33 | for i in range(1, sample_paths.shape[1]): 34 | prev = sample_paths[:, [i - 1]] 35 | sample_paths[:, [i]] = mu + np.dot(exp_theta, (prev - mu)) + gaussian_matrix[:, [i - 1]] # exact discretization of the mean-reverting (OU) process 36 | return sample_paths 37 | 38 | 39 | def positive_eigenvalues(theta: np.ndarray) -> bool: 40 | d, v = eig(theta) 41 | return np.all(np.real(d) > 0.0) 42 | 43 | 44 | # demo script for classification (binary or multiclass) using classic, axis-normal splits 45 | if __name__ == '__main__': 46 | np.random.seed(0) 47 | default_font_size = 16 48 | model_type = 'tree' # it can be 'tree' or 'nn' 49 | plt.rc('axes', titlesize=default_font_size) # fontsize of the axes title 50 | plt.rc('axes', labelsize=default_font_size) # fontsize of the x and y labels 51 | plt.rc('xtick', labelsize=default_font_size) # fontsize of the tick labels 52 | plt.rc('ytick', labelsize=default_font_size) # fontsize of the tick labels 53 | plt.rc('legend', fontsize=default_font_size) # legend fontsize 54 | plt.rc('figure', titlesize=default_font_size) # fontsize of the figure title 55 | n = 10_000 56 | n += 1 # used for the deltas 57 | mu = np.array([[100.0], [110.0], [105.0]]) 58 | theta = np.array([[2.0, -0.5, 0.0], [0.2, 1.0, 0.0], [0.0, 0.0, 0.1]]) 59 | dt = 0.1 60 | sigma = 1.0 61 | d = mu.shape[0] 62 | paths = sample_mean_reversion(n, mu, mu, sigma, dt, theta) # start each path at its long-run mean (x0 = mu) 63 | x = paths.T 64 | plt.plot(x) 65 | plt.hlines(mu, 0, n, linestyles=d * ['--'], zorder=100) 66 | plt.title('Stock prices') 67 | plt.legend(['Stock A', 'Stock B', 'Stock C']) 68 | ax = plt.gca() 69 | ax.set_xlim([0, n]) 70 | ax.set_ylim([90, 120]) 71 | plt.savefig('trading_example_prices.png') 72 | plt.show() 73 | 74 | # build class labels from the sign pattern of the next-step price moves (one class per pattern) 75 | y_diff = np.diff(x, axis=0) 76 | x = x[:-1, :] 77 | y = np.dot((np.sign(y_diff) + 1) / 2, np.reshape(2.0 ** np.arange(d), (d, 1))).astype(int) 78 | n_train = int(x.shape[0] * 0.8) 79 | X_train = x[:n_train, :] 80 | y_train = y[:n_train, :] 81 | X_test = x[n_train:, :] 82 | y_test = y[n_train:, :] 83 | y_diff_test = y_diff[n_train:, :] 84 | n_classes = len(np.unique(y)) 85 | 86 | # prior 87 | prior_strength = 1 88 | prior = prior_strength * np.array(n_classes * [1.0]) / n_classes 89 | 90 | # model 91 | if model_type == 'tree': 92 | model = PerpendicularClassificationTree( 93 | partition_prior=0.9, 94 | prior=prior, 95 | delta=0, 96 | prune=False) 97 | elif model_type == 'nn': 98 | model = MLPClassifier( 99 | hidden_layer_sizes=(10, 10), 100 | random_state=0) 101 | else: 102 | raise AssertionError('Model not included ' + model_type) 103 | 104 | # train 105 | model.fit(X_train, y_train) 106 | print(model) 107 | print() 108 | 109 | # compute accuracy 110 | y_pred_train = model.predict(X_train) 111 | y_pred_test = model.predict(X_test) 112 | positions = (2 * (y_pred_test.reshape((y_pred_test.shape[0], 1)) // 2.0 ** np.arange(d).astype(int) % 2) - 1) # decode each predicted class back into +1/-1 positions per stock 113 | accuracy_train = accuracy_score(y_train, y_pred_train) 114 | accuracy_test = accuracy_score(y_test, y_pred_test) 115 | info_train = 'Train accuracy: {:.4f} %'.format(100 * accuracy_train) 116 | info_test = 'Test accuracy: {:.4f} %'.format(100 * accuracy_test) 117 | print(info_train) 118 | print(info_test) 119 | 120 | pnl = np.cumsum(positions * y_diff_test, axis=0) 121 | plt.plot(pnl) 122 | plt.hlines(0, 0, pnl.shape[0]) 123 | ax = plt.gca() 124 | ax.set_xlim([0, 
pnl.shape[0]]) 125 | ax.set_ylim(np.array([-30, 200])) 126 | plt.grid(True) 127 | plt.title('Test period PnL') 128 | plt.legend(['Stock A', 'Stock B', 'Stock C']) 129 | plt.savefig('trading_example_pnl_' + model_type + '.png') 130 | plt.show() 131 | 132 | disp = plot_confusion_matrix(model, X_test, y_test, 133 | display_labels=[''.join( 134 | np.core.defchararray.add(['-' if x < 0 else '+' for x in (2 * row - 1)], 135 | ['A', 'B', 'C'])) for row in 136 | np.reshape(np.arange(2 ** d), (2 ** d, 1)) // 2.0 ** np.arange( 137 | d).astype(int) % 2], 138 | cmap=plt.cm.Blues, 139 | normalize='true') 140 | disp.ax_.set_title('Test period confusion matrix') 141 | plt.xticks(rotation=90) 142 | plt.savefig('trading_example_confusion_matrix_' + model_type + '.png', bbox_inches='tight') 143 | plt.show() 144 | -------------------------------------------------------------------------------- /examples/demo_regression_hyperplane.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import mean_squared_error 3 | 4 | from bayesian_decision_tree.hyperplane_optimization import SimulatedAnnealingOptimizer 5 | from bayesian_decision_tree.regression import HyperplaneRegressionTree 6 | from examples import helper 7 | 8 | # demo script for regression using using arbitrarily oriented hyperplanes 9 | if __name__ == '__main__': 10 | # proxies (in case you're running this behind a firewall) 11 | args = helper.parse_args() 12 | proxies = { 13 | 'http': args.http_proxy, 14 | 'https': args.https_proxy 15 | } 16 | 17 | # data set: uncomment one of the following sections 18 | 19 | # # synthetic sine wave 20 | # X_train = np.linspace(0, 10, 100).reshape(-1, 1) 21 | # y_train = 1 * np.sin(np.linspace(0, 10, 100)).reshape(-1, 1) 22 | # train = np.hstack((X_train, y_train)) 23 | # test = train 24 | 25 | # or, alternatively, load a UCI dataset (where we *regress* on the class labels, i.e., class 1 = 0.0 and class 2 = 1.0) 26 | train, test = helper.load_ripley(proxies) 27 | 28 | n_dim = len(np.unique(train[:, -1])) 29 | 30 | if train is test: 31 | # perform a 50:50 train:test split if no test data is given 32 | train = train[0::2] 33 | test = test[1::2] 34 | 35 | X_train = train[:, :-1] 36 | y_train = train[:, -1] 37 | X_test = test[:, :-1] 38 | y_test = test[:, -1] 39 | 40 | # prior for regression: Normal-Gamma prior, see https://en.wikipedia.org/wiki/Conjugate_prior#Continuous_distributions 41 | mu = y_train.mean() 42 | sd_prior = y_train.std() / 10 43 | prior_pseudo_observations = 10 44 | kappa = prior_pseudo_observations 45 | alpha = prior_pseudo_observations / 2 46 | var_prior = sd_prior**2 47 | tau_prior = 1/var_prior 48 | beta = alpha/tau_prior 49 | prior = np.array([mu, kappa, alpha, beta]) 50 | 51 | # model 52 | model = HyperplaneRegressionTree( 53 | partition_prior=0.9, 54 | prior=prior, 55 | delta=0, 56 | optimizer=SimulatedAnnealingOptimizer(10, 10, 0.9, 666)) 57 | 58 | # train 59 | model.fit(X_train, y_train) 60 | print(model) 61 | print() 62 | print('Tree depth and number of leaves: {}, {}'.format(model.get_depth(), model.get_n_leaves())) 63 | print('Feature importance:', model.feature_importance()) 64 | 65 | # compute RMSE 66 | rmse_train = np.sqrt(mean_squared_error(model.predict(X_train), y_train)) 67 | rmse_test = np.sqrt(mean_squared_error(model.predict(X_test), y_test)) 68 | info_train = 'RMSE train: {:.4f}'.format(rmse_train) 69 | info_test = 'RMSE test: {:.4f}'.format(rmse_test) 70 | print(info_train) 71 | print(info_test) 72 | 73 
| # plot if 2D 74 | dimensions = X_train.shape[1] 75 | if dimensions == 2: 76 | helper.plot_2d_hyperplane(model, X_train, y_train, info_train, X_test, y_test, info_test) 77 | -------------------------------------------------------------------------------- /examples/demo_regression_perpendicular.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import mean_squared_error 3 | 4 | from bayesian_decision_tree.regression import PerpendicularRegressionTree 5 | from examples import helper 6 | 7 | # demo script for regression using classic, axis-normal splits 8 | if __name__ == '__main__': 9 | # proxies (in case you're running this behind a firewall) 10 | args = helper.parse_args() 11 | proxies = { 12 | 'http': args.http_proxy, 13 | 'https': args.https_proxy 14 | } 15 | 16 | # data set: uncomment one of the following sections 17 | 18 | # # synthetic sine wave 19 | # X_train = np.linspace(0, 10, 100).reshape(-1, 1) 20 | # y_train = 1 * np.sin(np.linspace(0, 10, 100)).reshape(-1, 1) 21 | # train = np.hstack((X_train, y_train)) 22 | # test = train 23 | 24 | # or, alternatively, load a UCI dataset (where we *regress* on the class labels, i.e., class 1 = 0.0 and class 2 = 1.0) 25 | train, test = helper.load_ripley(proxies) 26 | 27 | n_dim = len(np.unique(train[:, -1])) 28 | 29 | if train is test: 30 | # perform a 50:50 train:test split if no test data is given 31 | train = train[0::2] 32 | test = test[1::2] 33 | 34 | X_train = train[:, :-1] 35 | y_train = train[:, -1] 36 | X_test = test[:, :-1] 37 | y_test = test[:, -1] 38 | 39 | # prior for regression: Normal-Gamma prior, see https://en.wikipedia.org/wiki/Conjugate_prior#Continuous_distributions 40 | mu = y_train.mean() 41 | sd_prior = y_train.std() / 10 42 | prior_pseudo_observations = 1 43 | kappa = prior_pseudo_observations 44 | alpha = prior_pseudo_observations / 2 45 | var_prior = sd_prior**2 46 | tau_prior = 1/var_prior 47 | beta = alpha/tau_prior 48 | prior = np.array([mu, kappa, alpha, beta]) 49 | 50 | # model 51 | model = PerpendicularRegressionTree( 52 | partition_prior=0.9, 53 | prior=prior, 54 | delta=0) 55 | 56 | # train 57 | model.fit(X_train, y_train) 58 | print(model) 59 | print() 60 | print('Tree depth and number of leaves: {}, {}'.format(model.get_depth(), model.get_n_leaves())) 61 | print('Feature importance:', model.feature_importance()) 62 | 63 | # compute RMSE 64 | rmse_train = np.sqrt(mean_squared_error(model.predict(X_train), y_train)) 65 | rmse_test = np.sqrt(mean_squared_error(model.predict(X_test), y_test)) 66 | info_train = 'RMSE train: {:.4f}'.format(rmse_train) 67 | info_test = 'RMSE test: {:.4f}'.format(rmse_test) 68 | print(info_train) 69 | print(info_test) 70 | 71 | # plot if 1D or 2D 72 | dimensions = X_train.shape[1] 73 | if dimensions == 1: 74 | helper.plot_1d_perpendicular(model, X_train, y_train, info_train, X_test, y_test, info_test) 75 | elif dimensions == 2: 76 | helper.plot_2d_perpendicular(model, X_train, y_train, info_train, X_test, y_test, info_test) 77 | -------------------------------------------------------------------------------- /examples/helper.py: -------------------------------------------------------------------------------- 1 | """ 2 | A collection of publicly available data sets to test classification models on, 3 | plus some helper functions for plotting. 
4 | """ 5 | import argparse 6 | import io 7 | from dataclasses import dataclass 8 | 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | import pandas as pd 12 | import requests 13 | from matplotlib import patches 14 | from sklearn.preprocessing import LabelBinarizer 15 | 16 | 17 | def parse_args(): 18 | """Parse input arguments from the command line 19 | :return: the result from the ArgumentParser 20 | """ 21 | parser = argparse.ArgumentParser( 22 | description="Run demo of binary classification") 23 | 24 | parser.add_argument( 25 | '--http_proxy', 26 | action='store', 27 | required=False, 28 | help='HTTP Proxy', 29 | default=None) 30 | 31 | parser.add_argument( 32 | '--https_proxy', 33 | action='store', required=False, 34 | help='HTTPS Proxy', 35 | default=None) 36 | 37 | return parser.parse_args() 38 | 39 | 40 | def one_hot_encode(data, columns): 41 | columns = sorted(set(columns))[::-1] 42 | 43 | def ensure_matrix(x): 44 | return x if x.ndim == 2 else np.array(x).reshape(-1, 1) 45 | 46 | for c in columns: 47 | one_hot = LabelBinarizer().fit_transform(data[:, c]) 48 | data = np.hstack(( 49 | ensure_matrix(data[:, :c]), 50 | ensure_matrix(one_hot), 51 | ensure_matrix(data[:, c+1:]) 52 | )) 53 | 54 | return data 55 | 56 | 57 | def load_credit(proxies): 58 | content = requests.get( 59 | 'https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls', 60 | proxies=proxies).content 61 | df = pd.read_excel(io.BytesIO(content)) 62 | train = df.iloc[1:, 1:].values.astype(np.float64) 63 | train = one_hot_encode(train, [2, 3]) # one-hot encode categorical features 64 | test = train 65 | return train, test 66 | 67 | 68 | def load_dermatology(proxies): 69 | # Dermatology 70 | text = requests.get('https://archive.ics.uci.edu/ml/machine-learning-databases/dermatology/dermatology.data', proxies=proxies).text 71 | lines = text.split('\n') 72 | lines = [line for line in lines if '?' 
not in line] 73 | train = np.vstack([np.fromstring(lines[i], sep=',') for i in range(len(lines)-1)]) 74 | train[:, -1] -= 1 75 | test = train 76 | return train, test 77 | 78 | 79 | def load_diabetic(proxies): 80 | # Diabetic Retinopathy 81 | text = requests.get('https://archive.ics.uci.edu/ml/machine-learning-databases/00329/messidor_features.arff', proxies=proxies).text 82 | text = text[text.index('@data'):] 83 | lines = text.split('\n')[1:] 84 | train = np.vstack([np.fromstring(lines[i], sep=',') for i in range(len(lines)-1)]) 85 | test = train 86 | return train, test 87 | 88 | 89 | def load_eeg(proxies): 90 | # load EEG eye data 91 | text = requests.get('https://archive.ics.uci.edu/ml/machine-learning-databases/00264/EEG%20Eye%20State.arff', proxies=proxies).text 92 | text = text[text.index('@DATA'):] 93 | lines = text.split('\n')[1:] 94 | train = np.vstack([np.fromstring(lines[i], sep=',') for i in range(len(lines)-1)]) 95 | test = train 96 | return train, test 97 | 98 | 99 | def load_gamma(proxies): 100 | # load MAGIC Gamma telescope data 101 | text = requests.get('https://archive.ics.uci.edu/ml/machine-learning-databases/magic/magic04.data', proxies=proxies).text 102 | text = text.replace('g', '0').replace('h', '1') 103 | lines = text.split('\n') 104 | train = np.vstack([np.fromstring(lines[i], sep=',') for i in range(len(lines)-1)]) 105 | test = train 106 | return train, test 107 | 108 | 109 | def load_glass(proxies): 110 | # load glass identificaion data 111 | text = requests.get('https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data', proxies=proxies).text 112 | lines = text.split('\n') 113 | train = np.vstack([np.fromstring(lines[i], sep=',') for i in range(len(lines)-1)]) 114 | train = train[:, 1:] # ignore ID row 115 | train[:, -1] -= 1 # convert 1..7 to 0..6 116 | train[np.where(train[:, -1] >= 4)[0], -1] -= 1 # skip missing class 117 | test = train 118 | return train, test 119 | 120 | 121 | def load_haberman(proxies): 122 | # load Haberman's dataset 123 | text = requests.get( 124 | 'https://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data', 125 | proxies=proxies).text 126 | lines = text.split('\n') 127 | train = np.vstack([np.fromstring(lines[i], sep=',') for i in range(len(lines)-1)]) 128 | train[:, -1] -= 1 129 | test = train 130 | return train, test 131 | 132 | 133 | def load_heart(proxies): 134 | text = requests.get( 135 | 'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/heart/heart.dat', proxies=proxies).text 136 | lines = text.split('\n') 137 | train = np.vstack([np.fromstring(lines[i], sep=' ') for i in range(len(lines)-1)]) 138 | train = one_hot_encode(train, [2, 6, 12]) # one-hot encode categorical features 139 | train[:, -1] -= 1 140 | test = train 141 | return train, test 142 | 143 | 144 | def load_ripley(proxies): 145 | # load Ripley's synthetic dataset 146 | def parse_ripley(text): 147 | lines = text.split('\n')[1:] 148 | return np.vstack([np.fromstring(lines[i], sep=' ') for i in range(len(lines)-1)]) 149 | train = parse_ripley(requests.get('https://www.stats.ox.ac.uk/pub/PRNN/synth.tr', proxies=proxies).text) 150 | test = parse_ripley(requests.get('https://www.stats.ox.ac.uk/pub/PRNN/synth.te', proxies=proxies).text) 151 | return train, test 152 | 153 | 154 | def load_seeds(proxies): 155 | # load wheat seeds dataset 156 | def parse_ripley(text): 157 | lines = text.split('\n') 158 | return np.vstack([np.fromstring(lines[i], sep=' ') for i in range(len(lines)-1)]) 159 | train = 
parse_ripley(requests.get('https://archive.ics.uci.edu/ml/machine-learning-databases/00236/seeds_dataset.txt', proxies=proxies).text) 160 | train[:, -1] -= 1 161 | test = train 162 | return train, test 163 | 164 | 165 | def load_seismic(proxies): 166 | # load seismic bumps dataset 167 | text = requests.get('https://archive.ics.uci.edu/ml/machine-learning-databases/00266/seismic-bumps.arff', proxies=proxies).text 168 | text = text[text.index('@data'):] 169 | text = text.replace('a', '0').replace('b', '1').replace('c', '2').replace('d', '3') 170 | text = text.replace('N', '0').replace('W', '1') 171 | lines = text.split('\n')[1:] 172 | train = np.vstack([np.fromstring(lines[i], sep=',') for i in range(len(lines)-1)]) 173 | test = train 174 | return train, test 175 | 176 | 177 | def plot_1d_perpendicular(root, X_train, y_train, info_train, X_test, y_test, info_test): 178 | plt.figure(figsize=[10, 16], dpi=75) 179 | plt.subplot(211) 180 | plt.plot(X_train[:, 0], y_train, 'o-') 181 | plt.title(info_train) 182 | draw_node_1d_perpendicular(root, bounds=(X_train[:, 0].min(), X_train[:, 0].max())) 183 | plt.xlabel('x0') 184 | plt.ylabel('x1') 185 | plt.legend() 186 | plt.gca().set_aspect(1) 187 | 188 | plt.subplot(212) 189 | plt.plot(X_test[:, 0], y_test, 'o-') 190 | draw_node_1d_perpendicular(root, bounds=(X_test[:, 0].min(), X_test[:, 0].max())) 191 | plt.title(info_test) 192 | plt.xlabel('x0') 193 | plt.ylabel('x1') 194 | plt.legend() 195 | plt.gca().set_aspect(1) 196 | 197 | plt.show() 198 | 199 | 200 | def plot_2d_perpendicular(root, X_train, y_train, info_train, X_test, y_test, info_test): 201 | plt.figure(figsize=[10, 16], dpi=75) 202 | 203 | n_classes = int(y_train.max()) + 1 204 | colormap = plt.get_cmap('gist_rainbow') 205 | 206 | def plot(X, y, info): 207 | for i in range(n_classes)[::-1]: 208 | class_i = y == i 209 | plt.plot(X[np.where(class_i)[0], 0], 210 | X[np.where(class_i)[0], 1], 211 | 'o', 212 | ms=4, 213 | c=colormap(i/n_classes), 214 | label='Class {}'.format(i), 215 | alpha=0.5) 216 | 217 | bounds = ((X[:, 0].min(), X[:, 0].max()), (X[:, 1].min(), X[:, 1].max())) 218 | draw_node_2d_perpendicular(root, bounds, colormap, n_classes) 219 | plt.title(info) 220 | plt.xlabel('x0') 221 | plt.ylabel('x1') 222 | plt.legend() 223 | 224 | plt.subplot(211) 225 | plot(X_train, y_train, info_train) 226 | plt.gca().set_aspect(1) 227 | 228 | plt.subplot(212) 229 | plot(X_test, y_test, info_test) 230 | plt.gca().set_aspect(1) 231 | 232 | plt.show() 233 | 234 | 235 | def draw_node_2d_perpendicular(node, bounds, colormap, n_classes): 236 | if node.is_leaf(): 237 | x = bounds[0][0] 238 | y = bounds[1][0] 239 | w = bounds[0][1] - x 240 | h = bounds[1][1] - y 241 | 242 | mean = node._compute_posterior_mean() 243 | if not node.is_regression: 244 | mean = (np.arange(len(mean)) * mean).sum() 245 | 246 | plt.gca().add_patch(patches.Rectangle((x, y), w, h, color=colormap(mean/n_classes), alpha=0.1, linewidth=0)) 247 | else: 248 | draw_node_2d_perpendicular(node.child1_, compute_child_bounds_2d_perpendicular(bounds, node, True), colormap, n_classes) 249 | draw_node_2d_perpendicular(node.child2_, compute_child_bounds_2d_perpendicular(bounds, node, False), colormap, n_classes) 250 | 251 | 252 | def compute_child_bounds_2d_perpendicular(bounds, parent, lower): 253 | b = bounds[parent.split_dimension_] 254 | b = (b[0], min(b[1], parent.split_value_)) if lower else (max(b[0], parent.split_value_), b[1]) 255 | return (b, bounds[1]) if parent.split_dimension_ == 0 else (bounds[0], b) 256 | 257 | 258 | def 
compute_child_bounds_1d_perpendicular(bounds, parent, lower): 259 | b = bounds 260 | b = (b[0], min(b[1], parent.split_value_)) if lower else (max(b[0], parent.split_value_), b[1]) 261 | return b 262 | 263 | 264 | def draw_node_1d_perpendicular(node, bounds): 265 | if node.is_leaf(): 266 | x0 = bounds[0] 267 | x1 = bounds[1] 268 | 269 | mean = node._compute_posterior_mean() 270 | # alpha = np.abs(mean-0.5) 271 | # alpha = max(0.1, alpha) # make sure very faint colors become visibly colored 272 | # color = color0 if mean < 0.5 else color1 273 | plt.plot([x0, x1], [mean, mean], 'r') 274 | else: 275 | draw_node_1d_perpendicular(node.child1_, compute_child_bounds_1d_perpendicular(bounds, node, True)) 276 | draw_node_1d_perpendicular(node.child2_, compute_child_bounds_1d_perpendicular(bounds, node, False)) 277 | 278 | 279 | class Line: 280 | def __init__(self, p0, p1): 281 | if p0[0] > p1[0]: 282 | p1, p0 = p0, p1 283 | 284 | self.p0 = np.asarray(p0) 285 | self.p1 = np.asarray(p1) 286 | 287 | def intersect(self, other): 288 | da = self.p1-self.p0 289 | ma = da[1]/da[0] 290 | 291 | db = other.p1-other.p0 292 | mb = db[1]/db[0] 293 | 294 | x0a = self.p0[0] 295 | x1a = self.p1[0] 296 | x0b = other.p0[0] 297 | x1b = other.p1[0] 298 | y0a = self.p0[1] 299 | y0b = other.p0[1] 300 | 301 | x = (y0a-y0b + mb*x0b-ma*x0a) / (mb-ma) 302 | y = y0a + ma*(x-x0a) 303 | 304 | if x0a <= x <= x1a and x0b <= x <= x1b: 305 | return np.array([x, y]) 306 | else: 307 | return None 308 | 309 | def plot(self, *args, **kwargs): 310 | plt.plot([self.p0[0], self.p1[0]], [self.p0[1], self.p1[1]], *args, **kwargs) 311 | 312 | def __str__(self): 313 | return f'{self.p0} -> {self.p1}' 314 | 315 | 316 | @dataclass 317 | class Parent: 318 | line: Line 319 | origin: np.ndarray 320 | normal: np.ndarray 321 | side: str 322 | 323 | 324 | # plots the root node split and all child nodes recursively 325 | def plot_root(root, X, y, title, cmap): 326 | plt.title(title) 327 | 328 | plt.plot(X[y == 0, 0], X[y == 0, 1], 'b.', ms=3) 329 | plt.plot(X[y == 1, 0], X[y == 1, 1], 'r.', ms=3) 330 | 331 | x_min = X[:, 0].min() 332 | x_max = X[:, 0].max() 333 | y_min = X[:, 1].min() 334 | y_max = X[:, 1].max() 335 | 336 | top = Line([x_min, y_max], [x_max, y_max]) 337 | bottom = Line([x_min, y_min], [x_max, y_min]) 338 | 339 | def plot_node(node, node_vs_color={}, level=0, parents=[], side=None): 340 | if node.best_hyperplane_origin_ is None: 341 | return 342 | 343 | # pick an arbitrary origin and get the normal 344 | origin = node.best_hyperplane_origin_ 345 | normal = node.best_hyperplane_normal_ 346 | 347 | # construct line segment 348 | m = -normal[0]/normal[1] 349 | y0 = origin[1] + m*(x_min-origin[0]) 350 | y1 = origin[1] + m*(x_max-origin[0]) 351 | 352 | # raw line without intersections 353 | line = Line([x_min, y0], [x_max, y1]) 354 | 355 | # intersect with parents 356 | for parent in parents: 357 | p = line.intersect(parent.line) 358 | if p is not None: 359 | # determine side of line to keep 360 | activation0 = np.dot(line.p0 - parent.origin, parent.normal) 361 | 362 | if (parent.side == 'L' and activation0 > 0) or (parent.side == 'R' and activation0 < 0): 363 | line = Line(line.p0, p) 364 | else: 365 | line = Line(p, line.p1) 366 | 367 | # intersect with top/bottom 368 | p = line.intersect(top) 369 | if p is not None: 370 | if y0 > y_max: 371 | line = Line(p, line.p1) 372 | else: 373 | line = Line(line.p0, p) 374 | 375 | p = line.intersect(bottom) 376 | if p is not None: 377 | if y0 < y_min: 378 | line = Line(p, line.p1) 379 | else: 380 | 
line = Line(line.p0, p) 381 | 382 | # generate line name 383 | if side is not None: 384 | side_name = ' - '.join(f'{parents[i].side}{level-len(parents)+i+1}' for i in range(len(parents))) 385 | else: 386 | side_name = '' 387 | 388 | side_name = 'Root' if len(side_name) == 0 else 'Root - ' + side_name 389 | 390 | # make sure node colors don't change 391 | if id(node) not in node_vs_color: 392 | color = cmap(len(node_vs_color)) 393 | node_vs_color[id(node)] = color 394 | else: 395 | color = node_vs_color[id(node)] 396 | 397 | # compute line width as a function of the stiffness 398 | stiffness = np.linalg.norm(normal) 399 | lw = 2 # 100/stiffness 400 | 401 | line.plot(color=color, label=side_name, lw=lw, alpha=0.7) 402 | 403 | if node.child1_: 404 | plot_node(node.child1_, node_vs_color, level+1, parents=parents + [Parent(line, origin, normal, 'L')], side='L') 405 | 406 | if node.child2_: 407 | plot_node(node.child2_, node_vs_color, level+1, parents=parents + [Parent(line, origin, normal, 'R')], side='R') 408 | 409 | plot_node(root) 410 | 411 | 412 | def plot_2d_hyperplane(root, X_train, y_train, info_train, X_test, y_test, info_test): 413 | plt.figure(figsize=[10, 16], dpi=75) 414 | 415 | n_classes = int(y_train.max()) + 1 416 | colormap = plt.get_cmap('gist_rainbow') 417 | 418 | x_min = min(X_train[:, 0].min(), X_test[:, 0].min()) 419 | x_max = max(X_train[:, 0].max(), X_test[:, 0].max()) 420 | y_min = min(X_train[:, 1].min(), X_test[:, 1].min()) 421 | y_max = max(X_train[:, 1].max(), X_test[:, 1].max()) 422 | 423 | def plot(X, y, info): 424 | for i in range(n_classes): 425 | class_i = y == i 426 | plt.plot(X[np.where(class_i)[0], 0], 427 | X[np.where(class_i)[0], 1], 428 | 'o', 429 | ms=4, 430 | c=colormap(i/n_classes), 431 | label='Class {}'.format(i)) 432 | 433 | plot_root(root, X, y, info, plt.get_cmap('tab20')) 434 | 435 | plt.title(info) 436 | plt.xlabel('x0') 437 | plt.ylabel('x1') 438 | plt.legend() 439 | 440 | plt.subplot(211) 441 | plot(X_train, y_train, info_train) 442 | plt.xlim((x_min, x_max)) 443 | plt.ylim((y_min, y_max)) 444 | plt.gca().set_aspect(1) 445 | 446 | plt.subplot(212) 447 | plot(X_test, y_test, info_test) 448 | plt.xlim((x_min, x_max)) 449 | plt.ylim((y_min, y_max)) 450 | plt.gca().set_aspect(1) 451 | 452 | plt.show() 453 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy-darts.hadoop_utils._version] 2 | ignore_errors=True 3 | [mypy-versioneer] 4 | ignore_errors=True 5 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 100 3 | ignore = E122,E123,E126,E127,E128,E731,E722 4 | exclude = build,bayesian_decision_tree/_version.py,tests,conda.recipe,.git,versioneer.py,benchmarks,.asv 5 | 6 | [tool:pytest] 7 | norecursedirs= .* *.egg* build dist conda.recipe 8 | addopts = 9 | --junitxml=junit.xml 10 | --ignore setup.py 11 | --ignore run_test.py 12 | --cov-report term-missing 13 | --tb native 14 | --strict 15 | --durations=20 16 | # --mypy 17 | --flake8 18 | --cov=bayesian_decision_tree 19 | env = 20 | PYTHONHASHSEED=0 21 | markers = 22 | serial: execute test serially (to avoid race conditions) 23 | 24 | [versioneer] 25 | VCS = git 26 | style = pep440-pre 27 | versionfile_source = bayesian_decision_tree/_version.py 28 | versionfile_build = bayesian_decision_tree/_version.py 29 | 
tag_prefix = ver- 30 | parentdir_prefix = bayesian_decision_tree 31 | 32 | [bdist_wheel] 33 | universal=1 34 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2018-2019 UBS AG 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from setuptools import setup, find_packages 18 | from os import path 19 | import versioneer 20 | 21 | 22 | here = path.abspath(path.dirname(__file__)) 23 | with open(path.join(here, "README.md")) as l: 24 | long_description = l.read() 25 | 26 | requirements = [ 27 | 'matplotlib>=2.2.*', 28 | 'scipy>=1.2.*', 29 | 'numpy>=1.13.*', 30 | 'pandas>=0.23.*', 31 | 'requests==2.21.0', 32 | 'scikit-learn>=0.19.*', 33 | ] 34 | 35 | setup( 36 | name='bayesian-decision-tree', 37 | version=versioneer.get_version(), 38 | description='An implementation of the paper: A Bayesian Tree Algorithm by Nuti et al.', 39 | long_description=long_description, 40 | long_description_content_type="text/markdown", 41 | url='https://github.com/UBS-IB/bayesian_tree', 42 | author='UBS SDL Data Science', 43 | author_email='dl-frc-sdl-datascience@ubs.com', 44 | classifiers=[ 45 | 'Development Status :: 3 - Alpha', 46 | 'Intended Audience :: Developers', 47 | 'Natural Language :: English', 48 | 'Programming Language :: Python :: 3.5', 49 | 'License :: OSI Approved :: Apache License Version 2.0', 50 | "Operating System :: OS Independent", 51 | ], 52 | packages=find_packages(exclude=['contrib', 'docs', 'tests']), 53 | install_requires=requirements, 54 | ) 55 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UBS-IB/bayesian_tree/718aecc68e7ea527380b8e299b4f7d69e86f7400/tests/__init__.py -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UBS-IB/bayesian_tree/718aecc68e7ea527380b8e299b4f7d69e86f7400/tests/unit/__init__.py -------------------------------------------------------------------------------- /tests/unit/helper.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from scipy.optimize._differentialevolution import DifferentialEvolutionSolver 3 | from scipy.sparse import csc_matrix, csr_matrix 4 | 5 | from bayesian_decision_tree.classification import PerpendicularClassificationTree, HyperplaneClassificationTree 6 | from bayesian_decision_tree.hyperplane_optimization import ScipyOptimizer, RandomTwoPointOptimizer 7 | from bayesian_decision_tree.hyperplane_optimization import SimulatedAnnealingOptimizer, RandomHyperplaneOptimizer 8 | from bayesian_decision_tree.regression import 
PerpendicularRegressionTree, HyperplaneRegressionTree 9 | 10 | # possible data matrix types/transforms that need to work for fit() 11 | data_matrix_transforms = [ 12 | lambda X: X, 13 | lambda X: csc_matrix(X), 14 | lambda X: csr_matrix(X), 15 | lambda X: pd.DataFrame(data=X, columns=['col-{}'.format(i) for i in range(len(X[0]))]), 16 | ] 17 | 18 | 19 | # classification tree models in all flavours 20 | def create_classification_trees(prior, partition_prior, prune=False): 21 | return [ 22 | PerpendicularClassificationTree(partition_prior, prior, prune=prune), 23 | HyperplaneClassificationTree(partition_prior, prior, delta=0, prune=prune), 24 | HyperplaneClassificationTree(partition_prior, prior, delta=0, prune=prune, optimizer=ScipyOptimizer(DifferentialEvolutionSolver, 666)), 25 | HyperplaneClassificationTree(partition_prior, prior, delta=0, prune=prune, optimizer=RandomTwoPointOptimizer(100, 666)), 26 | HyperplaneClassificationTree(partition_prior, prior, delta=0, prune=prune, optimizer=RandomHyperplaneOptimizer(100, 666)), 27 | HyperplaneClassificationTree(partition_prior, prior, delta=0, prune=prune, optimizer=SimulatedAnnealingOptimizer(10, 10, 0.9, 666)), 28 | ] 29 | 30 | 31 | # regression tree models in all flavours 32 | def create_regression_trees(prior, partition_prior): 33 | return [ 34 | PerpendicularRegressionTree(partition_prior, prior), 35 | HyperplaneRegressionTree(partition_prior, prior), 36 | HyperplaneRegressionTree(partition_prior, prior, optimizer=ScipyOptimizer(DifferentialEvolutionSolver, 666)), 37 | HyperplaneRegressionTree(partition_prior, prior, optimizer=RandomHyperplaneOptimizer(100, 666)), 38 | HyperplaneRegressionTree(partition_prior, prior, optimizer=SimulatedAnnealingOptimizer(10, 10, 0.9, 666)), 39 | ] 40 | -------------------------------------------------------------------------------- /tests/unit/test_classification.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from numpy.random import normal, randint 6 | from numpy.testing import assert_array_equal, assert_array_almost_equal 7 | 8 | from bayesian_decision_tree.classification import PerpendicularClassificationTree 9 | from tests.unit.helper import data_matrix_transforms, create_classification_trees 10 | 11 | 12 | class ClassificationTreeTest(TestCase): 13 | def test_cannot_fit_with_bad_dimensions(self): 14 | np.random.seed(6666) 15 | for good_X in [normal(0, 1, [10, 10])]: 16 | for bad_y in [randint(0, 2, []), randint(0, 2, [10, 10]), randint(0, 2, [11]), randint(0, 2, [10, 10, 10])]: 17 | for model in create_classification_trees(np.array([1, 1]), 0.5): 18 | try: 19 | model.fit(good_X, bad_y) 20 | self.fail() 21 | except ValueError: 22 | pass 23 | 24 | for bad_X in [normal(0, 1, [10, 10, 10])]: 25 | for good_y in [randint(0, 2, [10])]: 26 | for model in create_classification_trees(np.array([1, 1]), 0.5): 27 | try: 28 | model.fit(bad_X, good_y) 29 | self.fail() 30 | except ValueError: 31 | pass 32 | 33 | def test_cannot_predict_before_training(self): 34 | for model in create_classification_trees(np.array([1, 1]), 0.5): 35 | # can't predict yet 36 | try: 37 | model.predict([]) 38 | self.fail() 39 | except ValueError: 40 | pass 41 | 42 | # can't predict probability yet 43 | try: 44 | model.predict_proba([]) 45 | self.fail() 46 | except ValueError: 47 | pass 48 | 49 | def test_cannot_predict_with_bad_input_dimensions(self): 50 | for data_matrix_transform in data_matrix_transforms: 51 | 
for model in create_classification_trees(np.array([1, 1]), 0.5): 52 | Xy = np.array([ 53 | [0.0, 0.0, 0], 54 | [0.0, 1.0, 1], 55 | [1.0, 1.0, 0], 56 | [1.0, 0.0, 1], 57 | [1.0, 0.0, 0], 58 | ]) 59 | X = Xy[:, :-1] 60 | y = Xy[:, -1] 61 | 62 | X = data_matrix_transform(X) 63 | 64 | print('Testing {}'.format(type(model).__name__)) 65 | model.fit(X, y) 66 | print(model) 67 | 68 | model.predict([[0, 0]]) 69 | 70 | try: 71 | model.predict(0) 72 | self.fail() 73 | except ValueError: 74 | pass 75 | 76 | try: 77 | model.predict([0]) 78 | self.fail() 79 | except ValueError: 80 | pass 81 | 82 | try: 83 | model.predict([0, 0, 0]) 84 | self.fail() 85 | except ValueError: 86 | pass 87 | 88 | def test_print_empty_model(self): 89 | for model in create_classification_trees(np.array([1, 1]), 0.5): 90 | print(model) 91 | 92 | def test_no_split(self): 93 | for data_matrix_transform in data_matrix_transforms: 94 | for model in create_classification_trees(np.array([1, 1]), 0.5): 95 | Xy = np.array([ 96 | [0.0, 0, 0], 97 | [0.0, 1, 1], 98 | [1.0, 2, 0], 99 | [1.0, 3, 1], 100 | [1.0, 4, 0], 101 | ]) 102 | X = Xy[:, :-1] 103 | y = Xy[:, -1] 104 | 105 | X = data_matrix_transform(X) 106 | 107 | print('Testing {}'.format(type(model).__name__)) 108 | model.fit(X, y) 109 | print(model) 110 | 111 | self.assertEqual(model.get_depth(), 0) 112 | self.assertEqual(model.get_n_leaves(), 1) 113 | self.assertEqual(model.n_data_, 5) 114 | 115 | self.assertIsNone(model.child1_) 116 | self.assertIsNone(model.child2_) 117 | 118 | if isinstance(model, PerpendicularClassificationTree): 119 | self.assertEqual(model.split_dimension_, -1) 120 | self.assertEqual(model.split_value_, None) 121 | else: 122 | self.assertEqual(model.best_hyperplane_origin_, None) 123 | self.assertEqual(model.best_hyperplane_normal_, None) 124 | 125 | expected = np.array([0, 0, 0, 0]) 126 | self.assertEqual(model.predict([[0, 0]]), expected[0]) 127 | self.assertEqual(model.predict([[0, 1]]), expected[1]) 128 | self.assertEqual(model.predict([[1, 0]]), expected[2]) 129 | self.assertEqual(model.predict([[1, 1]]), expected[3]) 130 | 131 | for data_matrix_transform2 in data_matrix_transforms: 132 | assert_array_equal(model.predict(data_matrix_transform2([[0, 0], [0, 1], [1, 0], [1, 1]])), expected) 133 | 134 | expected = np.array([[4/7, 3/7], [4/7, 3/7], [4/7, 3/7], [4/7, 3/7], ]) 135 | assert_array_almost_equal(model.predict_proba([[0, 0]]), np.expand_dims(expected[0], 0)) 136 | assert_array_almost_equal(model.predict_proba([[0, 1]]), np.expand_dims(expected[1], 0)) 137 | assert_array_almost_equal(model.predict_proba([[1, 0]]), np.expand_dims(expected[2], 0)) 138 | assert_array_almost_equal(model.predict_proba([[1, 1]]), np.expand_dims(expected[3], 0)) 139 | 140 | for data_matrix_transform2 in data_matrix_transforms: 141 | assert_array_almost_equal(model.predict_proba(data_matrix_transform2([[0, 0], [0, 1], [1, 0], [1, 1]])), expected) 142 | 143 | if isinstance(model, PerpendicularClassificationTree): 144 | # TODO: also add for hyperplane version 145 | expected_paths = [ 146 | [], 147 | [], 148 | [], 149 | [], 150 | ] 151 | self.assertEqual(model.prediction_paths([[0, 0]]), [expected_paths[0]]) 152 | self.assertEqual(model.prediction_paths([[0, 1]]), [expected_paths[1]]) 153 | self.assertEqual(model.prediction_paths([[1, 0]]), [expected_paths[2]]) 154 | self.assertEqual(model.prediction_paths([[1, 1]]), [expected_paths[3]]) 155 | 156 | for data_matrix_transform2 in data_matrix_transforms: 157 | self.assertEqual(model.prediction_paths(data_matrix_transform2([[0, 
0], [0, 1], [1, 0], [1, 1]])), expected_paths) 158 | 159 | def test_one_split(self): 160 | for data_matrix_transform in data_matrix_transforms: 161 | for model in create_classification_trees(np.array([1, 1]), 0.7): 162 | Xy = np.array([ 163 | [0.0, 0, 0], 164 | [0.1, 1, 0], 165 | 166 | [0.9, 0, 1], 167 | [1.0, 1, 1], 168 | ]) 169 | X = Xy[:, :-1] 170 | y = Xy[:, -1] 171 | 172 | X = data_matrix_transform(X) 173 | 174 | print('Testing {}'.format(type(model).__name__)) 175 | model.fit(X, y) 176 | print(model) 177 | 178 | self.assertEqual(model.get_depth(), 1) 179 | self.assertEqual(model.get_n_leaves(), 2) 180 | self.assertEqual(model.n_data_, 4) 181 | 182 | self.assertIsNotNone(model.child1_) 183 | self.assertIsNone(model.child1_.child1_) 184 | self.assertIsNone(model.child1_.child2_) 185 | self.assertEqual(model.child1_.n_data_, 2) 186 | 187 | self.assertIsNotNone(model.child2_) 188 | self.assertIsNone(model.child2_.child1_) 189 | self.assertIsNone(model.child2_.child2_) 190 | self.assertEqual(model.child1_.n_data_, 2) 191 | 192 | if isinstance(model, PerpendicularClassificationTree): 193 | self.assertEqual(model.split_dimension_, 0) 194 | self.assertEqual(model.split_value_, 0.5) 195 | else: 196 | self.assertTrue(0.1 < model.best_hyperplane_origin_[0] < 0.9) 197 | 198 | expected = np.array([0, 0, 1, 1]) 199 | self.assertEqual(model.predict([[0, 0]]), expected[0]) 200 | self.assertEqual(model.predict([[0, 1]]), expected[1]) 201 | self.assertEqual(model.predict([[1, 0]]), expected[2]) 202 | self.assertEqual(model.predict([[1, 1]]), expected[3]) 203 | 204 | for data_matrix_transform2 in data_matrix_transforms: 205 | assert_array_equal(model.predict(data_matrix_transform2([[0, 0], [0, 1], [1, 0], [1, 0]])), expected) 206 | 207 | expected = np.array([[3/4, 1/4], [3/4, 1/4], [1/4, 3/4], [1/4, 3/4]]) 208 | assert_array_almost_equal(model.predict_proba([[0, 0]]), np.expand_dims(expected[0], 0)) 209 | assert_array_almost_equal(model.predict_proba([[0, 1]]), np.expand_dims(expected[1], 0)) 210 | assert_array_almost_equal(model.predict_proba([[1, 0]]), np.expand_dims(expected[2], 0)) 211 | assert_array_almost_equal(model.predict_proba([[1, 1]]), np.expand_dims(expected[3], 0)) 212 | 213 | for data_matrix_transform2 in data_matrix_transforms: 214 | assert_array_almost_equal(model.predict_proba(data_matrix_transform2([[0, 0], [0, 1], [1, 0], [1, 0]])), expected) 215 | 216 | def test_two_splits(self): 217 | for data_matrix_transform in data_matrix_transforms: 218 | for model in create_classification_trees(np.array([1, 1]), 0.9, prune=True): 219 | Xy = np.array([ 220 | [0.0, 0.0, 0], 221 | [0.1, 1.0, 0], 222 | [0.2, 0.01, 0], 223 | [0.3, 0.99, 0], 224 | 225 | [0.7, 0.02, 1], 226 | [0.8, 0.98, 1], 227 | [0.9, 0.03, 1], 228 | [1.0, 0.97, 1], 229 | 230 | [2.0, 0.04, 0], 231 | [2.1, 0.96, 0], 232 | ]) 233 | X = Xy[:, :-1] 234 | y = Xy[:, -1] 235 | 236 | X = data_matrix_transform(X) 237 | 238 | print('Testing {}'.format(type(model).__name__)) 239 | model.fit(X, y) 240 | print(model) 241 | 242 | if isinstance(model, PerpendicularClassificationTree): 243 | self.assertEqual(model.get_depth(), 2) 244 | self.assertEqual(model.get_n_leaves(), 3) 245 | self.assertEqual(model.n_data_, 10) 246 | 247 | self.assertIsNotNone(model.child1_) 248 | self.assertEqual(model.child1_.n_data_, 4) 249 | self.assertIsNone(model.child1_.child1_) 250 | self.assertIsNone(model.child1_.child2_) 251 | 252 | self.assertIsNotNone(model.child2_) 253 | self.assertEqual(model.child2_.n_data_, 6) 254 | 
self.assertIsNotNone(model.child2_.child1_) 255 | self.assertEqual(model.child2_.child1_.n_data_, 4) 256 | self.assertIsNotNone(model.child2_.child2_) 257 | self.assertEqual(model.child2_.child2_.n_data_, 2) 258 | 259 | self.assertIsNone(model.child2_.child1_.child1_) 260 | self.assertIsNone(model.child2_.child1_.child2_) 261 | self.assertIsNone(model.child2_.child2_.child1_) 262 | self.assertIsNone(model.child2_.child2_.child2_) 263 | 264 | self.assertEqual(model.split_dimension_, 0) 265 | self.assertEqual(model.split_value_, 0.5) 266 | 267 | self.assertEqual(model.child2_.split_dimension_, 0) 268 | self.assertEqual(model.child2_.split_value_, 1.5) 269 | else: 270 | self.assertEqual(model.get_depth(), 2) 271 | self.assertEqual(model.get_n_leaves(), 3) 272 | self.assertEqual(model.n_data_, 10) 273 | 274 | self.assertTrue(0.3 < model.best_hyperplane_origin_[0] < 0.7) 275 | if model.child1_.best_hyperplane_origin_ is not None: 276 | self.assertTrue(1.0 < model.child1_.best_hyperplane_origin_[0] < 2.0) 277 | self.assertEqual(model.child1_.n_data_, 6) 278 | self.assertEqual(model.child2_.n_data_, 4) 279 | else: 280 | self.assertTrue(1.0 < model.child2_.best_hyperplane_origin_[0] < 2.0) 281 | self.assertEqual(model.child1_.n_data_, 4) 282 | self.assertEqual(model.child2_.n_data_, 6) 283 | 284 | expected = np.array([0, 0, 1, 1, 0, 0]) 285 | self.assertEqual(model.predict([[0, 0.5]]), expected[0]) 286 | self.assertEqual(model.predict([[0.4, 0.5]]), expected[1]) 287 | self.assertEqual(model.predict([[0.6, 0.5]]), expected[2]) 288 | self.assertEqual(model.predict([[1.4, 0.5]]), expected[3]) 289 | self.assertEqual(model.predict([[1.6, 0.5]]), expected[4]) 290 | self.assertEqual(model.predict([[100, 0.5]]), expected[5]) 291 | 292 | for data_matrix_transform2 in data_matrix_transforms: 293 | assert_array_equal(model.predict(data_matrix_transform2( 294 | [[0.0, 0.5], [0.4, 0.5], [0.6, 0.5], [1.4, 0.5], [1.6, 0.5], [100, 0.5]]) 295 | ), expected) 296 | 297 | expected = np.array([[5/6, 1/6], [5/6, 1/6], [1/6, 5/6], [1/6, 5/6], [3/4, 1/4], [3/4, 1/4]]) 298 | assert_array_almost_equal(model.predict_proba([[0, 0.5]]), np.expand_dims(expected[0], 0)) 299 | assert_array_almost_equal(model.predict_proba([[0.4, 0.5]]), np.expand_dims(expected[1], 0)) 300 | assert_array_almost_equal(model.predict_proba([[0.6, 0.5]]), np.expand_dims(expected[2], 0)) 301 | assert_array_almost_equal(model.predict_proba([[1.4, 0.5]]), np.expand_dims(expected[3], 0)) 302 | assert_array_almost_equal(model.predict_proba([[1.6, 0.5]]), np.expand_dims(expected[4], 0)) 303 | assert_array_almost_equal(model.predict_proba([[100, 0.5]]), np.expand_dims(expected[5], 0)) 304 | 305 | for data_matrix_transform2 in data_matrix_transforms: 306 | assert_array_equal(model.predict_proba(data_matrix_transform2( 307 | [[0.0, 0.5], [0.4, 0.5], [0.6, 0.5], [1.4, 0.5], [1.6, 0.5], [100, 0.5]]) 308 | ), expected) 309 | 310 | if isinstance(model, PerpendicularClassificationTree): 311 | # TODO: also add for hyperplane version 312 | feature_names = X.columns if isinstance(X, pd.DataFrame) else ['x{}'.format(i) for i in range(X.shape[1])] 313 | expected_paths = [ 314 | [(0, feature_names[0], 0.5, False)], 315 | [(0, feature_names[0], 0.5, False)], 316 | [(0, feature_names[0], 0.5, True), (0, feature_names[0], 1.5, False)], 317 | [(0, feature_names[0], 0.5, True), (0, feature_names[0], 1.5, False)], 318 | [(0, feature_names[0], 0.5, True), (0, feature_names[0], 1.5, True)], 319 | [(0, feature_names[0], 0.5, True), (0, feature_names[0], 1.5, True)], 320 | ] 321 
| self.assertEqual(model.prediction_paths([[0, 0.5]]), [expected_paths[0]]) 322 | self.assertEqual(model.prediction_paths([[0.4, 0.5]]), [expected_paths[1]]) 323 | self.assertEqual(model.prediction_paths([[0.6, 0.5]]), [expected_paths[2]]) 324 | self.assertEqual(model.prediction_paths([[1.4, 0.5]]), [expected_paths[3]]) 325 | self.assertEqual(model.prediction_paths([[1.6, 0.5]]), [expected_paths[4]]) 326 | self.assertEqual(model.prediction_paths([[100, 0.5]]), [expected_paths[5]]) 327 | 328 | for data_matrix_transform2 in data_matrix_transforms: 329 | self.assertEqual(model.prediction_paths(data_matrix_transform2( 330 | [[0.0, 0.5], [0.4, 0.5], [0.6, 0.5], [1.4, 0.5], [1.6, 0.5], [100, 0.5]]) 331 | ), expected_paths) 332 | 333 | def test_prune(self): 334 | for model_no_prune, model_prune in zip( 335 | create_classification_trees(np.array([10, 10]), 0.9, prune=False), 336 | create_classification_trees(np.array([10, 10]), 0.9, prune=True)): 337 | np.random.seed(666) 338 | 339 | X = np.vstack([ 340 | normal(0, 1, [100, 2]), 341 | normal(10, 1, [100, 2]), 342 | normal(14, 1, [100, 2]), 343 | ]) 344 | y = np.hstack([ 345 | 0 * np.ones(100), 346 | 1 * np.ones(100), 347 | np.minimum(1, randint(0, 3, 100)), # about two thirds should be 1's 348 | ]) 349 | 350 | # make sure model_no_prune finds two splits at 5 and 12 and that model_prune 351 | # only finds one (because everything >= 5 has target 1) 352 | model_no_prune.fit(X, y) 353 | model_prune.fit(X, y) 354 | self.assertEqual(model_no_prune.get_depth(), 2) 355 | self.assertEqual(model_no_prune.get_n_leaves(), 3) 356 | self.assertEqual(model_prune.get_depth(), 1) 357 | self.assertEqual(model_prune.get_n_leaves(), 2) 358 | 359 | # now make sure the node that is the result of pruning two children is consistent 360 | c1 = model_no_prune.child2_.child1_ 361 | c2 = model_no_prune.child2_.child2_ 362 | c12 = model_prune.child2_ 363 | assert_array_equal(c12.posterior_, c1.posterior_ + c2.posterior_ - c12.prior) 364 | 365 | def test_feature_importance_consistency_when_mirroring_along_axes(self): 366 | np.random.seed(42) 367 | 368 | n = 200 369 | X0 = np.zeros((n, 2)) 370 | sd = 3 371 | X0[0*n//4:1*n//4] = np.random.normal([2, 2], sd, (n//4, 2)) 372 | X0[1*n//4:2*n//4] = np.random.normal([-2, 1], sd, (n//4, 2)) 373 | X0[2*n//4:3*n//4] = np.random.normal([-2, -1], sd, (n//4, 2)) 374 | X0[3*n//4:4*n//4] = np.random.normal([-2, -2], sd, (n//4, 2)) 375 | 376 | y = np.zeros(n) 377 | y[0*n//4:1*n//4] = 1 378 | y[2*n//4:3*n//4] = 1 379 | 380 | for m1, m2, m3, m4 in zip( 381 | create_classification_trees(np.array([1, 1]), 0.99, prune=True), 382 | create_classification_trees(np.array([1, 1]), 0.99, prune=True), 383 | create_classification_trees(np.array([1, 1]), 0.99, prune=True), 384 | create_classification_trees(np.array([1, 1]), 0.99, prune=True)): 385 | 386 | X1 = np.vstack((+X0[:, 0], +X0[:, 1])).T 387 | X2 = np.vstack((+X0[:, 0], -X0[:, 1])).T 388 | X3 = np.vstack((-X0[:, 0], +X0[:, 1])).T 389 | X4 = np.vstack((-X0[:, 0], -X0[:, 1])).T 390 | 391 | print('Testing {}'.format(type(m1).__name__)) 392 | 393 | m1.fit(X1, y) 394 | m2.fit(X2, y) 395 | m3.fit(X3, y) 396 | m4.fit(X4, y) 397 | 398 | fi1 = m1.feature_importance() 399 | fi2 = m2.feature_importance() 400 | fi3 = m3.feature_importance() 401 | fi4 = m4.feature_importance() 402 | 403 | self.assertTrue(np.all(fi1 != 0)) 404 | assert_array_almost_equal(fi1, fi2, decimal=1) 405 | assert_array_almost_equal(fi1, fi3, decimal=1) 406 | assert_array_almost_equal(fi1, fi4, decimal=1) 407 | 
assert_array_almost_equal(fi2, fi3, decimal=1) 408 | assert_array_almost_equal(fi2, fi4, decimal=1) 409 | assert_array_almost_equal(fi3, fi4, decimal=1) 410 | -------------------------------------------------------------------------------- /tests/unit/test_regression.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | import numpy as np 4 | from numpy.testing import assert_array_equal 5 | from sklearn.metrics import mean_squared_error 6 | 7 | from bayesian_decision_tree.regression import PerpendicularRegressionTree 8 | from tests.unit.helper import data_matrix_transforms, create_regression_trees 9 | 10 | 11 | class RegressionTreeTest(TestCase): 12 | def test_cannot_predict_before_training(self): 13 | mu = 0 14 | sd_prior = 1 15 | prior_obs = 0.01 16 | kappa = prior_obs 17 | alpha = prior_obs/2 18 | var_prior = sd_prior**2 19 | tau_prior = 1/var_prior 20 | beta = alpha/tau_prior 21 | 22 | prior = np.array([mu, kappa, alpha, beta]) 23 | 24 | for model in create_regression_trees(prior, 0.5): 25 | # can't predict yet 26 | try: 27 | model.predict([]) 28 | self.fail() 29 | except ValueError: 30 | pass 31 | 32 | def test_cannot_predict_with_bad_input_dimensions(self): 33 | mu = 0 34 | sd_prior = 1 35 | prior_obs = 0.01 36 | kappa = prior_obs 37 | alpha = prior_obs/2 38 | var_prior = sd_prior**2 39 | tau_prior = 1/var_prior 40 | beta = alpha/tau_prior 41 | 42 | prior = np.array([mu, kappa, alpha, beta]) 43 | 44 | for data_matrix_transform in data_matrix_transforms: 45 | for model in create_regression_trees(prior, 0.5): 46 | Xy = np.array([ 47 | [0.0, 0.0, 0.1], 48 | [0.0, 1.0, 1.0], 49 | [1.0, 1.0, 0.1], 50 | [1.0, 0.0, 1.0], 51 | [1.0, 0.0, 0.1], 52 | ]) 53 | X = Xy[:, :-1] 54 | y = Xy[:, -1] 55 | 56 | X = data_matrix_transform(X) 57 | 58 | print('Testing {}'.format(type(model).__name__)) 59 | model.fit(X, y) 60 | print(model) 61 | 62 | model.predict([[0, 0]]) 63 | 64 | try: 65 | model.predict(0) 66 | self.fail() 67 | except ValueError: 68 | pass 69 | 70 | try: 71 | model.predict([0]) 72 | self.fail() 73 | except ValueError: 74 | pass 75 | 76 | try: 77 | model.predict([0, 0, 0]) 78 | self.fail() 79 | except ValueError: 80 | pass 81 | 82 | def test_print_empty_model(self): 83 | for model in create_regression_trees(np.array([1, 1]), 0.5): 84 | print(model) 85 | 86 | def test_no_split(self): 87 | for data_matrix_transform in data_matrix_transforms: 88 | mu = 0 89 | sd_prior = 1 90 | prior_obs = 0.01 91 | kappa = prior_obs 92 | alpha = prior_obs/2 93 | var_prior = sd_prior**2 94 | tau_prior = 1/var_prior 95 | beta = alpha/tau_prior 96 | 97 | prior = np.array([mu, kappa, alpha, beta]) 98 | 99 | for model in create_regression_trees(prior, 0.5): 100 | Xy = np.array([ 101 | [0.0, 0.0, 0], 102 | [0.1, 0.1, 1.3], 103 | [0.9, 0.9, 0], 104 | [1.0, 1.0, 1.2], 105 | [1.0, 1.0, 0], 106 | ]) 107 | X = Xy[:, :-1] 108 | y = Xy[:, -1] 109 | 110 | X = data_matrix_transform(X) 111 | 112 | print('Testing {}'.format(type(model).__name__)) 113 | model.fit(X, y) 114 | print(model) 115 | 116 | self.assertEqual(model.get_depth(), 0) 117 | self.assertEqual(model.get_n_leaves(), 1) 118 | self.assertEqual(model.n_data_, 5) 119 | 120 | self.assertIsNone(model.child1_) 121 | self.assertIsNone(model.child2_) 122 | 123 | if isinstance(model, PerpendicularRegressionTree): 124 | self.assertEqual(model.split_dimension_, -1) 125 | self.assertEqual(model.split_value_, None) 126 | else: 127 | self.assertEqual(model.best_hyperplane_origin_, None) 128 | 129 | n = 
len(y) 130 | mean = y.mean() 131 | mu, kappa, alpha, beta = prior 132 | kappa_post = kappa + n 133 | mu_post = (kappa*mu + n*mean) / kappa_post 134 | 135 | expected = np.array([mu_post, mu_post, mu_post, mu_post]) 136 | self.assertEqual(model.predict([[0.0, 0.5]]), np.expand_dims(expected[0], 0)) 137 | self.assertEqual(model.predict([[0.49, 0.5]]), np.expand_dims(expected[1], 0)) 138 | self.assertEqual(model.predict([[0.51, 0.5]]), np.expand_dims(expected[2], 0)) 139 | self.assertEqual(model.predict([[1.0, 0.5]]), np.expand_dims(expected[3], 0)) 140 | 141 | for data_matrix_transform2 in data_matrix_transforms: 142 | assert_array_equal(model.predict(data_matrix_transform2([[0.0, 0.5], [0.49, 0.5], [0.51, 0.5], [1.0, 0.5]])), expected) 143 | 144 | def test_decreasing_mse_for_increased_partition_prior(self): 145 | for data_matrix_transform in data_matrix_transforms: 146 | mu = 0 147 | sd_prior = 1 148 | prior_obs = 0.01 149 | kappa = prior_obs 150 | alpha = prior_obs/2 151 | var_prior = sd_prior**2 152 | tau_prior = 1/var_prior 153 | beta = alpha/tau_prior 154 | 155 | prior = np.array([mu, kappa, alpha, beta]) 156 | 157 | x = np.linspace(-np.pi/2, np.pi/2, 20) 158 | y = np.linspace(-np.pi/2, np.pi/2, 20) 159 | X = np.array([x, y]).T 160 | y = np.sin(x) + 3*np.cos(y) 161 | 162 | X = data_matrix_transform(X) 163 | 164 | for i_model in range(len(create_regression_trees(prior, 0.5))): 165 | mse_list = [] 166 | for partition_prior in [0.1, 0.5, 0.9, 0.99]: 167 | model = create_regression_trees(prior, partition_prior)[i_model] 168 | print('Testing {}'.format(type(model).__name__)) 169 | model.fit(X, y) 170 | print(model) 171 | mse = mean_squared_error(y, model.predict(X)) 172 | mse_list.append(mse) 173 | 174 | self.assertTrue(mse_list[-1] < mse_list[0]) 175 | for i in range(0, len(mse_list)-1): 176 | self.assertTrue(mse_list[i+1] <= mse_list[i]) 177 | -------------------------------------------------------------------------------- /tests/unit/test_utils.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from unittest import TestCase 3 | 4 | import numpy as np 5 | from numpy.testing import assert_almost_equal 6 | 7 | from bayesian_decision_tree.utils import hypercube_to_hypersphere_surface 8 | 9 | 10 | class UtilsTest(TestCase): 11 | def test_hypercube_to_hypersphere_surface_2D_full_single_point(self): 12 | hc = np.array([0.2, 0.9]) 13 | hs = hypercube_to_hypersphere_surface(hc, half_hypersphere=False) 14 | 15 | # check dimensionality and norms 16 | self.assertEqual(hs.ndim, 1) 17 | self.assertEqual(hs.shape, (3,)) 18 | assert_almost_equal(np.linalg.norm(hs), 1) 19 | 20 | def test_hypercube_to_hypersphere_surface_1D_full(self): 21 | n_points = 11 22 | hc = np.linspace(0, 1, n_points).reshape(-1, 1) 23 | hs = hypercube_to_hypersphere_surface(hc, half_hypersphere=False) 24 | 25 | # check dimensionality and norms 26 | self.assertEqual(hs.ndim, 2) 27 | self.assertEqual(hs.shape, (n_points, 2)) 28 | assert_almost_equal(np.linalg.norm(hs, axis=1), 1) 29 | 30 | # check uniformity 31 | expected_cos = np.dot(hs[0], hs[1]) 32 | for i in range(1, n_points): 33 | cos = np.dot(hs[i-1], hs[i]) 34 | assert_almost_equal(cos, expected_cos) 35 | 36 | cos = np.dot(hs[0], hs[-2]) 37 | assert_almost_equal(cos, expected_cos) 38 | 39 | cos = np.dot(hs[0], hs[-1]) 40 | assert_almost_equal(cos, 1.0) 41 | 42 | def test_hypercube_to_hypersphere_surface_1D_half(self): 43 | n_points = 11 44 | hc = np.linspace(0, 1, n_points).reshape(-1, 1) 45 | hs = 
hypercube_to_hypersphere_surface(hc, half_hypersphere=True) 46 | 47 | # check dimensionality and norms 48 | self.assertEqual(hs.ndim, 2) 49 | self.assertEqual(hs.shape, (n_points, 2)) 50 | assert_almost_equal(np.linalg.norm(hs, axis=1), 1) 51 | 52 | # check uniformity 53 | expected_cos = np.dot(hs[0], hs[1]) 54 | for i in range(1, n_points): 55 | cos = np.dot(hs[i-1], hs[i]) 56 | assert_almost_equal(cos, expected_cos) 57 | 58 | cos = np.dot(hs[0], hs[-2]) 59 | assert_almost_equal(cos, -expected_cos) 60 | 61 | cos = np.dot(hs[0], hs[-1]) 62 | assert_almost_equal(cos, -1.0) 63 | 64 | def test_hypercube_to_hypersphere_surface_2D_full(self): 65 | n_points_per_dim = 1000 66 | n_points = n_points_per_dim**2 67 | grid = np.linspace(0, 1, n_points_per_dim) 68 | x, y = np.meshgrid(grid, grid) 69 | hc = np.array([x.flatten(), y.flatten()]).T 70 | hs = hypercube_to_hypersphere_surface(hc, half_hypersphere=False) 71 | 72 | # check dimensionality and norms 73 | self.assertEqual(hs.ndim, 2) 74 | self.assertEqual(hs.shape, (n_points, 2+1)) 75 | assert_almost_equal(np.linalg.norm(hs, axis=1), 1) 76 | 77 | # make sure all quadrants contain approximately the same number of data points 78 | tolerance_fraction = 0.01 79 | for quadrant_signs in itertools.product([-1, 1], [-1, 1], [-1, 1]): 80 | in_quadrant = np.all(hs * quadrant_signs > 0, axis=1).sum() 81 | min = n_points / 2**(2+1) * (1-tolerance_fraction) 82 | max = n_points / 2**(2+1) * (1+tolerance_fraction) 83 | msg = f'Expected a value between {min:.0f} and {max:.0f}, but was {in_quadrant}' 84 | self.assertTrue(min <= np.sum(in_quadrant) <= max, msg=msg) 85 | 86 | def test_hypercube_to_hypersphere_surface_2D_half(self): 87 | n_points_per_dim = 1000 88 | n_points = n_points_per_dim**2 89 | grid = np.linspace(0, 1, n_points_per_dim) 90 | x, y = np.meshgrid(grid, grid) 91 | hc = np.array([x.flatten(), y.flatten()]).T 92 | hs = hypercube_to_hypersphere_surface(hc, half_hypersphere=True) 93 | 94 | # check dimensionality and norms 95 | self.assertEqual(hs.ndim, 2) 96 | self.assertEqual(hs.shape, (n_points, 2+1)) 97 | assert_almost_equal(np.linalg.norm(hs, axis=1), 1) 98 | 99 | # make sure all quadrants contain approximately the same number of data points 100 | tolerance_fraction = 0.01 101 | for quadrant_signs in itertools.product([-1, 1], [-1, 1], [-1, 1]): 102 | in_quadrant = np.all(hs * quadrant_signs > 0, axis=1).sum() 103 | if quadrant_signs[0] == -1: 104 | self.assertEqual(np.sum(in_quadrant), 0) 105 | else: 106 | min = n_points / 2**2 * (1-tolerance_fraction) 107 | max = n_points / 2**2 * (1+tolerance_fraction) 108 | msg = f'Expected a value between {min:.0f} and {max:.0f} in quadrant {quadrant_signs}, but was {in_quadrant}' 109 | self.assertTrue(min <= np.sum(in_quadrant) <= max, msg) 110 | 111 | def test_hypercube_to_hypersphere_surface_5D_full(self): 112 | n_points = 1_000_000 113 | np.random.seed(666) 114 | hc = np.random.uniform(0, 1, (n_points, 5)) 115 | hs = hypercube_to_hypersphere_surface(hc, half_hypersphere=False) 116 | # hs = np.random.normal(0, 1, hs.shape) 117 | 118 | # check dimensionality and norms 119 | self.assertEqual(hs.ndim, 2) 120 | self.assertEqual(hs.shape, (n_points, 5+1)) 121 | assert_almost_equal(np.linalg.norm(hs, axis=1), 1) 122 | 123 | # make sure all quadrants contain approximately the same number of data points 124 | tolerance_fraction = 0.02 125 | for quadrant_signs in itertools.product(*list(np.tile([-1, 1], (5+1, 1)))): 126 | in_quadrant = np.all(hs * quadrant_signs > 0, axis=1).sum() 127 | min = n_points / 2**(5+1) * 
(1-tolerance_fraction) 128 | max = n_points / 2**(5+1) * (1+tolerance_fraction) 129 | msg = f'Expected a value between {min:.0f} and {max:.0f}, but was {in_quadrant}' 130 | self.assertTrue(min <= np.sum(in_quadrant) <= max, msg=msg) 131 | 132 | def test_hypercube_to_hypersphere_surface_5D_half(self): 133 | n_points = 1_000_000 134 | np.random.seed(666) 135 | hc = np.random.uniform(0, 1, (n_points, 5)) 136 | hs = hypercube_to_hypersphere_surface(hc, half_hypersphere=True) 137 | 138 | # check dimensionality and norms 139 | self.assertEqual(hs.ndim, 2) 140 | self.assertEqual(hs.shape, (n_points, 5+1)) 141 | assert_almost_equal(np.linalg.norm(hs, axis=1), 1) 142 | 143 | # make sure all quadrants contain approximately the same number of data points 144 | tolerance_fraction = 0.01 145 | for quadrant_signs in itertools.product(*list(np.tile([-1, 1], (5+1, 1)))): 146 | in_quadrant = np.all(hs * quadrant_signs > 0, axis=1).sum() 147 | if quadrant_signs[0] == -1: 148 | self.assertEqual(np.sum(in_quadrant), 0) 149 | else: 150 | min = n_points / 2**5 * (1-tolerance_fraction) 151 | max = n_points / 2**5 * (1+tolerance_fraction) 152 | msg = f'Expected a value between {min:.0f} and {max:.0f} in quadrant {quadrant_signs}, but was {in_quadrant}' 153 | self.assertTrue(min <= np.sum(in_quadrant) <= max, msg) 154 | 155 | def test_hypercube_to_hypersphere_surface_6D_full(self): 156 | n_points = 1_000_000 157 | np.random.seed(666) 158 | hc = np.random.uniform(0, 1, (n_points, 6)) 159 | hs = hypercube_to_hypersphere_surface(hc, half_hypersphere=False) 160 | # hs = np.random.normal(0, 1, hs.shape) 161 | 162 | # check dimensionality and norms 163 | self.assertEqual(hs.ndim, 2) 164 | self.assertEqual(hs.shape, (n_points, 6+1)) 165 | assert_almost_equal(np.linalg.norm(hs, axis=1), 1) 166 | 167 | # make sure all quadrants contain approximately the same number of data points 168 | tolerance_fraction = 0.03 169 | for quadrant_signs in itertools.product(*list(np.tile([-1, 1], (6+1, 1)))): 170 | in_quadrant = np.all(hs * quadrant_signs > 0, axis=1).sum() 171 | min = n_points / 2**(6+1) * (1-tolerance_fraction) 172 | max = n_points / 2**(6+1) * (1+tolerance_fraction) 173 | msg = f'Expected a value between {min:.0f} and {max:.0f}, but was {in_quadrant}' 174 | self.assertTrue(min <= np.sum(in_quadrant) <= max, msg=msg) 175 | 176 | def test_hypercube_to_hypersphere_surface_6D_half(self): 177 | n_points = 1_000_000 178 | np.random.seed(666) 179 | hc = np.random.uniform(0, 1, (n_points, 6)) 180 | hs = hypercube_to_hypersphere_surface(hc, half_hypersphere=True) 181 | 182 | # check dimensionality and norms 183 | self.assertEqual(hs.ndim, 2) 184 | self.assertEqual(hs.shape, (n_points, 6+1)) 185 | assert_almost_equal(np.linalg.norm(hs, axis=1), 1) 186 | 187 | # make sure all quadrants contain approximately the same number of data points 188 | tolerance_fraction = 0.03 189 | for quadrant_signs in itertools.product(*list(np.tile([-1, 1], (6+1, 1)))): 190 | in_quadrant = np.all(hs * quadrant_signs > 0, axis=1).sum() 191 | if quadrant_signs[0] == -1: 192 | self.assertEqual(np.sum(in_quadrant), 0) 193 | else: 194 | min = n_points / 2**6 * (1-tolerance_fraction) 195 | max = n_points / 2**6 * (1+tolerance_fraction) 196 | msg = f'Expected a value between {min:.0f} and {max:.0f} in quadrant {quadrant_signs}, but was {in_quadrant}' 197 | self.assertTrue(min <= np.sum(in_quadrant) <= max, msg) 198 | --------------------------------------------------------------------------------
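A minimal sketch, not part of the repository, showing how the regression tests above build the Normal-Gamma prior array [mu, kappa, alpha, beta] and the closed-form posterior mean that test_no_split asserts against. The helper names (make_normal_gamma_prior, posterior_mean) are hypothetical; the arithmetic is copied from the test setup.

import numpy as np

def make_normal_gamma_prior(mu=0.0, sd_prior=1.0, prior_obs=0.01):
    # hypothetical helper mirroring the setup repeated in test_regression.py
    kappa = prior_obs              # pseudo-observation count for the mean
    alpha = prior_obs / 2          # pseudo-observation count for the variance (halved)
    var_prior = sd_prior ** 2
    tau_prior = 1 / var_prior
    beta = alpha / tau_prior       # chosen so the prior variance equals sd_prior**2
    return np.array([mu, kappa, alpha, beta])

def posterior_mean(prior, y):
    # closed-form update used in test_no_split:
    #   kappa_post = kappa + n
    #   mu_post    = (kappa*mu + n*mean(y)) / kappa_post
    mu, kappa, alpha, beta = prior
    n = len(y)
    kappa_post = kappa + n
    return (kappa * mu + n * y.mean()) / kappa_post

prior = make_normal_gamma_prior()
y = np.array([0.0, 1.3, 0.0, 1.2, 0.0])   # targets from test_no_split
print(posterior_mean(prior, y))           # the value all four predictions are expected to equal

With the weak prior (prior_obs = 0.01) this lands very close to the sample mean 0.5, which is why the test can check all four query points against the same expected value.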
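A small usage sketch for hypercube_to_hypersphere_surface, assuming only what the tests above exercise: points in the unit hypercube [0, 1]^d are mapped to unit vectors on the surface of a (d+1)-dimensional hypersphere, and with half_hypersphere=True the half with a negative first coordinate stays empty. The seed and point count here are illustrative, not taken from the tests.

import numpy as np
from bayesian_decision_tree.utils import hypercube_to_hypersphere_surface

np.random.seed(0)
hc = np.random.uniform(0, 1, (1000, 5))                       # points in [0, 1]^5
hs = hypercube_to_hypersphere_surface(hc, half_hypersphere=False)

print(hs.shape)                                               # expected: (1000, 6)
print(np.allclose(np.linalg.norm(hs, axis=1), 1.0))           # every row should be a unit vector

# with half_hypersphere=True no point should fall in a quadrant whose first
# sign is -1, which is what the *_half tests above verify quadrant by quadrant
hs_half = hypercube_to_hypersphere_surface(hc, half_hypersphere=True)
print((hs_half[:, 0] >= 0).all())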