├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── NOTICE ├── README.md ├── bayesian_decision_tree ├── __init__.py ├── _version.py ├── base.py ├── base_hyperplane.py ├── base_perpendicular.py ├── classification.py ├── hyperplane_optimization.py ├── regression.py └── utils.py ├── conda.recipe ├── conda_build_config.yaml └── meta.yaml ├── examples ├── __init__.py ├── demo_classification_hyperplane.py ├── demo_classification_perpendicular.py ├── demo_classification_trading.py ├── demo_regression_hyperplane.py ├── demo_regression_perpendicular.py └── helper.py ├── mypy.ini ├── setup.cfg ├── setup.py ├── tests ├── __init__.py └── unit │ ├── __init__.py │ ├── helper.py │ ├── test_classification.py │ ├── test_regression.py │ └── test_utils.py └── versioneer.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # OSX useful to ignore 7 | *.DS_Store 8 | .AppleDouble 9 | .LSOverride 10 | 11 | # Thumbnails 12 | ._* 13 | 14 | # Files that might appear in the root of a volume 15 | .DocumentRevisions-V100 16 | .fseventsd 17 | .Spotlight-V100 18 | .TemporaryItems 19 | .Trashes 20 | .VolumeIcon.icns 21 | .com.apple.timemachine.donotpresent 22 | 23 | # Directories potentially created on remote AFP share 24 | .AppleDB 25 | .AppleDesktop 26 | Network Trash Folder 27 | Temporary Items 28 | .apdisk 29 | 30 | # C extensions 31 | *.so 32 | 33 | # Distribution / packaging 34 | .Python 35 | env/ 36 | build/ 37 | develop-eggs/ 38 | dist/ 39 | downloads/ 40 | eggs/ 41 | .eggs/ 42 | lib/ 43 | lib64/ 44 | parts/ 45 | sdist/ 46 | var/ 47 | *.egg-info/ 48 | .installed.cfg 49 | *.egg 50 | 51 | # PyInstaller 52 | # Usually these files are written by a python script from a template 53 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 54 | *.manifest 55 | *.spec 56 | 57 | # Installer logs 58 | pip-log.txt 59 | pip-delete-this-directory.txt 60 | 61 | # Unit test / coverage reports 62 | htmlcov/ 63 | .tox/ 64 | .coverage 65 | .coverage.* 66 | .cache 67 | nosetests.xml 68 | coverage.xml 69 | *,cover 70 | .hypothesis/ 71 | 72 | # Translations 73 | *.mo 74 | *.pot 75 | 76 | # Django stuff: 77 | *.log 78 | 79 | # Sphinx documentation 80 | docs/_build/ 81 | 82 | # IntelliJ Idea family of suites 83 | .idea 84 | *.iml 85 | ## File-based project format: 86 | *.ipr 87 | *.iws 88 | ## mpeltonen/sbt-idea plugin 89 | .idea_modules/ 90 | 91 | # PyBuilder 92 | target/ 93 | 94 | # Cookiecutter 95 | output/ 96 | python_boilerplate/ 97 | 98 | .idea 99 | target 100 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/) 5 | and this project adheres to [Semantic Versioning](https://semver.org/). 
6 | 7 | ## [0.7] - 2020-06-21 8 | ### Changed 9 | - Improve prediction performance 10 | - Improve training performance and memory use 11 | 12 | ## [0.6] - 2020-05-28 13 | ### Added 14 | - New method `_get_raw_leaf_data()` to access leaf internals 15 | 16 | ## [0.5] - 2020-05-26 17 | ### Changed 18 | - Removed references to `SparseDataFrame` (deprecated) 19 | 20 | ### Fixed 21 | - Bug in regression code 22 | 23 | ## [0.4] - 2020-03-09 24 | ### Changed 25 | - Better hyperplane tree plotting 26 | 27 | ### Fixed 28 | - Bug in hyperplane trees (tried to access uninitialized field) 29 | 30 | ## [0.3] - 2020-02-26 31 | ### Added 32 | - Improved scikit-learn compatibility further 33 | 34 | ### Fixed 35 | - Bug in `model.feature_importance()` computation 36 | 37 | ## [0.2] - 2019-09-02 38 | ### Added 39 | - Experimental support for arbitrarily-oriented hyperplane splits rather than axis-perpendicular ones only 40 | - Experimental support for sparse DataFrames and sparse matrices (scipy.sparse) for fitting and prediction 41 | - Added `model.feature_importance()` for feature selection 42 | - All models are now compatible with scikit-learn 43 | 44 | ### Changed 45 | - Lots of small changes here and there 46 | 47 | ## [0.1] - 2019-02-06 48 | ### Added 49 | - First release 50 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship.
For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2019 UBS Limited 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
-------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Bayesian Tree 2 | Copyright 2018-2019 UBS AG 3 | 4 | This product includes software developed at 5 | UBS AG (https://www.ubs.com) 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A Bayesian Decision Tree Algorithm 2 | This is an implementation of the paper: [A Bayesian Decision Tree Algorithm](https://arxiv.org/abs/1901.03214) by Nuti et al. 3 | 4 | ## Feature Support 5 | 6 | This package implements: 7 | * Classification (binary and multiclass) 8 | * Regression 9 | * Both models are available in two versions respectively: 10 | * **Perpendicular Trees**: 11 | The classic decision/regression tree structure with splits along a single 12 | feature dimension (i.e., _perpendicular_ to a feature dimension axis), 13 | analogous to e.g. the scikit-learn 14 | [decision](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html) 15 | and 16 | [regression](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html) 17 | trees. 18 | 19 | The models are called 20 | [`PerpendicularClassificationTree`](bayesian_decision_tree/classification.py) 21 | and 22 | [`PerpendicularRegressionTree`](bayesian_decision_tree/regression.py). 23 | 24 | * **Hyperplane Trees**: 25 | Decision/regression trees using _arbitrarily-oriented hyperplanes_. These models 26 | are more flexible than perpendicular trees as they cover a much larger search 27 | space to naturally make use of correlations between features. 28 | 29 | All else equal, hyperplane trees typically lead to shallower trees with fewer 30 | leaf nodes compared to their perpendicular counterparts because they can employ 31 | more than just a single feature dimension per split. This can lead to less 32 | overfitting and better generalization performance, but no such guarantees exist 33 | because hyperplane trees are still being constructed in a greedy manner. 34 | 35 | Note that hyperplane trees take much longer to train and need to be trained 36 | stochastically using global optimizers due to the exponentially large search 37 | space. 38 | 39 | The models are called 40 | [`HyperplaneClassificationTree`](bayesian_decision_tree/classification.py) 41 | and 42 | [`HyperplaneRegressionTree`](bayesian_decision_tree/regression.py). 43 | 44 | ## Installation 45 | 46 | To install you can either use _conda_ or _pip_: 47 | 48 | #### Conda 49 | ``` 50 | git clone https://github.com/UBS-IB/bayesian_tree 51 | cd bayesian_tree 52 | conda build conda.recipe 53 | conda install --use-local bayesian_decision_tree 54 | ``` 55 | 56 | #### PIP 57 | ``` 58 | git clone https://github.com/UBS-IB/bayesian_tree 59 | cd bayesian_tree 60 | pip install -e . 61 | ``` 62 | 63 | ## Usage 64 | 65 | We include some examples for various uses in the [examples](examples) directory. 66 | The models are fully compatible with scikit-learn, so you can use them for e.g. 67 | cross-validation or performance evaluation using scikit-learn functions. 
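For orientation, here is a minimal quick-start sketch in the usual scikit-learn style. The constructor arguments shown (`partition_prior`, `prior`, `delta`, `prune`) mirror the fields of the `BaseTree` base class further below; the exact public signatures and sensible prior choices are what the scripts in [examples](examples) demonstrate, so treat the values here as illustrative placeholders rather than recommendations.

```python
# Hypothetical quick-start sketch; see examples/ for the maintained demos.
# The constructor arguments below are assumed from the BaseTree fields and
# may not match the exact public signatures.
import numpy as np
from sklearn.model_selection import train_test_split

from bayesian_decision_tree.classification import PerpendicularClassificationTree

# toy binary classification problem
np.random.seed(42)
X = np.random.uniform(size=(1000, 2))
y = (X[:, 0] + X[:, 1] > 1.0).astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

model = PerpendicularClassificationTree(
    partition_prior=0.9,         # prior probability of splitting a node further
    prior=np.array([1.0, 1.0]),  # Beta/Dirichlet prior over the class counts
    delta=0,                     # see the paper for the role of delta
    prune=True)                  # collapse splits whose children predict the same class
model.fit(X_train, y_train)

print(model)                     # prints the fitted tree structure
print('Depth:   ', model.get_depth())
print('Leaves:  ', model.get_n_leaves())
print('Accuracy:', (model.predict(X_test) == y_test).mean())
```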
68 | 69 | ## TODO 70 | - Add parallelization option (dask) 71 | -------------------------------------------------------------------------------- /bayesian_decision_tree/__init__.py: -------------------------------------------------------------------------------- 1 | from ._version import get_versions 2 | __version__ = get_versions()['version'] 3 | del get_versions 4 | -------------------------------------------------------------------------------- /bayesian_decision_tree/_version.py: -------------------------------------------------------------------------------- 1 | 2 | # This file helps to compute a version number in source trees obtained from 3 | # git-archive tarball (such as those provided by githubs download-from-tag 4 | # feature). Distribution tarballs (built by setup.py sdist) and build 5 | # directories (produced by setup.py build) will contain a much shorter file 6 | # that just contains the computed version number. 7 | 8 | # This file is released into the public domain. Generated by 9 | # versioneer-0.18 (https://github.com/warner/python-versioneer) 10 | 11 | """Git implementation of _version.py.""" 12 | 13 | import errno 14 | import os 15 | import re 16 | import subprocess 17 | import sys 18 | 19 | 20 | def get_keywords(): 21 | """Get the keywords needed to look up the version information.""" 22 | # these strings will be replaced by git during git-archive. 23 | # setup.py/versioneer.py will grep for the variable names, so they must 24 | # each be defined on a line of their own. _version.py will just call 25 | # get_keywords(). 26 | git_refnames = "$Format:%d$" 27 | git_full = "$Format:%H$" 28 | git_date = "$Format:%ci$" 29 | keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} 30 | return keywords 31 | 32 | 33 | class VersioneerConfig: 34 | """Container for Versioneer configuration parameters.""" 35 | 36 | 37 | def get_config(): 38 | """Create, populate and return the VersioneerConfig() object.""" 39 | # these strings are filled in when 'setup.py versioneer' creates 40 | # _version.py 41 | cfg = VersioneerConfig() 42 | cfg.VCS = "git" 43 | cfg.style = "" 44 | cfg.tag_prefix = "" 45 | cfg.parentdir_prefix = "hadoop-utils-" 46 | cfg.versionfile_source = "hadoop_utils/_version.py" 47 | cfg.verbose = False 48 | return cfg 49 | 50 | 51 | class NotThisMethod(Exception): 52 | """Exception raised if a method is not valid for the current scenario.""" 53 | 54 | 55 | LONG_VERSION_PY = {} 56 | HANDLERS = {} 57 | 58 | 59 | def register_vcs_handler(vcs, method): # decorator 60 | """Decorator to mark a method as the handler for a particular VCS.""" 61 | def decorate(f): 62 | """Store f in HANDLERS[vcs][method].""" 63 | if vcs not in HANDLERS: 64 | HANDLERS[vcs] = {} 65 | HANDLERS[vcs][method] = f 66 | return f 67 | return decorate 68 | 69 | 70 | def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, 71 | env=None): 72 | """Call the given command(s).""" 73 | assert isinstance(commands, list) 74 | p = None 75 | for c in commands: 76 | try: 77 | dispcmd = str([c] + args) 78 | # remember shell=False, so use git.cmd on windows, not just git 79 | p = subprocess.Popen([c] + args, cwd=cwd, env=env, 80 | stdout=subprocess.PIPE, 81 | stderr=(subprocess.PIPE if hide_stderr 82 | else None)) 83 | break 84 | except EnvironmentError: 85 | e = sys.exc_info()[1] 86 | if e.errno == errno.ENOENT: 87 | continue 88 | if verbose: 89 | print("unable to run %s" % dispcmd) 90 | print(e) 91 | return None, None 92 | else: 93 | if verbose: 94 | print("unable to find command, 
tried %s" % (commands,)) 95 | return None, None 96 | stdout = p.communicate()[0].strip() 97 | if sys.version_info[0] >= 3: 98 | stdout = stdout.decode() 99 | if p.returncode != 0: 100 | if verbose: 101 | print("unable to run %s (error)" % dispcmd) 102 | print("stdout was %s" % stdout) 103 | return None, p.returncode 104 | return stdout, p.returncode 105 | 106 | 107 | def versions_from_parentdir(parentdir_prefix, root, verbose): 108 | """Try to determine the version from the parent directory name. 109 | 110 | Source tarballs conventionally unpack into a directory that includes both 111 | the project name and a version string. We will also support searching up 112 | two directory levels for an appropriately named parent directory 113 | """ 114 | rootdirs = [] 115 | 116 | for i in range(3): 117 | dirname = os.path.basename(root) 118 | if dirname.startswith(parentdir_prefix): 119 | return {"version": dirname[len(parentdir_prefix):], 120 | "full-revisionid": None, 121 | "dirty": False, "error": None, "date": None} 122 | else: 123 | rootdirs.append(root) 124 | root = os.path.dirname(root) # up a level 125 | 126 | if verbose: 127 | print("Tried directories %s but none started with prefix %s" % 128 | (str(rootdirs), parentdir_prefix)) 129 | raise NotThisMethod("rootdir doesn't start with parentdir_prefix") 130 | 131 | 132 | @register_vcs_handler("git", "get_keywords") 133 | def git_get_keywords(versionfile_abs): 134 | """Extract version information from the given file.""" 135 | # the code embedded in _version.py can just fetch the value of these 136 | # keywords. When used from setup.py, we don't want to import _version.py, 137 | # so we do it with a regexp instead. This function is not used from 138 | # _version.py. 139 | keywords = {} 140 | try: 141 | f = open(versionfile_abs, "r") 142 | for line in f.readlines(): 143 | if line.strip().startswith("git_refnames ="): 144 | mo = re.search(r'=\s*"(.*)"', line) 145 | if mo: 146 | keywords["refnames"] = mo.group(1) 147 | if line.strip().startswith("git_full ="): 148 | mo = re.search(r'=\s*"(.*)"', line) 149 | if mo: 150 | keywords["full"] = mo.group(1) 151 | if line.strip().startswith("git_date ="): 152 | mo = re.search(r'=\s*"(.*)"', line) 153 | if mo: 154 | keywords["date"] = mo.group(1) 155 | f.close() 156 | except EnvironmentError: 157 | pass 158 | return keywords 159 | 160 | 161 | @register_vcs_handler("git", "keywords") 162 | def git_versions_from_keywords(keywords, tag_prefix, verbose): 163 | """Get version information from git keywords.""" 164 | if not keywords: 165 | raise NotThisMethod("no keywords at all, weird") 166 | date = keywords.get("date") 167 | if date is not None: 168 | # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant 169 | # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 170 | # -like" string, which we must then edit to make compliant), because 171 | # it's been around since git-1.5.3, and it's too difficult to 172 | # discover which version we're using, or to work around using an 173 | # older one. 174 | date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) 175 | refnames = keywords["refnames"].strip() 176 | if refnames.startswith("$Format"): 177 | if verbose: 178 | print("keywords are unexpanded, not using") 179 | raise NotThisMethod("unexpanded keywords, not a git-archive tarball") 180 | refs = set([r.strip() for r in refnames.strip("()").split(",")]) 181 | # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of 182 | # just "foo-1.0". 
If we see a "tag: " prefix, prefer those. 183 | TAG = "tag: " 184 | tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) 185 | if not tags: 186 | # Either we're using git < 1.8.3, or there really are no tags. We use 187 | # a heuristic: assume all version tags have a digit. The old git %d 188 | # expansion behaves like git log --decorate=short and strips out the 189 | # refs/heads/ and refs/tags/ prefixes that would let us distinguish 190 | # between branches and tags. By ignoring refnames without digits, we 191 | # filter out many common branch names like "release" and 192 | # "stabilization", as well as "HEAD" and "master". 193 | tags = set([r for r in refs if re.search(r'\d', r)]) 194 | if verbose: 195 | print("discarding '%s', no digits" % ",".join(refs - tags)) 196 | if verbose: 197 | print("likely tags: %s" % ",".join(sorted(tags))) 198 | for ref in sorted(tags): 199 | # sorting will prefer e.g. "2.0" over "2.0rc1" 200 | if ref.startswith(tag_prefix): 201 | r = ref[len(tag_prefix):] 202 | if verbose: 203 | print("picking %s" % r) 204 | return {"version": r, 205 | "full-revisionid": keywords["full"].strip(), 206 | "dirty": False, "error": None, 207 | "date": date} 208 | # no suitable tags, so version is "0+unknown", but full hex is still there 209 | if verbose: 210 | print("no suitable tags, using unknown + full revision id") 211 | return {"version": "0+unknown", 212 | "full-revisionid": keywords["full"].strip(), 213 | "dirty": False, "error": "no suitable tags", "date": None} 214 | 215 | 216 | @register_vcs_handler("git", "pieces_from_vcs") 217 | def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): 218 | """Get version from 'git describe' in the root of the source tree. 219 | 220 | This only gets called if the git-archive 'subst' keywords were *not* 221 | expanded, and _version.py hasn't already been rewritten with a short 222 | version string, meaning we're inside a checked out source tree. 223 | """ 224 | GITS = ["git"] 225 | if sys.platform == "win32": 226 | GITS = ["git.cmd", "git.exe"] 227 | 228 | out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, 229 | hide_stderr=True) 230 | if rc != 0: 231 | if verbose: 232 | print("Directory %s not under git control" % root) 233 | raise NotThisMethod("'git rev-parse --git-dir' returned error") 234 | 235 | # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] 236 | # if there isn't one, this yields HEX[-dirty] (no NUM) 237 | describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", 238 | "--always", "--long", 239 | "--match", "%s*" % tag_prefix], 240 | cwd=root) 241 | # --long was added in git-1.5.5 242 | if describe_out is None: 243 | raise NotThisMethod("'git describe' failed") 244 | describe_out = describe_out.strip() 245 | full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) 246 | if full_out is None: 247 | raise NotThisMethod("'git rev-parse' failed") 248 | full_out = full_out.strip() 249 | 250 | pieces = {} 251 | pieces["long"] = full_out 252 | pieces["short"] = full_out[:7] # maybe improved later 253 | pieces["error"] = None 254 | 255 | # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] 256 | # TAG might have hyphens. 
257 | git_describe = describe_out 258 | 259 | # look for -dirty suffix 260 | dirty = git_describe.endswith("-dirty") 261 | pieces["dirty"] = dirty 262 | if dirty: 263 | git_describe = git_describe[:git_describe.rindex("-dirty")] 264 | 265 | # now we have TAG-NUM-gHEX or HEX 266 | 267 | if "-" in git_describe: 268 | # TAG-NUM-gHEX 269 | mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) 270 | if not mo: 271 | # unparseable. Maybe git-describe is misbehaving? 272 | pieces["error"] = ("unable to parse git-describe output: '%s'" 273 | % describe_out) 274 | return pieces 275 | 276 | # tag 277 | full_tag = mo.group(1) 278 | if not full_tag.startswith(tag_prefix): 279 | if verbose: 280 | fmt = "tag '%s' doesn't start with prefix '%s'" 281 | print(fmt % (full_tag, tag_prefix)) 282 | pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" 283 | % (full_tag, tag_prefix)) 284 | return pieces 285 | pieces["closest-tag"] = full_tag[len(tag_prefix):] 286 | 287 | # distance: number of commits since tag 288 | pieces["distance"] = int(mo.group(2)) 289 | 290 | # commit: short hex revision ID 291 | pieces["short"] = mo.group(3) 292 | 293 | else: 294 | # HEX: no tags 295 | pieces["closest-tag"] = None 296 | count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], 297 | cwd=root) 298 | pieces["distance"] = int(count_out) # total number of commits 299 | 300 | # commit date: see ISO-8601 comment in git_versions_from_keywords() 301 | date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], 302 | cwd=root)[0].strip() 303 | pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) 304 | 305 | return pieces 306 | 307 | 308 | def plus_or_dot(pieces): 309 | """Return a + if we don't already have one, else return a .""" 310 | if "+" in pieces.get("closest-tag", ""): 311 | return "." 312 | return "+" 313 | 314 | 315 | def render_pep440(pieces): 316 | """Build up version string, with post-release "local version identifier". 317 | 318 | Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you 319 | get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty 320 | 321 | Exceptions: 322 | 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] 323 | """ 324 | if pieces["closest-tag"]: 325 | rendered = pieces["closest-tag"] 326 | if pieces["distance"] or pieces["dirty"]: 327 | rendered += plus_or_dot(pieces) 328 | rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) 329 | if pieces["dirty"]: 330 | rendered += ".dirty" 331 | else: 332 | # exception #1 333 | rendered = "0+untagged.%d.g%s" % (pieces["distance"], 334 | pieces["short"]) 335 | if pieces["dirty"]: 336 | rendered += ".dirty" 337 | return rendered 338 | 339 | 340 | def render_pep440_pre(pieces): 341 | """TAG[.post.devDISTANCE] -- No -dirty. 342 | 343 | Exceptions: 344 | 1: no tags. 0.post.devDISTANCE 345 | """ 346 | if pieces["closest-tag"]: 347 | rendered = pieces["closest-tag"] 348 | if pieces["distance"]: 349 | rendered += ".post.dev%d" % pieces["distance"] 350 | else: 351 | # exception #1 352 | rendered = "0.post.dev%d" % pieces["distance"] 353 | return rendered 354 | 355 | 356 | def render_pep440_post(pieces): 357 | """TAG[.postDISTANCE[.dev0]+gHEX] . 358 | 359 | The ".dev0" means dirty. Note that .dev0 sorts backwards 360 | (a dirty tree will appear "older" than the corresponding clean one), 361 | but you shouldn't be releasing software with -dirty anyways. 362 | 363 | Exceptions: 364 | 1: no tags. 
0.postDISTANCE[.dev0] 365 | """ 366 | if pieces["closest-tag"]: 367 | rendered = pieces["closest-tag"] 368 | if pieces["distance"] or pieces["dirty"]: 369 | rendered += ".post%d" % pieces["distance"] 370 | if pieces["dirty"]: 371 | rendered += ".dev0" 372 | rendered += plus_or_dot(pieces) 373 | rendered += "g%s" % pieces["short"] 374 | else: 375 | # exception #1 376 | rendered = "0.post%d" % pieces["distance"] 377 | if pieces["dirty"]: 378 | rendered += ".dev0" 379 | rendered += "+g%s" % pieces["short"] 380 | return rendered 381 | 382 | 383 | def render_pep440_old(pieces): 384 | """TAG[.postDISTANCE[.dev0]] . 385 | 386 | The ".dev0" means dirty. 387 | 388 | Eexceptions: 389 | 1: no tags. 0.postDISTANCE[.dev0] 390 | """ 391 | if pieces["closest-tag"]: 392 | rendered = pieces["closest-tag"] 393 | if pieces["distance"] or pieces["dirty"]: 394 | rendered += ".post%d" % pieces["distance"] 395 | if pieces["dirty"]: 396 | rendered += ".dev0" 397 | else: 398 | # exception #1 399 | rendered = "0.post%d" % pieces["distance"] 400 | if pieces["dirty"]: 401 | rendered += ".dev0" 402 | return rendered 403 | 404 | 405 | def render_git_describe(pieces): 406 | """TAG[-DISTANCE-gHEX][-dirty]. 407 | 408 | Like 'git describe --tags --dirty --always'. 409 | 410 | Exceptions: 411 | 1: no tags. HEX[-dirty] (note: no 'g' prefix) 412 | """ 413 | if pieces["closest-tag"]: 414 | rendered = pieces["closest-tag"] 415 | if pieces["distance"]: 416 | rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) 417 | else: 418 | # exception #1 419 | rendered = pieces["short"] 420 | if pieces["dirty"]: 421 | rendered += "-dirty" 422 | return rendered 423 | 424 | 425 | def render_git_describe_long(pieces): 426 | """TAG-DISTANCE-gHEX[-dirty]. 427 | 428 | Like 'git describe --tags --dirty --always -long'. 429 | The distance/hash is unconditional. 430 | 431 | Exceptions: 432 | 1: no tags. HEX[-dirty] (note: no 'g' prefix) 433 | """ 434 | if pieces["closest-tag"]: 435 | rendered = pieces["closest-tag"] 436 | rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) 437 | else: 438 | # exception #1 439 | rendered = pieces["short"] 440 | if pieces["dirty"]: 441 | rendered += "-dirty" 442 | return rendered 443 | 444 | 445 | def render(pieces, style): 446 | """Render the given version pieces into the requested style.""" 447 | if pieces["error"]: 448 | return {"version": "unknown", 449 | "full-revisionid": pieces.get("long"), 450 | "dirty": None, 451 | "error": pieces["error"], 452 | "date": None} 453 | 454 | if not style or style == "default": 455 | style = "pep440" # the default 456 | 457 | if style == "pep440": 458 | rendered = render_pep440(pieces) 459 | elif style == "pep440-pre": 460 | rendered = render_pep440_pre(pieces) 461 | elif style == "pep440-post": 462 | rendered = render_pep440_post(pieces) 463 | elif style == "pep440-old": 464 | rendered = render_pep440_old(pieces) 465 | elif style == "git-describe": 466 | rendered = render_git_describe(pieces) 467 | elif style == "git-describe-long": 468 | rendered = render_git_describe_long(pieces) 469 | else: 470 | raise ValueError("unknown style '%s'" % style) 471 | 472 | return {"version": rendered, "full-revisionid": pieces["long"], 473 | "dirty": pieces["dirty"], "error": None, 474 | "date": pieces.get("date")} 475 | 476 | 477 | def get_versions(): 478 | """Get version information or return default if unable to do so.""" 479 | # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have 480 | # __file__, we can work backwards from there to the root. 
Some 481 | # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which 482 | # case we can only use expanded keywords. 483 | 484 | cfg = get_config() 485 | verbose = cfg.verbose 486 | 487 | try: 488 | return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, 489 | verbose) 490 | except NotThisMethod: 491 | pass 492 | 493 | try: 494 | root = os.path.realpath(__file__) 495 | # versionfile_source is the relative path from the top of the source 496 | # tree (where the .git directory might live) to this file. Invert 497 | # this to find the root from __file__. 498 | for i in cfg.versionfile_source.split('/'): 499 | root = os.path.dirname(root) 500 | except NameError: 501 | return {"version": "0+unknown", "full-revisionid": None, 502 | "dirty": None, 503 | "error": "unable to find root of source tree", 504 | "date": None} 505 | 506 | try: 507 | pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) 508 | return render(pieces, cfg.style) 509 | except NotThisMethod: 510 | pass 511 | 512 | try: 513 | if cfg.parentdir_prefix: 514 | return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) 515 | except NotThisMethod: 516 | pass 517 | 518 | return {"version": "0+unknown", "full-revisionid": None, 519 | "dirty": None, 520 | "error": "unable to compute version", "date": None} 521 | -------------------------------------------------------------------------------- /bayesian_decision_tree/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from scipy.sparse import csr_matrix, csc_matrix 6 | from sklearn.base import BaseEstimator 7 | 8 | 9 | class BaseTree(ABC, BaseEstimator): 10 | """ 11 | Abstract base class of all Bayesian decision tree models (classification and regression). Performs all 12 | high-level fitting and prediction tasks and outsources the medium- and low-level work to subclasses. 13 | 14 | Implementation note: This class hierarchy is diamond-shaped: The four concrete model classes each 15 | inherit from two superclasses which in turn inherit from this class. 16 | """ 17 | 18 | def __init__(self, partition_prior, prior, delta, prune, child_type, is_regression, split_precision, level): 19 | self.partition_prior = partition_prior 20 | self.prior = prior 21 | self.delta = delta 22 | self.prune = prune 23 | self.child_type = child_type 24 | self.is_regression = is_regression 25 | self.split_precision = split_precision 26 | self.level = level 27 | 28 | def fit(self, X, y, verbose=False, feature_names=None): 29 | """ 30 | Trains this classification or regression tree using the training set (X, y). 31 | 32 | Parameters 33 | ---------- 34 | X : array-like, scipy.sparse.csc_matrix, scipy.sparse.csr_matrix or pandas.DataFrame, shape = [n_samples, n_features] 35 | The training input samples. 36 | 37 | y : array-like, shape = [n_samples] or [n_samples, n_outputs] 38 | The target values. In case of binary classification only the 39 | integers 0 and 1 are permitted. In case of multiclass classification 40 | only the integers 0, 1, ..., {n_classes-1} are permitted. In case of 41 | regression all finite float values are permitted. 42 | 43 | verbose : bool, default=False 44 | Prints fitting progress. 45 | 46 | feature_names: array-like, shape = [n_features] 47 | An optional sequence of feature names. If not provided then 'x0', 'x1', ... is used 48 | if X is a matrix, or the column headers if X is a DataFrame.
49 | 50 | References 51 | ---------- 52 | 53 | .. [1] https://arxiv.org/abs/1901.03214 54 | """ 55 | 56 | # validation and input transformation 57 | if isinstance(y, list): 58 | y = np.array(y) 59 | 60 | y = y.squeeze() 61 | y = self._ensure_float64(y) 62 | self._check_target(y) 63 | 64 | X, feature_names = self._normalize_data_and_feature_names(X, feature_names) 65 | if X.shape[0] != len(y): 66 | raise ValueError('Invalid shapes: X={}, y={}'.format(X.shape, y.shape)) 67 | 68 | # fit 69 | self._fit(X, y, verbose, feature_names, 'root') 70 | 71 | if self.prune: 72 | self._prune() 73 | 74 | return self 75 | 76 | def predict(self, X): 77 | """Predict class or regression value for X. 78 | 79 | For a classification model, the predicted class for each sample in X is 80 | returned. For a regression model, the predicted value based on X is 81 | returned. 82 | 83 | Parameters 84 | ---------- 85 | X : array-like, scipy.sparse.csc_matrix, scipy.sparse.csr_matrix or pandas.DataFrame, shape = [n_samples, n_features] 86 | The input samples. 87 | 88 | Returns 89 | ------- 90 | y : array of shape = [n_samples] 91 | The predicted classes, or the predict values. 92 | """ 93 | 94 | # input transformation and checks 95 | X, _ = self._normalize_data_and_feature_names(X) 96 | self._ensure_is_fitted(X) 97 | 98 | indices = np.arange(X.shape[0]) 99 | y = np.zeros(X.shape[0]) 100 | self._predict(X, indices, True, y) 101 | 102 | return y 103 | 104 | def feature_importance(self): 105 | """ 106 | Compute and return feature importance of this tree after having fitted it to data. Feature 107 | importance for a given feature dimension is defined as the sum of all increases in the 108 | marginal data log-likelihood across splits of that dimension. Finally, the feature 109 | importance vector is normalized to sum to 1. 110 | 111 | Returns 112 | ------- 113 | feature_importance: array of floats 114 | The feature importance. 
115 | """ 116 | 117 | self._ensure_is_fitted() 118 | 119 | feature_importance = np.zeros(self.n_dim_) 120 | self._update_feature_importance(feature_importance) 121 | feature_importance /= feature_importance.sum() 122 | 123 | return feature_importance 124 | 125 | def _predict(self, X, indices, predict_class, y): 126 | if self.is_leaf(): 127 | prediction = self._get_raw_leaf_data_internal() if predict_class is None \ 128 | else self._predict_leaf() if predict_class \ 129 | else self._compute_posterior_mean() 130 | for i in indices: 131 | y[i] = prediction 132 | else: 133 | dense = isinstance(X, np.ndarray) 134 | if not dense and isinstance(X, csr_matrix): 135 | # column accesses coming up, so convert to CSC sparse matrix format 136 | X = csc_matrix(X) 137 | 138 | # query both children, let them predict their side, and then re-assemble 139 | indices1, indices2 = self._compute_child1_and_child2_indices(X, indices, dense) 140 | 141 | if len(indices1) > 0: 142 | self.child1_._predict(X, indices[indices1], predict_class, y) 143 | 144 | if len(indices2) > 0: 145 | self.child2_._predict(X, indices[indices2], predict_class, y) 146 | 147 | def _prune(self): 148 | if self.is_leaf(): 149 | return 150 | 151 | depth_start = self.get_depth() 152 | n_leaves_start = self.get_n_leaves() 153 | 154 | if self.child1_.is_leaf() and self.child2_.is_leaf(): 155 | if self.child1_._predict_leaf() == self.child2_._predict_leaf(): 156 | # same prediction (class if classification, value if regression) -> no need to split 157 | self._erase_split_info_base() 158 | self._erase_split_info() 159 | else: 160 | self.child1_._prune() 161 | self.child2_._prune() 162 | 163 | if depth_start != self.get_depth() or n_leaves_start != self.get_n_leaves(): 164 | # we did some pruning somewhere down this sub-tree -> prune again 165 | self._prune() 166 | 167 | def _get_raw_leaf_data(self, X): 168 | """Returns the raw predicted leaf data. 169 | 170 | For both classification and regression models, the following data is returned for each row of X: 171 | [[prior], [posterior]]. This method directly accesses implementation details and should therefore 172 | be used with caution. 173 | 174 | Parameters 175 | ---------- 176 | X : array-like, scipy.sparse.csc_matrix, scipy.sparse.csr_matrix or pandas.DataFrame, shape = [n_samples, n_features] 177 | The input samples. 178 | 179 | Returns 180 | ------- 181 | y : array of shape = [n_samples * 2 * n_classes] for classification problems 182 | or [n_samples * 2 * 4] for regression problems. 
183 | """ 184 | 185 | # input transformation and checks 186 | X, _ = self._normalize_data_and_feature_names(X) 187 | self._ensure_is_fitted(X) 188 | 189 | indices = np.arange(X.shape[0]) 190 | raw_leaf_data = [None] * X.shape[0] 191 | self._predict(X, indices, None, raw_leaf_data) 192 | 193 | return np.array(raw_leaf_data) 194 | 195 | @abstractmethod 196 | def _update_feature_importance(self, feature_importance): 197 | raise NotImplementedError 198 | 199 | @staticmethod 200 | def _normalize_data_and_feature_names(X, feature_names=None): 201 | if isinstance(X, pd.DataFrame): 202 | if feature_names is None: 203 | feature_names = X.columns 204 | 205 | X = X.values 206 | else: 207 | if isinstance(X, list): 208 | X = np.array(X) 209 | elif np.isscalar(X): 210 | X = np.array([X]) 211 | 212 | if X.ndim == 1: 213 | X = np.expand_dims(X, 0) 214 | 215 | if feature_names is None: 216 | feature_names = ['x{}'.format(i) for i in range(X.shape[1])] 217 | 218 | X = BaseTree._ensure_float64(X) 219 | 220 | if X.ndim != 2: 221 | raise ValueError('X should have 2 dimensions but has {}'.format(X.ndim)) 222 | 223 | return X, feature_names 224 | 225 | @staticmethod 226 | def _ensure_float64(data): 227 | if data.dtype in ( 228 | np.int8, np.int16, np.int32, np.int64, 229 | np.uint8, np.uint16, np.uint32, np.uint64, 230 | np.float32, np.float64): 231 | return data 232 | 233 | # convert to np.float64 for performance reasons (matrices with floats but of type object are very slow) 234 | X_float = data.astype(np.float64) 235 | if not np.all(data == X_float): 236 | raise ValueError('Cannot convert data matrix to np.float64 without loss of precision. Please check your data.') 237 | 238 | return X_float 239 | 240 | def _ensure_is_fitted(self, X=None): 241 | if not self.is_fitted(): 242 | raise ValueError('Cannot predict on an untrained model; call .fit() first') 243 | 244 | if X is not None and X.shape[1] != self.n_dim_: 245 | raise ValueError('Bad input dimensions: Expected {}, got {}'.format(self.n_dim_, X.shape[1])) 246 | 247 | def is_fitted(self): 248 | return hasattr(self, 'posterior_') 249 | 250 | def get_depth(self): 251 | """Computes and returns the tree depth. 252 | 253 | Returns 254 | ------- 255 | depth : int 256 | The tree depth. 257 | """ 258 | 259 | return self._update_depth(0) 260 | 261 | def get_n_leaves(self): 262 | """Computes and returns the total number of leaves of this tree. 263 | 264 | Returns 265 | ------- 266 | n_leaves : int 267 | The number of leaves. 
268 | """ 269 | 270 | return self._update_n_leaves(0) 271 | 272 | def _update_depth(self, depth): 273 | if self.is_leaf(): 274 | return max(depth, self.level) 275 | else: 276 | if self.child1_ is not None: 277 | depth = self.child1_._update_depth(depth) 278 | depth = self.child2_._update_depth(depth) 279 | 280 | return depth 281 | 282 | def _update_n_leaves(self, n_leaves): 283 | if self.is_leaf(): 284 | return n_leaves+1 285 | else: 286 | if self.child1_ is not None: 287 | n_leaves = self.child1_._update_n_leaves(n_leaves) 288 | n_leaves = self.child2_._update_n_leaves(n_leaves) 289 | 290 | return n_leaves 291 | 292 | def _erase_split_info_base(self): 293 | self.child1_ = None 294 | self.child2_ = None 295 | self.log_p_data_no_split_ = None 296 | self.best_log_p_data_split_ = None 297 | 298 | @abstractmethod 299 | def _get_prior(self, n_data, n_dim): 300 | raise NotImplementedError 301 | 302 | @abstractmethod 303 | def _erase_split_info(self): 304 | raise NotImplementedError 305 | 306 | @abstractmethod 307 | def is_leaf(self): 308 | raise NotImplementedError 309 | 310 | @abstractmethod 311 | def _check_target(self, y): 312 | raise NotImplementedError 313 | 314 | @abstractmethod 315 | def _compute_log_p_data_no_split(self, y, prior): 316 | raise NotImplementedError 317 | 318 | @abstractmethod 319 | def _compute_log_p_data_split(self, y, prior, n_dim, split_indices): 320 | raise NotImplementedError 321 | 322 | @abstractmethod 323 | def _compute_posterior(self, y, prior, delta=1): 324 | raise NotImplementedError 325 | 326 | @abstractmethod 327 | def _compute_posterior_mean(self): 328 | raise NotImplementedError 329 | 330 | @abstractmethod 331 | def _predict_leaf(self): 332 | raise NotImplementedError 333 | 334 | @abstractmethod 335 | def _get_raw_leaf_data_internal(self): 336 | raise NotImplementedError 337 | 338 | @abstractmethod 339 | def _fit(self, X, y, verbose, feature_names, side_name): 340 | raise NotImplementedError 341 | 342 | def __repr__(self): 343 | return self.__str__() 344 | 345 | @abstractmethod 346 | def _compute_child1_and_child2_indices(self, X, indices, dense): 347 | raise NotImplementedError 348 | -------------------------------------------------------------------------------- /bayesian_decision_tree/base_hyperplane.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from abc import ABC 3 | from scipy.optimize._differentialevolution import DifferentialEvolutionSolver 4 | from scipy.sparse import csc_matrix, csr_matrix 5 | 6 | from bayesian_decision_tree.base import BaseTree 7 | from bayesian_decision_tree.hyperplane_optimization import HyperplaneOptimizationFunction, ScipyOptimizer 8 | 9 | 10 | class BaseHyperplaneTree(BaseTree, ABC): 11 | """ 12 | Abstract base class of all Bayesian decision tree models using arbitrarily-oriented hyperplane splits 13 | (classification and regression). Performs medium-level fitting and prediction tasks and outsources 14 | the low-level work to subclasses. 
15 | """ 16 | 17 | def __init__(self, partition_prior, prior, delta, prune, child_type, is_regression, optimizer, split_precision, level): 18 | BaseTree.__init__(self, partition_prior, prior, delta, prune, child_type, is_regression, split_precision, level) 19 | 20 | self.optimizer = optimizer 21 | 22 | def _fit(self, X, y, verbose, feature_names, side_name): 23 | n_data = X.shape[0] 24 | n_dim = X.shape[1] 25 | prior = self._get_prior(n_data, n_dim) 26 | 27 | if verbose: 28 | name = 'level {} {}'.format(self.level, side_name) 29 | print('Training {} with {:10} data points'.format(name, n_data)) 30 | 31 | dense = isinstance(X, np.ndarray) 32 | if not dense and isinstance(X, csr_matrix): 33 | # column accesses coming up, so convert to CSC sparse matrix format 34 | X = csc_matrix(X) 35 | 36 | log_p_data_no_split = self._compute_log_p_data_no_split(y, prior) 37 | 38 | optimizer = self.optimizer 39 | if optimizer is None: 40 | # default to 'Differential Evolution' which works well and is reasonably fast 41 | optimizer = ScipyOptimizer(DifferentialEvolutionSolver, 666) 42 | 43 | # the function to optimize (depends on X and y, hence we need to instantiate it for every data set anew) 44 | optimization_function = HyperplaneOptimizationFunction( 45 | X, 46 | y, 47 | prior, 48 | self._compute_log_p_data_split, 49 | log_p_data_no_split, 50 | optimizer.search_space_is_unit_hypercube, 51 | self.split_precision) 52 | 53 | # create and run optimizer 54 | optimizer.solve(optimization_function) 55 | 56 | self.optimization_function = optimization_function 57 | 58 | # retrieve best hyperplane split from optimization function 59 | self._erase_split_info_base() 60 | self._erase_split_info() 61 | if optimization_function.best_hyperplane_normal is not None: 62 | # split data and target to recursively train children 63 | projections = X @ optimization_function.best_hyperplane_normal \ 64 | - np.dot(optimization_function.best_hyperplane_normal, optimization_function.best_hyperplane_origin) 65 | indices1 = np.where(projections < 0)[0] 66 | indices2 = np.where(projections >= 0)[0] 67 | 68 | if len(indices1) > 0 and len(indices2) > 0: 69 | """ 70 | Note: The reason why indices1 or indices2 could be empty is that the optimizer might find a 71 | 'split' that puts all data one one side and nothing on the other side, and that 'split' has 72 | a higher log probability than 'log_p_data_no_split' because of the partition prior 73 | overwhelming the data likelihoods (which are of course identical between the 'all data' and 74 | the 'everything on one side split' scenarios)s. 
75 | """ 76 | X1 = X[indices1] 77 | X2 = X[indices2] 78 | y1 = y[indices1] 79 | y2 = y[indices2] 80 | 81 | n_data1 = X1.shape[0] 82 | n_data2 = X2.shape[0] 83 | 84 | # compute posteriors of children and priors for further splitting 85 | prior_child1 = self._compute_posterior(y1, prior, delta=0) 86 | prior_child2 = self._compute_posterior(y2, prior, delta=0) 87 | 88 | # store split info, create children and continue training them if there's data left to split 89 | self.best_hyperplane_normal_ = optimization_function.best_hyperplane_normal 90 | self.best_hyperplane_origin_ = optimization_function.best_hyperplane_origin 91 | 92 | self.log_p_data_no_split_ = optimization_function.log_p_data_no_split 93 | self.best_log_p_data_split_ = optimization_function.best_log_p_data_split 94 | 95 | self.child1_ = self.child_type(self.partition_prior, prior_child1, self.delta, 96 | self.prune, optimizer, self.split_precision, self.level+1) 97 | self.child2_ = self.child_type(self.partition_prior, prior_child2, self.delta, 98 | self.prune, optimizer, self.split_precision, self.level+1) 99 | self.child1_._erase_split_info_base() 100 | self.child2_._erase_split_info_base() 101 | self.child1_._erase_split_info() 102 | self.child2_._erase_split_info() 103 | 104 | # fit children if there is more than one data point (i.e., there is 105 | # something to split) and if the targets differ (no point otherwise) 106 | if n_data1 > 1 and len(np.unique(y1)) > 1: 107 | self.child1_._fit(X1, y1, verbose, feature_names, 'back ') 108 | else: 109 | self.child1_.posterior_ = self._compute_posterior(y1, prior) 110 | self.child1_.n_data_ = n_data1 111 | 112 | if n_data2 > 1 and len(np.unique(y2)) > 1: 113 | self.child2_._fit(X2, y2, verbose, feature_names, 'front') 114 | else: 115 | self.child2_.posterior_ = self._compute_posterior(y2, prior) 116 | self.child2_.n_data_ = n_data2 117 | 118 | # compute posterior 119 | self.n_dim_ = X.shape[1] 120 | self.n_data_ = n_data 121 | self.posterior_ = self._compute_posterior(y, prior) 122 | 123 | def _compute_child1_and_child2_indices(self, X, indices, dense): 124 | projections = X[indices] @ self.best_hyperplane_normal_ - np.dot(self.best_hyperplane_normal_, self.best_hyperplane_origin_) 125 | indices1 = np.where(projections < 0)[0] 126 | indices2 = np.where(projections >= 0)[0] 127 | 128 | return indices1, indices2 129 | 130 | def is_leaf(self): 131 | self._ensure_is_fitted() 132 | return self.best_hyperplane_normal_ is None 133 | 134 | def feature_importance(self): 135 | self._ensure_is_fitted() 136 | 137 | feature_importance = np.zeros(self.n_dim_) 138 | self._update_feature_importance(feature_importance) 139 | feature_importance /= feature_importance.sum() 140 | 141 | return feature_importance 142 | 143 | def _update_feature_importance(self, feature_importance): 144 | if self.is_leaf(): 145 | return 146 | else: 147 | log_p_gain = self.best_log_p_data_split_ - self.log_p_data_no_split_ 148 | hyperplane_normal = self.best_hyperplane_normal_ 149 | 150 | # the more the normal vector is oriented along a given dimension's axis the more 151 | # important that dimension is, so weight log_p_gain with hyperplane_normal[i_dim] 152 | # (its absolute value in fact because the sign of the direction is irrelevant) 153 | feature_importance += log_p_gain * np.abs(hyperplane_normal) 154 | if self.child1_ is not None: 155 | self.child1_._update_feature_importance(feature_importance) 156 | self.child2_._update_feature_importance(feature_importance) 157 | 158 | def _erase_split_info(self): 159 | 
self.best_hyperplane_normal_ = None 160 | self.best_hyperplane_origin_ = None 161 | 162 | def __str__(self): 163 | if not self.is_fitted(): 164 | return 'Unfitted model' 165 | 166 | return self._str([], '\u251C', '\u2514', '\u2502', '\u2265', None) 167 | 168 | def _str(self, anchor, VERT_RIGHT, DOWN_RIGHT, BAR, GEQ, is_back_child): 169 | anchor_str = ''.join(' ' + a for a in anchor) 170 | s = '' 171 | if is_back_child is not None: 172 | s += anchor_str + ' {:5s}: '.format('back' if is_back_child else 'front') 173 | 174 | if self.is_leaf(): 175 | s += 'y={}, n={}'.format(self._predict_leaf(), self.n_data_) 176 | if not self.is_regression: 177 | s += ', p(y)={}'.format(self._compute_posterior_mean()) 178 | else: 179 | s += 'HP(origin={}, normal={})'.format(self.best_hyperplane_origin_, self.best_hyperplane_normal_) 180 | 181 | # 'back' child (the child that is on the side of the hyperplane opposite to the normal vector, or projection < 0) 182 | s += '\n' 183 | anchor_child1 = [VERT_RIGHT] if len(anchor) == 0 else (anchor[:-1] + [(BAR if is_back_child else ' '), VERT_RIGHT]) 184 | s += self.child1_._str(anchor_child1, VERT_RIGHT, DOWN_RIGHT, BAR, GEQ, True) 185 | 186 | # 'front' child (the child that is on same side of the hyperplane as the normal vector, or projection >= 0) 187 | s += '\n' 188 | anchor_child2 = [DOWN_RIGHT] if len(anchor) == 0 else (anchor[:-1] + [(BAR if is_back_child else ' '), DOWN_RIGHT]) 189 | s += self.child2_._str(anchor_child2, VERT_RIGHT, DOWN_RIGHT, BAR, GEQ, False) 190 | return s 191 | -------------------------------------------------------------------------------- /bayesian_decision_tree/base_perpendicular.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from abc import ABC 3 | from scipy.sparse import csr_matrix, csc_matrix 4 | 5 | from bayesian_decision_tree.base import BaseTree 6 | 7 | 8 | class BasePerpendicularTree(BaseTree, ABC): 9 | """ 10 | Abstract base class of all Bayesian tree models using splits perpendicular to a single feature axis 11 | (classification and regression). Performs medium-level fitting and prediction tasks and outsources 12 | the low-level work to subclasses. 13 | """ 14 | 15 | def __init__(self, partition_prior, prior, delta, prune, child_type, is_regression, split_precision, level): 16 | BaseTree.__init__(self, partition_prior, prior, delta, prune, child_type, is_regression, split_precision, level) 17 | 18 | def prediction_paths(self, X): 19 | """Returns the prediction paths for X. 20 | 21 | Parameters 22 | ---------- 23 | X : array-like, scipy.sparse.csc_matrix, scipy.sparse.csr_matrix or pandas.DataFrame, 24 | shape = [n_samples, n_features] 25 | 26 | The input samples. 
27 | 28 | Returns 29 | ------- 30 | prediction_paths : array-like, shape = [n_samples, 4] 31 | 32 | The prediction paths, each row containing the following fields: 33 | split dimension, split feature name, split value, True if greater than the split value and False otherwise 34 | """ 35 | 36 | # input transformation and checks 37 | X, _ = self._normalize_data_and_feature_names(X) 38 | self._ensure_is_fitted(X) 39 | 40 | paths = [[] for i in range(X.shape[0])] 41 | self._update_prediction_paths(X, np.arange(X.shape[0]), paths) 42 | 43 | return paths 44 | 45 | def _update_prediction_paths(self, X, indices, paths): 46 | if not self.is_leaf(): 47 | dense = isinstance(X, np.ndarray) 48 | if not dense and isinstance(X, csr_matrix): 49 | # column accesses coming up, so convert to CSC sparse matrix format 50 | X = csc_matrix(X) 51 | 52 | indices1, indices2 = self._compute_child1_and_child2_indices(X, indices, dense) 53 | 54 | if len(indices1) > 0: 55 | step = (self.split_dimension_, self.split_feature_name_, self.split_value_, False) 56 | for i in indices1: 57 | paths[i].append(step) 58 | 59 | if len(indices2) > 0: 60 | step = (self.split_dimension_, self.split_feature_name_, self.split_value_, True) 61 | for i in indices2: 62 | paths[i].append(step) 63 | 64 | if len(indices1) > 0 and not self.child1_.is_leaf(): 65 | paths1 = [paths[i] for i in indices1] 66 | self.child1_._update_prediction_paths(X, indices1, paths1) 67 | 68 | if len(indices2) > 0 and not self.child2_.is_leaf(): 69 | paths2 = [paths[i] for i in indices2] 70 | self.child2_._update_prediction_paths(X, indices2, paths2) 71 | 72 | @staticmethod 73 | def _create_merged_paths_array(n_rows): 74 | return np.zeros((n_rows, 4)) 75 | 76 | def _fit(self, X, y, verbose, feature_names, side_name, sort_indices_by_dim=None): 77 | n_data = sort_indices_by_dim.shape[1] if sort_indices_by_dim is not None else X.shape[0] 78 | 79 | if verbose: 80 | name = 'level {} {}'.format(self.level, side_name) 81 | print('Training {} with {:10} data points'.format(name, n_data)) 82 | 83 | dense = isinstance(X, np.ndarray) 84 | if not dense and isinstance(X, csr_matrix): 85 | # column accesses coming up, so convert to CSC sparse matrix format 86 | X = csc_matrix(X) 87 | 88 | n_dim = X.shape[1] 89 | 90 | # compute sort indices (only done once at the start) 91 | if sort_indices_by_dim is None: 92 | dtype = np.uint16 if n_data < (1 << 16) else np.uint32 if n_data < (1 << 32) else np.uint64 93 | sort_indices_by_dim = np.zeros(X.shape[::-1], dtype=dtype) 94 | for dim in range(n_dim): 95 | X_dim = X[:, dim] 96 | if not dense: 97 | X_dim = self._to_array(X_dim) 98 | 99 | sort_indices_by_dim[dim] = np.argsort(X_dim) 100 | 101 | # compute data likelihood of not splitting and remember it as the best option so far 102 | prior = self._get_prior(n_data, n_dim) 103 | y_any = y[sort_indices_by_dim[0]] # any dim works as the order doesn't matter 104 | log_p_data_no_split = self._compute_log_p_data_no_split(y_any, prior) 105 | best_log_p_data_split = log_p_data_no_split 106 | 107 | # compute data likelihoods of all possible splits along all data dimensions 108 | best_split_index = -1 109 | best_split_dimension = -1 110 | for dim in range(n_dim): 111 | sort_indices = sort_indices_by_dim[dim] 112 | X_dim_sorted = X[sort_indices, dim] 113 | if not dense: 114 | X_dim_sorted = self._to_array(X_dim_sorted) 115 | 116 | split_indices = 1 + np.where(np.abs(np.diff(X_dim_sorted)) > self.split_precision)[0] # we can only split between *different* data points 117 | if len(split_indices) == 0: 
118 | # no split possible along this dimension 119 | continue 120 | 121 | y_sorted = y[sort_indices] 122 | 123 | # compute data likelihoods of all possible splits along this dimension and find split with highest data likelihood 124 | log_p_data_split = self._compute_log_p_data_split(y_sorted, prior, n_dim, split_indices) 125 | i_max = log_p_data_split.argmax() 126 | if log_p_data_split[i_max] > best_log_p_data_split: 127 | # remember new best split 128 | best_log_p_data_split = log_p_data_split[i_max] 129 | best_split_index = split_indices[i_max] # data index of best split 130 | best_split_dimension = dim 131 | 132 | # did we find a split that has a higher likelihood than the no-split likelihood? 133 | if best_split_index > 0: 134 | # split data and target to recursively train children 135 | indices1 = sort_indices_by_dim[best_split_dimension, :best_split_index] 136 | indices2 = sort_indices_by_dim[best_split_dimension, best_split_index:] 137 | 138 | # compute posteriors of children and priors for further splitting 139 | prior = self._get_prior(n_data, n_dim) 140 | prior_child1 = tuple(self._compute_posterior(y[indices1], prior, self.delta)) if self.delta != 0 else prior 141 | prior_child2 = tuple(self._compute_posterior(y[indices2], prior, self.delta)) if self.delta != 0 else prior 142 | 143 | # store split info, create children and continue training them if there's data left to split 144 | self.split_dimension_ = best_split_dimension 145 | self.split_feature_name_ = feature_names[best_split_dimension] 146 | self.split_value_ = 0.5 * ( 147 | X[indices1[-1], best_split_dimension] 148 | + X[indices2[0], best_split_dimension] 149 | ) 150 | self.log_p_data_no_split_ = log_p_data_no_split 151 | self.best_log_p_data_split_ = best_log_p_data_split 152 | 153 | self.child1_ = self.child_type(self.partition_prior, prior_child1, self.delta, 154 | self.prune, self.split_precision, self.level+1) 155 | self.child2_ = self.child_type(self.partition_prior, prior_child2, self.delta, 156 | self.prune, self.split_precision, self.level+1) 157 | self.child1_._erase_split_info_base() 158 | self.child2_._erase_split_info_base() 159 | self.child1_._erase_split_info() 160 | self.child2_._erase_split_info() 161 | 162 | # fit children if there is more than one data point (i.e., there is 163 | # something to split) and if the targets differ (no point otherwise) 164 | sort_indices_by_dim_1 = sort_indices_by_dim[np.isin(sort_indices_by_dim, indices1)].reshape(n_dim, -1) 165 | sort_indices_by_dim_2 = sort_indices_by_dim[np.isin(sort_indices_by_dim, indices2)].reshape(n_dim, -1) 166 | del sort_indices_by_dim # help GC maybe? 
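            # Note (illustrative, hypothetical values): the boolean-mask subsetting above keeps,
            # per dimension, only the indices that belong to each child while preserving their
            # sorted order. For instance, with sort_indices_by_dim = [[2, 0, 1, 3], [3, 1, 0, 2]]
            # and indices1 = [2, 0], the mask yields sort_indices_by_dim_1 = [[2, 0], [0, 2]]:
            # row 0 is still sorted by feature 0 and row 1 by feature 1, restricted to child 1.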
167 | n_data1 = sort_indices_by_dim_1.shape[1] 168 | n_data2 = sort_indices_by_dim_2.shape[1] 169 | y1 = y[indices1] 170 | if n_data1 > 1 and len(np.unique(y1)) > 1: 171 | self.child1_._fit(X, y, verbose, feature_names, 'LHS', sort_indices_by_dim_1) 172 | else: 173 | self.child1_.posterior_ = self._compute_posterior(y1, prior) 174 | self.child1_.n_data_ = n_data1 175 | 176 | y2 = y[indices2] 177 | if n_data2 > 1 and len(np.unique(y2)) > 1: 178 | self.child2_._fit(X, y, verbose, feature_names, 'RHS', sort_indices_by_dim_2) 179 | else: 180 | self.child2_.posterior_ = self._compute_posterior(y2, prior) 181 | self.child2_.n_data_ = n_data2 182 | else: 183 | self._erase_split_info_base() 184 | self._erase_split_info() 185 | 186 | # compute posterior 187 | self.n_dim_ = n_dim 188 | self.n_data_ = n_data 189 | self.posterior_ = self._compute_posterior(y_any, prior) # any dim works as the order doesn't matter 190 | 191 | def _compute_child1_and_child2_indices(self, X, indices, dense): 192 | X_split = X[indices, self.split_dimension_] 193 | if not dense: 194 | X_split = self._to_array(X_split) 195 | 196 | indices1 = np.where(X_split < self.split_value_)[0] 197 | indices2 = np.where(X_split >= self.split_value_)[0] 198 | 199 | return indices1, indices2 200 | 201 | def is_leaf(self): 202 | self._ensure_is_fitted() 203 | return self.split_value_ is None 204 | 205 | def _update_feature_importance(self, feature_importance): 206 | if self.is_leaf(): 207 | return 208 | else: 209 | log_p_gain = self.best_log_p_data_split_ - self.log_p_data_no_split_ 210 | feature_importance[self.split_dimension_] += log_p_gain 211 | if self.child1_ is not None: 212 | self.child1_._update_feature_importance(feature_importance) 213 | self.child2_._update_feature_importance(feature_importance) 214 | 215 | def _erase_split_info(self): 216 | self.split_dimension_ = -1 217 | self.split_value_ = None 218 | self.split_feature_name_ = None 219 | 220 | @staticmethod 221 | def _to_array(sparse_array): 222 | array = sparse_array.toarray() 223 | return array[0] if array.shape == (1, 1) else array.squeeze() 224 | 225 | def __str__(self): 226 | if not self.is_fitted(): 227 | return 'Unfitted model' 228 | 229 | return self._str([], self.split_value_, '\u251C', '\u2514', '\u2502', '\u2265', None) 230 | 231 | def _str(self, anchor, parent_split_value, VERT_RIGHT, DOWN_RIGHT, BAR, GEQ, is_left_child): 232 | anchor_str = ''.join(' ' + a for a in anchor) 233 | s = '' 234 | if is_left_child is not None: 235 | s += anchor_str + ' {}{}: '.format('<' if is_left_child else GEQ, parent_split_value) 236 | 237 | if self.is_leaf(): 238 | s += 'y={}, n={}'.format(self._predict_leaf(), self.n_data_) 239 | if not self.is_regression: 240 | s += ', p(y)={}'.format(self._compute_posterior_mean()) 241 | else: 242 | s += '{}={}'.format(self.split_feature_name_, self.split_value_) 243 | 244 | s += '\n' 245 | anchor_child1 = [VERT_RIGHT] if len(anchor) == 0 else (anchor[:-1] + [(BAR if is_left_child else ' '), VERT_RIGHT]) 246 | s += self.child1_._str(anchor_child1, self.split_value_, VERT_RIGHT, DOWN_RIGHT, BAR, GEQ, True) 247 | 248 | s += '\n' 249 | anchor_child2 = [DOWN_RIGHT] if len(anchor) == 0 else (anchor[:-1] + [(BAR if is_left_child else ' '), DOWN_RIGHT]) 250 | s += self.child2_._str(anchor_child2, self.split_value_, VERT_RIGHT, DOWN_RIGHT, BAR, GEQ, False) 251 | return s 252 | -------------------------------------------------------------------------------- /bayesian_decision_tree/classification.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | This module declares the Bayesian classification tree models: 3 | * PerpendicularClassificationTree 4 | * HyperplaneClassificationTree 5 | """ 6 | import numpy as np 7 | from abc import ABC 8 | from sklearn.base import ClassifierMixin 9 | 10 | from bayesian_decision_tree.base import BaseTree 11 | from bayesian_decision_tree.base_hyperplane import BaseHyperplaneTree 12 | from bayesian_decision_tree.base_perpendicular import BasePerpendicularTree 13 | from bayesian_decision_tree.utils import multivariate_betaln 14 | 15 | 16 | class BaseClassificationTree(BaseTree, ABC, ClassifierMixin): 17 | """ 18 | Abstract base class of all Bayesian classification trees (perpendicular and hyperplane). Performs 19 | medium-level fitting and prediction tasks and outsources the low-level work to subclasses. 20 | """ 21 | 22 | def __init__(self, partition_prior, prior, delta, prune, child_type, split_precision, level=0): 23 | BaseTree.__init__(self, partition_prior, prior, delta, prune, child_type, False, split_precision, level) 24 | 25 | def predict_proba(self, X): 26 | """Predict class probabilities of the input samples X. 27 | 28 | Parameters 29 | ---------- 30 | X : array-like, scipy.sparse.csc_matrix, scipy.sparse.csr_matrix or pandas.DataFrame, shape = [n_samples, n_features] 31 | The input samples. 32 | 33 | Returns 34 | ------- 35 | p : array of shape = [n_samples, n_classes] 36 | The class probabilities of the input samples. 37 | """ 38 | 39 | # input transformation and checks 40 | X, _ = self._normalize_data_and_feature_names(X) 41 | self._ensure_is_fitted(X) 42 | 43 | y_proba = [None] * X.shape[0] 44 | self._predict(X, np.arange(X.shape[0]), False, y_proba) 45 | 46 | return np.array(y_proba) 47 | 48 | def _check_target(self, y): 49 | if y.ndim != 1: 50 | raise ValueError('y should have 1 dimension but has {}'.format(y.ndim)) 51 | 52 | n_classes = len(self.prior) 53 | if not np.all(np.unique(y) == np.arange(0, n_classes)): 54 | raise ValueError('Expected target values 0..{} but found {}..{}'.format(n_classes - 1, y.min(), y.max())) 55 | 56 | def _get_prior(self, n_data, n_dim): 57 | if self.prior is not None: 58 | return self.prior 59 | else: 60 | prior_pseudo_observation_count = max(1, n_data//100) 61 | return prior_pseudo_observation_count * np.ones(n_dim) 62 | 63 | def _compute_log_p_data_no_split(self, y, prior): 64 | posterior = self._compute_posterior(y, prior) 65 | 66 | log_p_prior = np.log(1-self.partition_prior**(1+self.level)) 67 | log_p_data = multivariate_betaln(posterior) - multivariate_betaln(prior) 68 | 69 | return log_p_prior + log_p_data 70 | 71 | def _compute_log_p_data_split(self, y, prior, n_dim, split_indices): 72 | n_classes = len(prior) 73 | k1 = np.empty(n_classes, dtype=object) 74 | k2 = np.empty(n_classes, dtype=object) 75 | for i in range(n_classes): 76 | k1_and_total = (y == i).cumsum() 77 | total = k1_and_total[-1] 78 | k1[i] = k1_and_total[split_indices-1] 79 | k2[i] = total - k1[i] 80 | 81 | n_splits = len(split_indices) 82 | log_p_prior = np.log(self.partition_prior**(1+self.level) / (n_splits * n_dim)) 83 | 84 | betaln_prior = multivariate_betaln(prior) 85 | log_p_data1 = self._compute_log_p_data(k1, prior, betaln_prior) 86 | log_p_data2 = self._compute_log_p_data(k2, prior, betaln_prior) 87 | 88 | return log_p_prior + log_p_data1 + log_p_data2 89 | 90 | def _compute_posterior(self, y, prior, delta=1): 91 | if delta == 0: 92 | return prior 93 | 94 | # see 
https://en.wikipedia.org/wiki/Conjugate_prior#Discrete_distributions 95 | y_reshaped = np.broadcast_to(y, (len(prior), len(y))) 96 | classes = np.arange(len(prior)).reshape(-1, 1) 97 | k = np.sum(y_reshaped == classes, axis=1) 98 | posterior = prior + delta*k 99 | 100 | return posterior 101 | 102 | def _compute_posterior_mean(self): 103 | return self.posterior_ / np.sum(self.posterior_) 104 | 105 | def _compute_log_p_data(self, k, prior, betaln_prior): 106 | # see https://www.cs.ubc.ca/~murphyk/Teaching/CS340-Fall06/reading/bernoulli.pdf, equation (42) 107 | # which can be expressed as a fraction of beta functions 108 | return multivariate_betaln(prior+k) - betaln_prior 109 | 110 | def _predict_leaf(self): 111 | # predict class 112 | return np.argmax(self.posterior_) 113 | 114 | def _get_raw_leaf_data_internal(self): 115 | # prior and posterior raw data 116 | return np.array([self.prior, self.posterior_]) 117 | 118 | 119 | class PerpendicularClassificationTree(BasePerpendicularTree, BaseClassificationTree): 120 | """ 121 | Bayesian binary or multiclass classification tree. Uses a Dirichlet prior (a 122 | multivariate generalization of the Beta prior for more than 2 variables). 123 | 124 | Parameters 125 | ---------- 126 | partition_prior : float, must be > 0.0 and < 1.0, typical value: 0.9 127 | The prior probability of splitting a node's data into two children. 128 | 129 | Small values tend to reduce the tree depth, leading to less expressiveness 130 | but also to less overfitting. 131 | 132 | Large values tend to increase the tree depth and thus lead to the tree 133 | better fitting the data, which can lead to overfitting. 134 | 135 | prior : array_like, shape = [number of classes] 136 | The hyperparameters [alpha_0, alpha_1, ..., alpha_{N-1}] of the Dirichlet 137 | conjugate prior, see [1] and [2]. All alpha_i must be positive, where 138 | alpha_i represents the number of prior pseudo-observations of class i. 139 | 140 | Small values for alpha_i represent a weak prior which leads to the 141 | training data dominating the posterior. This can lead to overfitting. 142 | 143 | Large values for alpha_i represent a strong prior and thus put less weight 144 | on the data. This can be used for regularization. 145 | 146 | delta : float, default=0.0 147 | Determines the strengthening of the prior as the tree grows deeper, 148 | see [1]. Must be a value between 0.0 and 1.0. 149 | 150 | prune : boolean, default=False 151 | Prunes the tree after fitting if `True` by removing all splits that don't add information, 152 | i.e., where the predictions of both children are identical. It's usually sensible to set 153 | this to `True` in the classification case if you're only interested in class predictions 154 | (`predict(X)`), but it makes sense to set it to `False` if you're looking for class 155 | probabilities (`predict_proba(X)`). It can safely be set to 'True' in the regression case 156 | because it will only merge children if their predictions are identical. 157 | 158 | split_precision : float, default=0.0 159 | Determines the minimum distance between two contiguous points to consider a split. If the distance is below 160 | this threshold, the points are considered to overlap along this direction. 161 | 162 | level : DO NOT SET, ONLY USED BY SUBCLASSES 163 | 164 | See also 165 | -------- 166 | demo_classification_perpendicular.py 167 | PerpendicularRegressionTree 168 | HyperplaneClassificationTree 169 | 170 | References 171 | ---------- 172 | 173 | .. 
[1] https://en.wikipedia.org/wiki/Dirichlet_distribution#Conjugate_to_categorical/multinomial 174 | 175 | .. [2] https://en.wikipedia.org/wiki/Conjugate_prior#Discrete_distributions 176 | 177 | Examples 178 | -------- 179 | See `demo_classification_perpendicular.py`. 180 | """ 181 | 182 | def __init__(self, partition_prior=0.99, prior=None, delta=0, prune=False, split_precision=0.0, level=0): 183 | child_type = PerpendicularClassificationTree 184 | BasePerpendicularTree.__init__(self, partition_prior, prior, delta, prune, child_type, False, split_precision, level) 185 | BaseClassificationTree.__init__(self, partition_prior, prior, delta, prune, child_type, split_precision, level) 186 | 187 | 188 | class HyperplaneClassificationTree(BaseHyperplaneTree, BaseClassificationTree): 189 | """ 190 | Bayesian binary or multiclass classification tree using arbitrarily-oriented 191 | hyperplane splits. Uses a Dirichlet prior (a multivariate generalization 192 | of the Beta prior for more than 2 variables). 193 | 194 | Parameters 195 | ---------- 196 | partition_prior : float, must be > 0.0 and < 1.0, typical value: 0.9 197 | The prior probability of splitting a node's data into two children. 198 | 199 | Small values tend to reduce the tree depth, leading to less expressiveness 200 | but also to less overfitting. 201 | 202 | Large values tend to increase the tree depth and thus lead to the tree 203 | better fitting the data, which can lead to overfitting. 204 | 205 | prior : array_like, shape = [number of classes] 206 | The hyperparameters [alpha_0, alpha_1, ..., alpha_{N-1}] of the Dirichlet 207 | conjugate prior, see [1] and [2]. All alpha_i must be positive, where 208 | alpha_i represents the number of prior pseudo-observations of class i. 209 | 210 | Small values for alpha_i represent a weak prior which leads to the 211 | training data dominating the posterior. This can lead to overfitting. 212 | 213 | Large values for alpha_i represent a strong prior and thus put less weight 214 | on the data. This can be used for regularization. 215 | 216 | delta : float, default=0.0 217 | Determines the strengthening of the prior as the tree grows deeper, 218 | see [1]. Must be a value between 0.0 and 1.0. 219 | 220 | prune : boolean, default=False 221 | Prunes the tree after fitting if `True` by removing all splits that don't add information, 222 | i.e., where the predictions of both children are identical. It's usually sensible to set 223 | this to `True` in the classification case if you're only interested in class predictions 224 | (`predict(X)`), but it makes sense to set it to `False` if you're looking for class 225 | probabilities (`predict_proba(X)`). It can safely be set to 'True' in the regression case 226 | because it will only merge children if their predictions are identical. 227 | 228 | optimizer : object 229 | A global optimization algorithm object that performs optimal hyperparameter 230 | orientation search. The available options are (in the order in which you should 231 | try them): 232 | - ScipyOptimizer: A wrapper around scipy global optimizers. See usages for examples. 
233 |           - SimulatedAnnealingOptimizer: Experimental, but works well with n_scan=20, n_keep=10, spread_factor=0.95
234 |           - RandomHyperplaneOptimizer: Experimental, mediocre performance
235 |           - RandomTwoPointOptimizer: Experimental, mediocre performance
236 |           - GradientDescentOptimizer: Experimental, mediocre performance
237 | 
238 |     split_precision : float, default=0.0
239 |         Determines the minimum distance between two contiguous points to consider a split. If the distance is below
240 |         this threshold, the points are considered to overlap along this direction.
241 | 
242 |     level : DO NOT SET, ONLY USED BY SUBCLASSES
243 | 
244 |     See also
245 |     --------
246 |     demo_classification_hyperplane.py
247 |     HyperplaneRegressionTree
248 |     PerpendicularClassificationTree
249 | 
250 |     References
251 |     ----------
252 | 
253 |     .. [1] https://en.wikipedia.org/wiki/Dirichlet_distribution#Conjugate_to_categorical/multinomial
254 | 
255 |     .. [2] https://en.wikipedia.org/wiki/Conjugate_prior#Discrete_distributions
256 | 
257 |     Examples
258 |     --------
259 |     See `demo_classification_hyperplane.py`.
260 |     """
261 | 
262 |     def __init__(self, partition_prior=0.99, prior=None, delta=0, prune=False, optimizer=None, split_precision=0.0, level=0):
263 |         child_type = HyperplaneClassificationTree
264 |         BaseHyperplaneTree.__init__(self, partition_prior, prior, delta, prune, child_type, False, optimizer, split_precision, level)
265 |         BaseClassificationTree.__init__(self, partition_prior, prior, delta, prune, child_type, split_precision, level)
266 | 
-------------------------------------------------------------------------------- /bayesian_decision_tree/hyperplane_optimization.py: -------------------------------------------------------------------------------- 1 | import numpy as np
2 | from abc import ABC, abstractmethod
3 | from numpy.random import RandomState
4 | from scipy.sparse import csr_matrix, csc_matrix
5 | 
6 | from bayesian_decision_tree.utils import r2_series_generator, hypercube_to_hypersphere_surface
7 | 
8 | 
9 | class HyperplaneOptimizationFunction:
10 |     """
11 |     The function to optimize for hyperplane trees. This is a function of `n_dim` variables representing
12 |     the normal vector of a hyperplane in `n_dim` dimensions. Given such a hyperplane normal the function
13 |     computes the optimum split location (i.e., the origin of the hyperplane) in the data such that the
14 |     data likelihood is maximized.
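    A rough usage sketch (illustrative only; the tree models construct and drive this
    object internally, and `candidate_normals` below stands for a hypothetical set of
    normals proposed by an optimizer):

        for normal in candidate_normals:
            neg_log_p = optimization_function.compute(normal)  # lower is better

        # the best split found so far is accumulated on the object itself
        best_normal = optimization_function.best_hyperplane_normal
        best_origin = optimization_function.best_hyperplane_origin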
15 |     """
16 | 
17 |     def __init__(self, X, y, prior, compute_log_p_data_split, log_p_data_no_split, search_space_is_unit_hypercube, split_precision):
18 |         self.X = X
19 |         self.y = y
20 |         self.prior = prior
21 |         self.compute_log_p_data_split = compute_log_p_data_split
22 |         self.log_p_data_no_split = log_p_data_no_split
23 |         self.search_space_is_unit_hypercube = search_space_is_unit_hypercube
24 |         self.split_precision = split_precision
25 | 
26 |         # results of the optimization - to be set later during the actual optimization
27 |         self.function_evaluations = 0
28 |         self.best_log_p_data_split = log_p_data_no_split
29 |         self.best_cumulative_distances = 0
30 |         self.best_hyperplane_normal = None
31 |         self.best_hyperplane_origin = None
32 | 
33 |     def compute(self, hyperplane_normal):
34 |         self.function_evaluations += 1
35 | 
36 |         if self.search_space_is_unit_hypercube:
37 |             hyperplane_normal = hypercube_to_hypersphere_surface(hyperplane_normal, half_hypersphere=True)
38 | 
39 |         # catch some special cases and normalize to unit length
40 |         hyperplane_normal = np.nan_to_num(hyperplane_normal)
41 |         if np.all(hyperplane_normal == 0):
42 |             hyperplane_normal[0] = 1
43 | 
44 |         hyperplane_normal /= np.linalg.norm(hyperplane_normal)
45 | 
46 |         dense = isinstance(self.X, np.ndarray)
47 |         if not dense and isinstance(self.X, csr_matrix):
48 |             self.X = csc_matrix(self.X)
49 | 
50 |         # compute distance of all points to the hyperplane: https://mathinsight.org/distance_point_plane
51 |         projections = self.X @ hyperplane_normal  # up to an additive constant which doesn't matter to distance ordering
52 |         sort_indices = np.argsort(projections)
53 |         split_indices = 1 + np.where(np.abs(np.diff(projections[sort_indices])) > self.split_precision)[0]  # we can only split between *different* data points (in projection-sorted order)
54 |         if len(split_indices) == 0:
55 |             # no split possible along this dimension
56 |             return -self.log_p_data_no_split
57 | 
58 |         y_sorted = self.y[sort_indices]
59 | 
60 |         # compute data likelihoods of all possible splits along this projection and find split with highest data likelihood
61 |         n_dim = self.X.shape[1]
62 |         log_p_data_split = self.compute_log_p_data_split(y_sorted, self.prior, n_dim, split_indices)
63 |         i_max = log_p_data_split.argmax()
64 |         if log_p_data_split[i_max] >= self.best_log_p_data_split:
65 |             best_split_index = split_indices[i_max]
66 |             p1 = self.X[sort_indices[best_split_index-1]]
67 |             p2 = self.X[sort_indices[best_split_index]]
68 |             if not dense:
69 |                 p1 = p1.toarray()[0]
70 |                 p2 = p2.toarray()[0]
71 | 
72 |             hyperplane_origin = 0.5 * (p1 + p2)  # middle between the points that are being split
73 |             projections_with_origin = projections - np.dot(hyperplane_normal, hyperplane_origin)
74 |             cumulative_distances = np.sum(np.abs(projections_with_origin))
75 | 
76 |             if log_p_data_split[i_max] > self.best_log_p_data_split:
77 |                 is_log_p_better_or_same_but_with_better_distance = True
78 |             else:
79 |                 # accept new split with same log(p) only if it increases the cumulative distance of all points to the hyperplane
80 |                 is_log_p_better_or_same_but_with_better_distance = cumulative_distances > self.best_cumulative_distances
81 | 
82 |             if is_log_p_better_or_same_but_with_better_distance:
83 |                 self.best_log_p_data_split = log_p_data_split[i_max]
84 |                 self.best_cumulative_distances = cumulative_distances
85 |                 self.best_hyperplane_normal = hyperplane_normal
86 |                 self.best_hyperplane_origin = hyperplane_origin
87 | 
88 |         return -log_p_data_split[i_max]
89 | 
90 | 
91 | class StrMixin:
92 |     """Auto-generate `__str__()` and `__repr__()` from attributes."""
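    # Illustrative example (hypothetical class): an instance of a subclass `Foo` with
    # attributes n_mc=100 and seed=42 prints as 'Foo[n_mc=100, seed=42]'.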
93 | def __str__(self): 94 | attributes = ['{}={}'.format(k, v) for k, v in self.__dict__.items()] 95 | return '{}[{}]'.format(type(self).__name__, ', '.join(attributes)) 96 | 97 | def __repr__(self): 98 | return self.__str__() 99 | 100 | 101 | class HyperplaneOptimizer(ABC, StrMixin): 102 | """ 103 | Abstract base class of all hyperplane optimizers. 104 | """ 105 | 106 | def __init__(self, search_space_is_unit_hypercube): 107 | self.search_space_is_unit_hypercube = search_space_is_unit_hypercube 108 | 109 | @abstractmethod 110 | def solve(self, optimization_function): 111 | raise NotImplementedError 112 | 113 | 114 | class ScipyOptimizer(HyperplaneOptimizer): 115 | """An optimizer using one of the scipy global optimizers, see [1]. 116 | 117 | References 118 | ---------- 119 | .. [1] https://docs.scipy.org/doc/scipy/reference/optimize.html#global-optimization 120 | """ 121 | 122 | def __init__(self, solver_type, seed, **extra_solver_kwargs): 123 | super().__init__(search_space_is_unit_hypercube=True) 124 | 125 | self.solver_type = solver_type 126 | self.seed = seed 127 | self.extra_solver_kwargs = extra_solver_kwargs 128 | 129 | def solve(self, optimization_function): 130 | # bounds for scipy optimizers: unit hypercube (will be mapped to 131 | # (half) hypersphere uniformly later on) 132 | X = optimization_function.X 133 | n_dim = X.shape[1] 134 | unit_hypercube_bounds = np.vstack((np.zeros(n_dim-1), np.ones(n_dim-1))).T 135 | 136 | solver = self.solver_type( 137 | func=optimization_function.compute, 138 | bounds=unit_hypercube_bounds, 139 | seed=self.seed, 140 | **self.extra_solver_kwargs) 141 | 142 | solver.solve() 143 | 144 | 145 | class RandomTwoPointOptimizer(HyperplaneOptimizer): 146 | """ 147 | An optimizer randomly choosing two points of different classes to construct 148 | a bisecting hyperplane (experimental). 
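    A minimal usage sketch (parameter values are arbitrary examples; note that this
    optimizer only applies to classification because it needs class labels to pick
    the point pairs):

        optimizer = RandomTwoPointOptimizer(n_mc=1000, seed=42)
        model = HyperplaneClassificationTree(optimizer=optimizer)
        model.fit(X, y)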
149 | TODO: Complete 150 | """ 151 | 152 | def __init__(self, n_mc, seed): 153 | super().__init__(search_space_is_unit_hypercube=False) 154 | 155 | self.n_mc = n_mc 156 | self.seed = seed 157 | 158 | def solve(self, optimization_function): 159 | rand = RandomState(self.seed) 160 | 161 | X = optimization_function.X 162 | y = optimization_function.y 163 | 164 | if np.any(np.round(y) != y): 165 | raise TypeError('Cannot use {} for regression problems as there are no classes to pick points from'.format( 166 | RandomTwoPointOptimizer.__name__)) 167 | 168 | dense = isinstance(X, np.ndarray) 169 | 170 | if len(set(y)) <= 1: 171 | # can't pick two points of different classes if there aren't at least two classes 172 | return 173 | 174 | # find indices of each class 175 | n_classes = int(y.max()) + 1 176 | class_indices = [np.where(y == i)[0] for i in range(n_classes)] 177 | 178 | # evaluate 'n_mc' hyperplane normals passing through two random points form different classes 179 | for i in range(self.n_mc): 180 | indices1 = [] 181 | indices2 = [] 182 | 183 | while len(indices1) == 0 or len(indices2) == 0: 184 | class1 = rand.randint(0, n_classes) 185 | indices1 = class_indices[class1] 186 | 187 | class2 = class1 188 | while class2 == class1: 189 | class2 = rand.randint(0, n_classes) 190 | 191 | indices2 = class_indices[class2] 192 | 193 | p1 = X[indices1[rand.randint(0, len(indices1))]] 194 | p2 = X[indices2[rand.randint(0, len(indices2))]] 195 | if not dense: 196 | p1 = p1.toarray()[0] 197 | p2 = p2.toarray()[0] 198 | 199 | normal = p2-p1 200 | if normal[0] < 0: 201 | normal *= -1 # make sure the first coordinate is positive to match the scipy search space 202 | 203 | optimization_function.compute(normal) 204 | 205 | 206 | class RandomHyperplaneOptimizer(HyperplaneOptimizer): 207 | """ 208 | An optimizer generating hyperplanes with random orientation 209 | in space (experimental). 
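    Candidate normals are drawn from an isotropic standard normal distribution; after
    normalization to unit length (performed in `HyperplaneOptimizationFunction.compute`)
    this yields uniformly distributed orientations on the hypersphere.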
210 | TODO: Complete 211 | """ 212 | 213 | def __init__(self, n_mc, seed): 214 | super().__init__(search_space_is_unit_hypercube=False) 215 | 216 | self.n_mc = n_mc 217 | self.seed = seed 218 | 219 | def solve(self, optimization_function): 220 | rand = RandomState(self.seed) 221 | 222 | X = optimization_function.X 223 | n_dim = X.shape[1] 224 | 225 | for i in range(self.n_mc): 226 | hyperplane_normal = rand.normal(0, 1, n_dim) 227 | optimization_function.compute(hyperplane_normal) 228 | 229 | 230 | class QuasiRandomHyperplaneOptimizer(HyperplaneOptimizer): 231 | """ 232 | An optimizer generating hyperplanes with quasi-random orientation 233 | in space, see 234 | http://extremelearning.com.au/unreasonable-effectiveness-of-quasirandom-sequences/ 235 | """ 236 | 237 | def __init__(self, n): 238 | super().__init__(search_space_is_unit_hypercube=True) 239 | 240 | self.n = n 241 | 242 | def solve(self, optimization_function): 243 | X = optimization_function.X 244 | n_dim = X.shape[1] 245 | n_dim_surface = n_dim-1 246 | 247 | # quasi-random R2 sequence 248 | r2gen = r2_series_generator(n_dim_surface) 249 | for i in range(self.n): 250 | uniform = next(r2gen) 251 | optimization_function.compute(uniform) 252 | 253 | 254 | class OptunaOptimizer(HyperplaneOptimizer): 255 | def __init__(self, n_trials, seed): 256 | super().__init__(search_space_is_unit_hypercube=True) 257 | 258 | self.n_trials = n_trials 259 | self.seed = seed 260 | 261 | def solve(self, optimization_function): 262 | from optuna import create_study 263 | from optuna.logging import set_verbosity 264 | from optuna.samplers import TPESampler 265 | 266 | study = create_study(direction='minimize', sampler=TPESampler(self.seed)) 267 | n_dim = optimization_function.X.shape[1] 268 | n_dim_surface = n_dim-1 269 | 270 | def objective(trial): 271 | uniform = np.zeros(n_dim_surface) 272 | for i in range(n_dim_surface): 273 | uniform[i] = trial.suggest_uniform(f'uniform[{i}]', 0, 1) 274 | 275 | return optimization_function.compute(uniform) 276 | 277 | set_verbosity(0) 278 | study.optimize(objective, n_trials=self.n_trials) 279 | 280 | 281 | class SimulatedAnnealingOptimizer(HyperplaneOptimizer): 282 | """ 283 | A simple simulated annealing optimizer (experimental). 
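    A minimal usage sketch (parameter values follow the suggestion given in the tree
    model docstrings; the seed is arbitrary):

        optimizer = SimulatedAnnealingOptimizer(n_scan=20, n_keep=10, spread_factor=0.95, seed=666)
        model = HyperplaneRegressionTree(optimizer=optimizer)
        model.fit(X, y)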
284 | TODO: Complete 285 | """ 286 | 287 | def __init__(self, n_scan, n_keep, spread_factor, seed): 288 | super().__init__(search_space_is_unit_hypercube=True) 289 | 290 | self.n_scan = n_scan 291 | self.n_keep = n_keep 292 | self.spread_factor = spread_factor 293 | self.seed = seed 294 | 295 | def solve(self, optimization_function): 296 | rand = RandomState(self.seed) 297 | 298 | X = optimization_function.X 299 | n_dim = X.shape[1]-1 300 | 301 | candidates = {} 302 | 303 | no_improvements = 0 304 | best_value = np.inf 305 | 306 | f = 1 307 | while no_improvements < 50: 308 | if len(candidates) == 0: 309 | # first run 310 | for i in range(self.n_scan): 311 | candidate = rand.uniform(0, 1, n_dim) 312 | value = optimization_function.compute(candidate) 313 | candidates[value] = candidate 314 | else: 315 | # evolution 316 | vectors = list(candidates.values()) 317 | ranges = [np.max([v[i] for v in vectors]) - np.min([v[i] for v in vectors]) for i in range(n_dim)] 318 | 319 | values_sorted = sorted(candidates.keys()) 320 | best_value = values_sorted[0] 321 | for i in range(self.n_keep): 322 | i_candidate = i*len(values_sorted)//self.n_keep 323 | candidate = candidates[values_sorted[i_candidate]] 324 | # perturbation = ranges * rand.uniform(-1, 1, len(ranges)) 325 | perturbation = f * rand.uniform(-1, 1, len(ranges)) 326 | new_candidate = candidate + perturbation 327 | new_candidate = np.clip(new_candidate, 0, 1) 328 | value = optimization_function.compute(new_candidate) 329 | candidates[value] = new_candidate 330 | 331 | f *= self.spread_factor 332 | 333 | # only keep the best candidates 334 | values_sorted = sorted(candidates.keys()) 335 | values_sorted = values_sorted[:self.n_keep] 336 | if values_sorted[0] < best_value: 337 | no_improvements = 0 338 | else: 339 | no_improvements += 1 340 | 341 | candidates = {v: candidates[v] for v in values_sorted} 342 | 343 | 344 | class GradientDescentOptimizer(HyperplaneOptimizer): 345 | """ 346 | A simple gradient descent optimizer (experimental). 
347 | TODO: Complete 348 | """ 349 | 350 | def __init__(self, n_init, n_keep): 351 | super().__init__(search_space_is_unit_hypercube=True) 352 | 353 | self.n_init = n_init 354 | self.n_keep = n_keep 355 | 356 | def solve(self, optimization_function): 357 | X = optimization_function.X 358 | n_dim = X.shape[1]-1 359 | 360 | rand = RandomState(666) 361 | 362 | candidates = {} 363 | 364 | no_improvements = 0 365 | best_value = np.inf 366 | 367 | start_delta = 1e-6 368 | while no_improvements < 3: 369 | if len(candidates) == 0: 370 | # first run 371 | for i in range(self.n_init): 372 | candidate = rand.uniform(0, 1, n_dim) 373 | value = optimization_function.compute(candidate) 374 | candidates[value] = candidate 375 | else: 376 | # compute numerical gradient for each of the best vectors 377 | values_sorted = sorted(candidates.keys()) 378 | best_value = values_sorted[0] 379 | for i in range(self.n_keep): 380 | i_candidate = i*len(values_sorted)//self.n_keep 381 | value = values_sorted[i_candidate] 382 | candidate = candidates[value] 383 | 384 | gradient = np.zeros(n_dim) 385 | delta = start_delta 386 | 387 | while True: 388 | delta_too_small = False 389 | 390 | for i_dim in range(n_dim): 391 | new_candidate = candidate.copy() 392 | new_candidate[i_dim] += delta 393 | if new_candidate[i_dim] > 1: 394 | delta *= -1 395 | new_candidate[i_dim] = candidate[i_dim] + delta 396 | 397 | new_value = optimization_function.compute(new_candidate) 398 | gradient[i_dim] = (new_value - value) / delta 399 | delta = np.abs(delta) 400 | if gradient[i_dim] == 0: 401 | delta_too_small = True 402 | break 403 | 404 | if delta_too_small: 405 | delta *= 10 406 | if delta >= 1: 407 | # can't compute gradient, so give up 408 | break 409 | else: 410 | break 411 | 412 | if delta_too_small: 413 | continue 414 | 415 | start_delta = delta / 10 416 | 417 | # add gradient to vector 418 | lambda_ = 1e-6 419 | best_new_candidate = candidate 420 | best_new_value = value 421 | while True: 422 | new_candidate = candidate - lambda_ * gradient 423 | new_candidate = np.clip(new_candidate, 0, 1) 424 | new_value = optimization_function.compute(new_candidate) 425 | if new_value < best_new_value: 426 | lambda_ *= 2 427 | best_new_candidate = new_candidate 428 | best_new_value = new_value 429 | else: 430 | break 431 | 432 | candidates[best_new_value] = best_new_candidate 433 | 434 | # only keep the best candidates 435 | values_sorted = sorted(candidates.keys()) 436 | values_sorted = values_sorted[:self.n_keep] 437 | if values_sorted[0] < best_value: 438 | no_improvements = 0 439 | else: 440 | no_improvements += 1 441 | 442 | candidates = {v: candidates[v] for v in values_sorted} 443 | -------------------------------------------------------------------------------- /bayesian_decision_tree/regression.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module declares the Bayesian regression tree models: 3 | * PerpendicularRegressionTree 4 | * HyperplaneRegressionTree 5 | """ 6 | import numpy as np 7 | from abc import ABC 8 | from scipy.special import gammaln 9 | from sklearn.base import RegressorMixin 10 | 11 | from bayesian_decision_tree.base import BaseTree 12 | from bayesian_decision_tree.base_hyperplane import BaseHyperplaneTree 13 | from bayesian_decision_tree.base_perpendicular import BasePerpendicularTree 14 | 15 | 16 | class BaseRegressionTree(BaseTree, ABC, RegressorMixin): 17 | """ 18 | Abstract base class of all Bayesian regression trees (perpendicular and hyperplane). 
Performs 19 | medium-level fitting and prediction tasks and outsources the low-level work to subclasses. 20 | """ 21 | 22 | def __init__(self, partition_prior, prior, delta, prune, child_type, split_precision, level=0): 23 | BaseTree.__init__(self, partition_prior, prior, delta, prune, child_type, True, split_precision, level) 24 | 25 | def _check_target(self, y): 26 | if y.ndim != 1: 27 | raise ValueError('y should have 1 dimension but has {}'.format(y.ndim)) 28 | 29 | def _compute_log_p_data_no_split(self, y, prior): 30 | y_sum = y.sum() 31 | y_squared_sum = (y ** 2).sum() 32 | n = len(y) 33 | mu_post, kappa_post, alpha_post, beta_post = self._compute_posterior_internal(prior, n, y_sum, y_squared_sum) 34 | log_p_prior = np.log(1 - self.partition_prior**(1 + self.level)) 35 | log_p_data = self._compute_log_p_data(prior, alpha_post, beta_post, kappa_post, n) 36 | 37 | return log_p_prior + log_p_data 38 | 39 | def _compute_log_p_data_split(self, y, prior, n_dim, split_indices): 40 | n = len(y) 41 | n1 = np.arange(1, n) 42 | n2 = n - n1 43 | y_sum1 = y.cumsum()[:-1] 44 | y_sum2 = y.sum() - y_sum1 45 | y_squared_sum1 = (y[:-1] ** 2).cumsum() 46 | y_squared_sum2 = (y ** 2).sum() - y_squared_sum1 47 | 48 | if len(split_indices) != len(y)-1: 49 | # we are *not* splitting between all data points -> indexing necessary 50 | split_indices_minus_1 = split_indices - 1 51 | 52 | n1 = n1[split_indices_minus_1] 53 | n2 = n2[split_indices_minus_1] 54 | y_sum1 = y_sum1[split_indices_minus_1] 55 | y_sum2 = y_sum2[split_indices_minus_1] 56 | y_squared_sum1 = y_squared_sum1[split_indices_minus_1] 57 | y_squared_sum2 = y_squared_sum2[split_indices_minus_1] 58 | 59 | mu1, kappa1, alpha1, beta1 = self._compute_posterior_internal(prior, n1, y_sum1, y_squared_sum1) 60 | mu2, kappa2, alpha2, beta2 = self._compute_posterior_internal(prior, n2, y_sum2, y_squared_sum2) 61 | 62 | n_splits = len(split_indices) 63 | log_p_prior = np.log(self.partition_prior**(1+self.level) / (n_splits * n_dim)) 64 | 65 | log_p_data1 = self._compute_log_p_data(prior, alpha1, beta1, kappa1, n1) 66 | log_p_data2 = self._compute_log_p_data(prior, alpha2, beta2, kappa2, n2) 67 | 68 | return log_p_prior + log_p_data1 + log_p_data2 69 | 70 | def _get_prior(self, n_data, n_dim): 71 | if self.prior is not None: 72 | return self.prior 73 | else: 74 | # TODO: use actual data to compute mu and tau 75 | prior_pseudo_observation_count = max(1, n_data//100) 76 | mu = 0 77 | tau = 1 78 | kappa = prior_pseudo_observation_count 79 | alpha = prior_pseudo_observation_count/2 80 | beta = alpha/tau 81 | return np.array([mu, kappa, alpha, beta]) 82 | 83 | def _compute_posterior(self, y, prior, delta=1): 84 | if delta == 0: 85 | return prior 86 | 87 | n = len(y) 88 | y_sum = y.sum() 89 | y_squared_sum = (y ** 2).sum() 90 | 91 | return self._compute_posterior_internal(prior, n, y_sum, y_squared_sum, delta) 92 | 93 | def _compute_posterior_internal(self, prior, n, y_sum, y_squared_sum, delta=1): 94 | mu, kappa, alpha, beta = prior 95 | 96 | # see https://www.cs.ubc.ca/~murphyk/Papers/bayesGauss.pdf, equations (86) - (89) 97 | n_delta = n*delta 98 | kappa_post = kappa + n_delta 99 | mu_post = (kappa * mu + n_delta * y_sum / n) / kappa_post 100 | alpha_post = alpha + 0.5*n_delta 101 | beta_post = beta + 0.5 * delta * (y_squared_sum - y_sum ** 2 / n) + 0.5 * kappa * n_delta * ( 102 | y_sum / n - mu) ** 2 / (kappa + n) 103 | 104 | return mu_post, kappa_post, alpha_post, beta_post 105 | 106 | def _compute_posterior_mean(self): 107 | return self.posterior_[0] # mu is 
the posterior mean 108 | 109 | def _compute_log_p_data(self, prior, alpha_new, beta_new, kappa_new, n_new): 110 | mu, kappa, alpha, beta = prior 111 | 112 | # see https://www.cs.ubc.ca/~murphyk/Papers/bayesGauss.pdf, equation (95) 113 | return (gammaln(alpha_new) - gammaln(alpha) 114 | + alpha*np.log(beta) - alpha_new*np.log(beta_new) 115 | + 0.5*np.log(kappa/kappa_new) 116 | - 0.5*n_new*np.log(2*np.pi)) 117 | 118 | def _predict_leaf(self): 119 | # predict posterior mean 120 | return self._compute_posterior_mean() 121 | 122 | def _get_raw_leaf_data_internal(self): 123 | # prior and posterior raw data 124 | return np.array([self.prior, self.posterior_]) 125 | 126 | 127 | class PerpendicularRegressionTree(BasePerpendicularTree, BaseRegressionTree): 128 | """ 129 | Bayesian regression tree using axes-normal splits ("perpendicular"). 130 | Uses a Normal-gamma(mu, kappa, alpha, beta) prior assuming unknown mean and unknown variance. 131 | 132 | Parameters 133 | ---------- 134 | partition_prior : float, must be > 0.0 and < 1.0, typical value: 0.9 135 | The prior probability of splitting a node's data into two children. 136 | 137 | Small values tend to reduce the tree depth, leading to less expressiveness 138 | but also to less overfitting. 139 | 140 | Large values tend to increase the tree depth and thus lead to the tree 141 | better fitting the data, which can lead to overfitting. 142 | 143 | prior : array_like, shape = [4] 144 | The prior hyperparameters [mu, kappa, alpha, beta] of the Normal-gamma 145 | distribution (see also [1], [2], [3]): 146 | 147 | - mu: prior pseudo-observation sample mean 148 | - kappa: prior pseudo-observation count used to compute mu 149 | - alpha: (prior pseudo-observation count used to compute sample variance)/2 150 | - beta: alpha * (prior pseudo-observation sample variance) 151 | 152 | It is usually easier to compute these hyperparameters off more intuitive 153 | base quantities, see examples section. 154 | 155 | delta : float, default=0.0 156 | Determines the strengthening of the prior as the tree grows deeper, 157 | see [1]. Must be a value between 0.0 and 1.0. 158 | 159 | split_precision : float, default=0.0 160 | Determines the minimum distance between two contiguous points to consider a split. If the distance is below 161 | this threshold, the points are considered to overlap along this direction. 162 | 163 | level : DO NOT SET, ONLY USED BY SUBCLASSES 164 | 165 | See also 166 | -------- 167 | demo_regression_perpendicular.py 168 | PerpendicularClassificationTree 169 | HyperplaneRegressionTree 170 | 171 | References 172 | ---------- 173 | 174 | .. [1] https://en.wikipedia.org/wiki/Normal-gamma_distribution 175 | 176 | .. [2] https://en.wikipedia.org/wiki/Normal-gamma_distribution#Interpretation_of_parameters 177 | 178 | .. [3] https://en.wikipedia.org/wiki/Conjugate_prior#Continuous_distributions 179 | 180 | Examples 181 | -------- 182 | It is usually convenient to compute the prior hyperparameters as follows: 183 | 184 | >>> # prior mean; set to the mean of the target 185 | >>> mu = ... 186 | >>> 187 | >>> # prior standard deviation; set to about 0.1 times the standard deviation of the target 188 | >>> sd_prior = ... 189 | >>> 190 | >>> # the number of prior pseudo-observations; set to roughly 1 - 10 % of the number of training samples 191 | >>> prior_pseudo_observations = ... 
192 | >>> 193 | >>> # now compute the prior 194 | >>> kappa = prior_pseudo_observations 195 | >>> alpha = prior_pseudo_observations/2 196 | >>> beta = alpha*sd_prior**2 197 | >>> prior = [mu, kappa, alpha, beta] 198 | 199 | See `demo_regression_perpendicular.py`. 200 | """ 201 | 202 | def __init__(self, partition_prior=0.99, prior=None, delta=0, prune=False, split_precision=0.0, level=0): 203 | child_type = PerpendicularRegressionTree 204 | BasePerpendicularTree.__init__(self, partition_prior, prior, delta, prune, child_type, True, split_precision, level) 205 | BaseRegressionTree.__init__(self, partition_prior, prior, delta, prune, child_type, split_precision, level) 206 | 207 | 208 | class HyperplaneRegressionTree(BaseHyperplaneTree, BaseRegressionTree): 209 | """ 210 | Bayesian regression tree using arbitrarily-oriented hyperplane splits. 211 | Uses a Normal-gamma(mu, kappa, alpha, beta) prior assuming unknown mean and unknown variance. 212 | 213 | Parameters 214 | ---------- 215 | partition_prior : float, must be > 0.0 and < 1.0, typical value: 0.9 216 | The prior probability of splitting a node's data into two children. 217 | 218 | Small values tend to reduce the tree depth, leading to less expressiveness 219 | but also to less overfitting. 220 | 221 | Large values tend to increase the tree depth and thus lead to the tree 222 | better fitting the data, which can lead to overfitting. 223 | 224 | prior : array_like, shape = [4] 225 | The prior hyperparameters [mu, kappa, alpha, beta] of the Normal-gamma 226 | distribution (see also [1], [2], [3]): 227 | 228 | - mu: prior pseudo-observation sample mean 229 | - kappa: prior pseudo-observation count used to compute mu 230 | - alpha: (prior pseudo-observation count used to compute sample variance)/2 231 | - beta: alpha * (prior pseudo-observation sample variance) 232 | 233 | It is usually easier to compute these hyperparameters off more intuitive 234 | base quantities, see examples section. 235 | 236 | delta : float, default=0.0 237 | Determines the strengthening of the prior as the tree grows deeper, 238 | see [1]. Must be a value between 0.0 and 1.0. 239 | 240 | optimizer : object 241 | A global optimization algorithm object that performs optimal hyperparameter 242 | orientation search. The available options are (in the order in which you should 243 | try them): 244 | - ScipyOptimizer: A wrapper around scipy global optimizers. See usages for examples. 245 | - SimulatedAnnealingOptimizer: Experimental, but works well with n_scan=20, n_keep=10, spread_factor=0.95 246 | - RandomHyperplaneOptimizer: Experimental, mediocre performance 247 | - RandomTwoPointOptimizer: Experimental, mediocre performance 248 | - GradientDescentOptimizer: Experimental, mediocre performance 249 | 250 | split_precision : float, default=0.0 251 | Determines the minimum distance between two contiguous points to consider a split. If the distance is below 252 | this threshold, the points are considered to overlap along this direction. 253 | 254 | level : DO NOT SET, ONLY USED BY SUBCLASSES 255 | 256 | See also 257 | -------- 258 | demo_regression_hyperplane.py 259 | HyperplaneClassificationTree 260 | PerpendicularRegressionTree 261 | 262 | References 263 | ---------- 264 | 265 | .. [1] https://en.wikipedia.org/wiki/Normal-gamma_distribution 266 | 267 | .. [2] https://en.wikipedia.org/wiki/Normal-gamma_distribution#Interpretation_of_parameters 268 | 269 | .. 
[3] https://en.wikipedia.org/wiki/Conjugate_prior#Continuous_distributions 270 | 271 | Examples 272 | -------- 273 | It is usually convenient to compute the prior hyperparameters in the same manner as for 274 | the perpendicular case, see PerpendicularRegressionTree. 275 | 276 | See `demo_regression_hyperplane.py`. 277 | """ 278 | 279 | def __init__(self, partition_prior=0.99, prior=None, delta=0, prune=False, optimizer=None, split_precision=0.0, level=0): 280 | child_type = HyperplaneRegressionTree 281 | BaseHyperplaneTree.__init__(self, partition_prior, prior, delta, prune, child_type, True, optimizer, split_precision, level) 282 | BaseRegressionTree.__init__(self, partition_prior, prior, delta, prune, child_type, split_precision, level) 283 | -------------------------------------------------------------------------------- /bayesian_decision_tree/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Generator 2 | 3 | import numpy as np 4 | from scipy.special import betaln, gammaln 5 | 6 | 7 | def multivariate_betaln(alphas): 8 | if len(alphas) == 2: 9 | return betaln(alphas[0], alphas[1]) 10 | else: 11 | # see https://en.wikipedia.org/wiki/Beta_function#Multivariate_beta_function 12 | return np.sum([gammaln(alpha) for alpha in alphas], axis=0) - gammaln(np.sum(alphas)) 13 | 14 | 15 | def r2_series_generator(n_dim: int) -> Generator[np.ndarray, None, None]: 16 | """ 17 | Computes R2 pseudo-random sequence, see 18 | http://extremelearning.com.au/unreasonable-effectiveness-of-quasirandom-sequences/ 19 | 20 | :param n_dim: The number of dimensions of the output 21 | :return: R2 series data points 22 | """ 23 | 24 | if n_dim == 0: 25 | raise ValueError(f'n_dim must be > 0 but was {n_dim}') 26 | 27 | # compute phi 28 | phi = 2 29 | phi_old = phi 30 | while True: 31 | phi = pow(1+phi, 1/(n_dim+1)) 32 | if phi == phi_old: 33 | break 34 | 35 | phi_old = phi 36 | 37 | # compute alpha array 38 | alpha = 1/phi**(1+np.arange(n_dim)) 39 | 40 | # compute R2 sequence 41 | i = 0 42 | while True: 43 | yield (0.5 + alpha * (i+1)) % 1 44 | i += 1 45 | 46 | 47 | def hypercube_to_hypersphere_surface( 48 | hypercube_points: np.ndarray, 49 | half_hypersphere: bool) -> np.ndarray: 50 | """ 51 | Converts uniformly distributed points from a D-dimensional hypercube, [0, 1]^D, 52 | to uniformly distributed points on the the D-dimensional surface of a hyperplane 53 | (embedded in (D+1)-dimensional space), see algorithm 'YPHL' in 54 | https://core.ac.uk/download/pdf/82404670.pdf with 'n' = D and 'd' = 0 (specifying 55 | the hypersphere surface rather than the volume) 56 | 57 | :param hypercube_points: A 2-dimensional array of shape N * D 58 | :param half_hypersphere: If True then map the uniform points to the half-hypersphere; 59 | if False then map to the full hypersphere 60 | :return: 61 | """ 62 | 63 | assert 1 <= hypercube_points.ndim <= 2 64 | assert np.all(hypercube_points >= 0) 65 | assert np.all(hypercube_points <= 1) 66 | 67 | n_dim_surface = hypercube_points.shape[-1] 68 | n_dim_embedding = 1+n_dim_surface 69 | if hypercube_points.ndim == 1: 70 | hypercube_points = hypercube_points.reshape(1, -1) 71 | n_points = 1 72 | else: 73 | n_points = hypercube_points.shape[0] 74 | 75 | surface_points = np.zeros((n_dim_embedding, n_points)) 76 | 77 | hypercube_points = hypercube_points.T # easier if 1st index is the dimension 78 | 79 | if n_dim_embedding % 2 == 0: 80 | # even 81 | phi = np.pi * (hypercube_points[0] - 0.5) if half_hypersphere else 2 * np.pi * 
hypercube_points[0] 82 | surface_points[0] = np.cos(phi) 83 | surface_points[1] = np.sin(phi) 84 | 85 | for i in range(1, n_dim_embedding//2): 86 | u = hypercube_points[2*i-1] 87 | h = u ** (1/(2*i)) 88 | surface_points[:2*i] *= h 89 | 90 | sqrt_rho = np.sqrt(np.maximum(0, 1-np.sum(surface_points[:2*i]**2, axis=0))) 91 | phi = 2*np.pi * hypercube_points[2*i] 92 | surface_points[2*i] = sqrt_rho*np.cos(phi) 93 | surface_points[2*i+1] = sqrt_rho*np.sin(phi) 94 | else: 95 | # odd 96 | if half_hypersphere: 97 | surface_points[0] = 1 98 | next_dim = 1 99 | else: 100 | # see https://mathworld.wolfram.com/SpherePointPicking.html 101 | assert n_dim_embedding >= 3 102 | 103 | phi = np.arccos(2 * hypercube_points[0] - 1) 104 | theta = 2 * np.pi * hypercube_points[1] 105 | surface_points[0] = np.sin(phi) * np.cos(theta) 106 | surface_points[1] = np.sin(phi) * np.sin(theta) 107 | surface_points[2] = np.cos(phi) 108 | next_dim = 2 109 | 110 | # # **old algorithm, flawed** 111 | # # in theory x[0] should be the random sign (+/- 1) which would require another 112 | # # random number, but we don't have that available, so generate pseudo-random 113 | # # bits from two sources: the data itself (even/odd bit count) and a bit from 114 | # # a deterministic quasi-random sequence 115 | # pseudo_random_bits_data = 1 * np.array([np.sum(list(struct.pack('!d', value))) % 2 == 0 for value in hypercube_points.flatten()]) 116 | # pseudo_random_bits_data = pseudo_random_bits_data.reshape(hypercube_points.shape) 117 | # pseudo_random_bits_data = np.sum(pseudo_random_bits_data, axis=0) % 2 == 0 118 | # 119 | # r2gen = r2_series_generator(n_dim=1) 120 | # pseudo_random_bits_gen = np.array([next(r2gen)[0] > 0.5 for i in range(hypercube_points.shape[1])]) 121 | # 122 | # pseudo_random_bits = pseudo_random_bits_data ^ pseudo_random_bits_gen 123 | # surface_points[0] = 2*pseudo_random_bits-1 124 | # next_dim = 1 125 | 126 | for i in range(next_dim, (n_dim_embedding + 1) // 2): 127 | u = hypercube_points[2 * i - 2] 128 | h = u ** (1 / (2 * i - 1)) 129 | surface_points[:2 * i - 1] *= h 130 | 131 | sqrt_rho = np.sqrt(np.maximum(0, 1 - np.sum(surface_points[:2 * i - 1] ** 2, axis=0))) 132 | phi = 2 * np.pi * hypercube_points[2 * i - 1] 133 | surface_points[2 * i - 1] = sqrt_rho * np.cos(phi) 134 | surface_points[2 * i] = sqrt_rho * np.sin(phi) 135 | 136 | surface_points = surface_points.squeeze().T 137 | surface_points = (surface_points.T / np.linalg.norm(surface_points, axis=-1)).T # correct numerical round-off errors 138 | 139 | return surface_points 140 | -------------------------------------------------------------------------------- /conda.recipe/conda_build_config.yaml: -------------------------------------------------------------------------------- 1 | python: 2 | - 3.6 3 | - 3.7.1 4 | -------------------------------------------------------------------------------- /conda.recipe/meta.yaml: -------------------------------------------------------------------------------- 1 | 2 | {% set data = load_setup_py_data() %} 3 | 4 | package: 5 | name: bayesian_decision_tree 6 | 7 | version: {{ data['version'] }} 8 | 9 | source: 10 | path: .. 11 | 12 | build: 13 | # If the installation is complex, or different between Unix and Windows, use 14 | # separate bld.bat and build.sh files instead of this key. Add the line 15 | # "skip: True # [py<35]" (for example) to limit to Python 3.5 and newer, or 16 | # "skip: True # [not win]" to limit to Windows. 
17 | script: python setup.py install --single-version-externally-managed --record=record.txt 18 | # if not platform dependent uncomment 19 | # uncomment for entry points to be generated 20 | # entry_points: 21 | # - hadoop_utils = hadoop_utils.cli:cli 22 | 23 | requirements: 24 | build: 25 | - python 26 | - setuptools 27 | run: 28 | - python 29 | {% for dep in data['install_requires'] %} 30 | - {{ dep.lower() }} 31 | {% endfor %} 32 | {# raw is for ignoring templating with cookiecutter, leaving it for use with conda-build #} 33 | 34 | test: 35 | source_files: 36 | - tests 37 | requires: 38 | - pytest 39 | - pytest-cov 40 | - pytest-flake8 41 | # - pytest-mypy 42 | - teamcity-messages 43 | commands: 44 | - pytest 45 | 46 | about: 47 | home: https://github.com/AA42557-QUAD-DS/bayesian_tree 48 | summary: Short description 49 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UBS-IB/bayesian_tree/718aecc68e7ea527380b8e299b4f7d69e86f7400/examples/__init__.py -------------------------------------------------------------------------------- /examples/demo_classification_hyperplane.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import accuracy_score 3 | 4 | from bayesian_decision_tree.classification import HyperplaneClassificationTree 5 | from bayesian_decision_tree.hyperplane_optimization import SimulatedAnnealingOptimizer 6 | from examples import helper 7 | 8 | # demo script for classification (binary or multiclass) using arbitrarily oriented hyperplanes 9 | if __name__ == '__main__': 10 | # proxies (in case you're running this behind a firewall) 11 | args = helper.parse_args() 12 | proxies = { 13 | 'http': args.http_proxy, 14 | 'https': args.https_proxy 15 | } 16 | 17 | # data set: uncomment one of the following sections 18 | 19 | # artificial 4-class data somewhat similar to the Ripley data 20 | # n_train = 500 21 | # n_test = 2000 22 | # x0 = [1, 3, 2, 4] 23 | # x1 = [1, 1, 3, 3] 24 | # sd = 0.7 25 | # X_train = np.zeros((n_train, 2)) 26 | # y_train = np.zeros((n_train, 1)) 27 | # X_test = np.zeros((n_test, 2)) 28 | # y_test = np.zeros((n_test, 1)) 29 | # np.random.seed(666) 30 | # for i in range(4): 31 | # X_train[i * n_train//4:(i + 1) * n_train//4, 0] = np.random.normal(x0[i], sd, n_train//4) 32 | # X_train[i * n_train//4:(i + 1) * n_train//4, 1] = np.random.normal(x1[i], sd, n_train//4) 33 | # y_train[i * n_train//4:(i + 1) * n_train//4] = i 34 | # 35 | # X_test[i * n_test//4:(i + 1) * n_test//4, 0] = np.random.normal(x0[i], sd, n_test//4) 36 | # X_test[i * n_test//4:(i + 1) * n_test//4, 1] = np.random.normal(x1[i], sd, n_test//4) 37 | # y_test[i * n_test//4:(i + 1) * n_test//4] = i 38 | # train = np.hstack((X_train, y_train)) 39 | # test = np.hstack((X_test, y_test)) 40 | 41 | np.random.seed(5) 42 | 43 | n = 10000 44 | X_train = np.random.uniform(0, 4, (n, 2)) 45 | y_train = np.zeros((n, 1)) 46 | y_train[(X_train[:, 0] >= 1) & (X_train[:, 0] < 2) & (X_train[:, 1] <= 3)] = 1 47 | y_train[(X_train[:, 0] >= 2) & (X_train[:, 0] < 3) & (X_train[:, 1] <= 1)] = 1 48 | y_train[(X_train[:, 0] >= 3)] = 1 49 | 50 | angle = 30*np.pi/180 51 | X_train_rot = X_train.copy() 52 | X_train_rot[:, 0] = np.cos(angle)*X_train[:, 0] + np.sin(angle)*X_train[:, 1] 53 | X_train_rot[:, 1] = -np.sin(angle)*X_train[:, 0] + np.cos(angle)*X_train[:, 1] 54 | X_train = 
X_train_rot 55 | 56 | train = np.hstack((X_train, y_train)) 57 | test = train 58 | 59 | # or, alternatively, load a UCI dataset 60 | # train, test = helper.load_ripley(proxies) 61 | 62 | n_classes = len(np.unique(train[:, -1])) 63 | 64 | if train is test: 65 | # perform a 50:50 train:test split if no test data is given 66 | train = train[0::2] 67 | test = test[1::2] 68 | 69 | X_train = train[:, :-1] 70 | y_train = train[:, -1] 71 | X_test = test[:, :-1] 72 | y_test = test[:, -1] 73 | 74 | # prior 75 | prior_pseudo_observations = 100 76 | prior = prior_pseudo_observations * np.ones(n_classes) 77 | 78 | # model 79 | model = HyperplaneClassificationTree( 80 | partition_prior=0.9, 81 | prior=prior, 82 | delta=0, 83 | prune=True, 84 | optimizer=SimulatedAnnealingOptimizer(10, 10, 0.9, 666)) 85 | 86 | model.fit(X_train, y_train) 87 | # train 88 | print(model) 89 | print() 90 | print('Tree depth and number of leaves: {}, {}'.format(model.get_depth(), model.get_n_leaves())) 91 | print('Feature importance:', model.feature_importance()) 92 | 93 | # compute accuracy 94 | y_pred_train = model.predict(X_train) 95 | y_pred_test = model.predict(X_test) 96 | accuracy_train = accuracy_score(y_train, y_pred_train) 97 | accuracy_test = accuracy_score(y_test, y_pred_test) 98 | info_train = 'Train accuracy: {:.4f} %'.format(100 * accuracy_train) 99 | info_test = 'Test accuracy: {:.4f} %'.format(100 * accuracy_test) 100 | print(info_train) 101 | print(info_test) 102 | 103 | # plot if 2D 104 | dimensions = X_train.shape[1] 105 | if dimensions == 2: 106 | helper.plot_2d_hyperplane(model, X_train, y_train, info_train, X_test, y_test, info_test) 107 | -------------------------------------------------------------------------------- /examples/demo_classification_perpendicular.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import accuracy_score 3 | 4 | from bayesian_decision_tree.classification import PerpendicularClassificationTree 5 | from examples import helper 6 | 7 | # demo script for classification (binary or multiclass) using classic, axis-normal splits 8 | if __name__ == '__main__': 9 | # proxies (in case you're running this behind a firewall) 10 | args = helper.parse_args() 11 | proxies = { 12 | 'http': args.http_proxy, 13 | 'https': args.https_proxy 14 | } 15 | 16 | # data set: uncomment one of the following sections 17 | 18 | # artificial 4-class data somewhat similar to the Ripley data 19 | n_train = 500 20 | n_test = 2000 21 | x0 = [1, 3, 2, 4] 22 | x1 = [1, 1, 3, 3] 23 | sd = 0.7 24 | X_train = np.zeros((n_train, 2)) 25 | y_train = np.zeros((n_train, 1)) 26 | X_test = np.zeros((n_test, 2)) 27 | y_test = np.zeros((n_test, 1)) 28 | np.random.seed(666) 29 | for i in range(4): 30 | X_train[i * n_train//4:(i + 1) * n_train//4, 0] = np.random.normal(x0[i], sd, n_train//4) 31 | X_train[i * n_train//4:(i + 1) * n_train//4, 1] = np.random.normal(x1[i], sd, n_train//4) 32 | y_train[i * n_train//4:(i + 1) * n_train//4] = i 33 | 34 | X_test[i * n_test//4:(i + 1) * n_test//4, 0] = np.random.normal(x0[i], sd, n_test//4) 35 | X_test[i * n_test//4:(i + 1) * n_test//4, 1] = np.random.normal(x1[i], sd, n_test//4) 36 | y_test[i * n_test//4:(i + 1) * n_test//4] = i 37 | train = np.hstack((X_train, y_train)) 38 | test = np.hstack((X_test, y_test)) 39 | 40 | # np.random.seed(5) 41 | # 42 | # n = 10000 43 | # X_train = np.random.uniform(0, 4, (n, 2)) 44 | # y_train = np.zeros((n, 1)) 45 | # y_train[(X_train[:, 0] >= 1) & (X_train[:, 0] < 2) & 
(X_train[:, 1] <= 3)] = 1 46 | # y_train[(X_train[:, 0] >= 2) & (X_train[:, 0] < 3) & (X_train[:, 1] <= 1)] = 1 47 | # y_train[(X_train[:, 0] >= 3)] = 1 48 | # 49 | # angle = 30*np.pi/180 50 | # X_train_rot = X_train.copy() 51 | # X_train_rot[:, 0] = np.cos(angle)*X_train[:, 0] + np.sin(angle)*X_train[:, 1] 52 | # X_train_rot[:, 1] = -np.sin(angle)*X_train[:, 0] + np.cos(angle)*X_train[:, 1] 53 | # X_train = X_train_rot 54 | # 55 | # train = np.hstack((X_train, y_train)) 56 | # test = train 57 | 58 | # or, alternatively, load a UCI dataset 59 | # train, test = helper.load_ripley(proxies) 60 | 61 | n_classes = len(np.unique(train[:, -1])) 62 | 63 | if train is test: 64 | # perform a 50:50 train:test split if no test data is given 65 | train = train[0::2] 66 | test = test[1::2] 67 | 68 | X_train = train[:, :-1] 69 | y_train = train[:, -1] 70 | X_test = test[:, :-1] 71 | y_test = test[:, -1] 72 | 73 | # prior 74 | prior_pseudo_observations = 1 75 | prior = prior_pseudo_observations * np.ones(n_classes) 76 | 77 | # model 78 | model = PerpendicularClassificationTree( 79 | partition_prior=0.9, 80 | prior=prior, 81 | delta=0, 82 | prune=True) 83 | 84 | # train 85 | model.fit(X_train, y_train) 86 | print(model) 87 | print() 88 | print('Tree depth and number of leaves: {}, {}'.format(model.get_depth(), model.get_n_leaves())) 89 | print('Feature importance:', model.feature_importance()) 90 | 91 | # compute accuracy 92 | y_pred_train = model.predict(X_train) 93 | y_pred_test = model.predict(X_test) 94 | accuracy_train = accuracy_score(y_train, y_pred_train) 95 | accuracy_test = accuracy_score(y_test, y_pred_test) 96 | info_train = 'Train accuracy: {:.4f} %'.format(100 * accuracy_train) 97 | info_test = 'Test accuracy: {:.4f} %'.format(100 * accuracy_test) 98 | print(info_train) 99 | print(info_test) 100 | 101 | # plot if 1D or 2D 102 | dimensions = X_train.shape[1] 103 | if dimensions == 1: 104 | helper.plot_1d_perpendicular(model, X_train, y_train, info_train, X_test, y_test, info_test) 105 | elif dimensions == 2: 106 | helper.plot_2d_perpendicular(model, X_train, y_train, info_train, X_test, y_test, info_test) 107 | -------------------------------------------------------------------------------- /examples/demo_classification_trading.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from scipy.linalg import expm, inv, eig 4 | from sklearn.metrics import accuracy_score, plot_confusion_matrix 5 | from sklearn.neural_network import MLPClassifier 6 | 7 | from bayesian_decision_tree.classification import PerpendicularClassificationTree 8 | 9 | 10 | def get_covariance(sigma: float, delta: float, theta: np.ndarray) -> np.ndarray: 11 | theta_p = theta + theta.T 12 | return (sigma ** 2.0) * inv(theta_p) * (np.eye(theta.shape[0]) - expm(-theta_p * delta)) 13 | 14 | 15 | def sample_gaussian(n: int, covariance: np.ndarray) -> np.ndarray: 16 | d, v = eig(covariance) 17 | a = np.dot(v, np.diag(np.sqrt(np.real(d)))) 18 | g = np.random.normal(0.0, 1.0, (a.shape[0], n)) 19 | return np.dot(a, g) 20 | 21 | 22 | def sample_mean_reversion(n: int, x0: np.ndarray, mu: np.ndarray, sigma: float, delta: float, 23 | theta: np.ndarray) -> np.ndarray: 24 | if not positive_eigenvalues(theta): 25 | raise AssertionError("Input theta does not have all positive eigenvalues") 26 | covariance = get_covariance(sigma, delta, theta) 27 | if not positive_eigenvalues(covariance): 28 | raise AssertionError("Covariance does not have all positive 
eigenvalues") 29 | gaussian_matrix = sample_gaussian(n, covariance) 30 | sample_paths = np.ndarray(gaussian_matrix.shape) 31 | sample_paths[:, [0]] = x0 32 | exp_theta = expm(-theta * delta) 33 | for i in range(1, sample_paths.shape[1]): 34 | prev = sample_paths[:, [i - 1]] 35 | sample_paths[:, [i]] = mu + np.dot(exp_theta, (prev - mu)) + gaussian_matrix[:, [i - 1]] # exact discretization of the mean-reverting (OU) process 36 | return sample_paths 37 | 38 | 39 | def positive_eigenvalues(theta: np.ndarray) -> bool: 40 | d, v = eig(theta) 41 | return np.all(np.real(d) > 0.0) 42 | 43 | 44 | # demo script for classification (binary or multiclass) using classic, axis-normal splits 45 | if __name__ == '__main__': 46 | np.random.seed(0) 47 | default_font_size = 16 48 | model_type = 'tree' # it can be 'tree' or 'nn' 49 | plt.rc('axes', titlesize=default_font_size) # fontsize of the axes title 50 | plt.rc('axes', labelsize=default_font_size) # fontsize of the x and y labels 51 | plt.rc('xtick', labelsize=default_font_size) # fontsize of the tick labels 52 | plt.rc('ytick', labelsize=default_font_size) # fontsize of the tick labels 53 | plt.rc('legend', fontsize=default_font_size) # legend fontsize 54 | plt.rc('figure', titlesize=default_font_size) # fontsize of the figure title 55 | n = 10_000 56 | n += 1 # used for the deltas 57 | mu = np.array([[100.0], [110.0], [105.0]]) 58 | theta = np.array([[2.0, -0.5, 0.0], [0.2, 1.0, 0.0], [0.0, 0.0, 0.1]]) 59 | dt = 0.1 60 | sigma = 1.0 61 | d = mu.shape[0] 62 | paths = sample_mean_reversion(n, mu, mu, sigma, dt, theta) # start each path at its long-run mean (x0 = mu) 63 | x = paths.T 64 | plt.plot(x) 65 | plt.hlines(mu, 0, n, linestyles=d * ['--'], zorder=100) 66 | plt.title('Stock prices') 67 | plt.legend(['Stock A', 'Stock B', 'Stock C']) 68 | ax = plt.gca() 69 | ax.set_xlim([0, n]) 70 | ax.set_ylim([90, 120]) 71 | plt.savefig('trading_example_prices.png') 72 | plt.show() 73 | 74 | # build class labels from the sign pattern of the next-step price moves (one class per pattern) 75 | y_diff = np.diff(x, axis=0) 76 | x = x[:-1, :] 77 | y = np.dot((np.sign(y_diff) + 1) / 2, np.reshape(2.0 ** np.arange(d), (d, 1))).astype(int) 78 | n_train = int(x.shape[0] * 0.8) 79 | X_train = x[:n_train, :] 80 | y_train = y[:n_train, :] 81 | X_test = x[n_train:, :] 82 | y_test = y[n_train:, :] 83 | y_diff_test = y_diff[n_train:, :] 84 | n_classes = len(np.unique(y)) 85 | 86 | # prior 87 | prior_strength = 1 88 | prior = prior_strength * np.array(n_classes * [1.0]) / n_classes 89 | 90 | # model 91 | if model_type == 'tree': 92 | model = PerpendicularClassificationTree( 93 | partition_prior=0.9, 94 | prior=prior, 95 | delta=0, 96 | prune=False) 97 | elif model_type == 'nn': 98 | model = MLPClassifier( 99 | hidden_layer_sizes=(10, 10), 100 | random_state=0) 101 | else: 102 | raise AssertionError('Model not included ' + model_type) 103 | 104 | # train 105 | model.fit(X_train, y_train) 106 | print(model) 107 | print() 108 | 109 | # compute accuracy 110 | y_pred_train = model.predict(X_train) 111 | y_pred_test = model.predict(X_test) 112 | positions = (2 * (y_pred_test.reshape((y_pred_test.shape[0], 1)) // 2.0 ** np.arange(d).astype(int) % 2) - 1) # decode each predicted class back into +1/-1 positions per stock 113 | accuracy_train = accuracy_score(y_train, y_pred_train) 114 | accuracy_test = accuracy_score(y_test, y_pred_test) 115 | info_train = 'Train accuracy: {:.4f} %'.format(100 * accuracy_train) 116 | info_test = 'Test accuracy: {:.4f} %'.format(100 * accuracy_test) 117 | print(info_train) 118 | print(info_test) 119 | 120 | pnl = np.cumsum(positions * y_diff_test, axis=0) 121 | plt.plot(pnl) 122 | plt.hlines(0, 0, pnl.shape[0]) 123 | ax = plt.gca() 124 | ax.set_xlim([0, 
pnl.shape[0]]) 125 | ax.set_ylim(np.array([-30, 200])) 126 | plt.grid(True) 127 | plt.title('Test period PnL') 128 | plt.legend(['Stock A', 'Stock B', 'Stock C']) 129 | plt.savefig('trading_example_pnl_' + model_type + '.png') 130 | plt.show() 131 | 132 | disp = plot_confusion_matrix(model, X_test, y_test, 133 | display_labels=[''.join( 134 | np.core.defchararray.add(['-' if x < 0 else '+' for x in (2 * row - 1)], 135 | ['A', 'B', 'C'])) for row in 136 | np.reshape(np.arange(2 ** d), (2 ** d, 1)) // 2.0 ** np.arange( 137 | d).astype(int) % 2], 138 | cmap=plt.cm.Blues, 139 | normalize='true') 140 | disp.ax_.set_title('Test period confusion matrix') 141 | plt.xticks(rotation=90) 142 | plt.savefig('trading_example_confusion_matrix_' + model_type + '.png', bbox_inches='tight') 143 | plt.show() 144 | -------------------------------------------------------------------------------- /examples/demo_regression_hyperplane.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import mean_squared_error 3 | 4 | from bayesian_decision_tree.hyperplane_optimization import SimulatedAnnealingOptimizer 5 | from bayesian_decision_tree.regression import HyperplaneRegressionTree 6 | from examples import helper 7 | 8 | # demo script for regression using using arbitrarily oriented hyperplanes 9 | if __name__ == '__main__': 10 | # proxies (in case you're running this behind a firewall) 11 | args = helper.parse_args() 12 | proxies = { 13 | 'http': args.http_proxy, 14 | 'https': args.https_proxy 15 | } 16 | 17 | # data set: uncomment one of the following sections 18 | 19 | # # synthetic sine wave 20 | # X_train = np.linspace(0, 10, 100).reshape(-1, 1) 21 | # y_train = 1 * np.sin(np.linspace(0, 10, 100)).reshape(-1, 1) 22 | # train = np.hstack((X_train, y_train)) 23 | # test = train 24 | 25 | # or, alternatively, load a UCI dataset (where we *regress* on the class labels, i.e., class 1 = 0.0 and class 2 = 1.0) 26 | train, test = helper.load_ripley(proxies) 27 | 28 | n_dim = len(np.unique(train[:, -1])) 29 | 30 | if train is test: 31 | # perform a 50:50 train:test split if no test data is given 32 | train = train[0::2] 33 | test = test[1::2] 34 | 35 | X_train = train[:, :-1] 36 | y_train = train[:, -1] 37 | X_test = test[:, :-1] 38 | y_test = test[:, -1] 39 | 40 | # prior for regression: Normal-Gamma prior, see https://en.wikipedia.org/wiki/Conjugate_prior#Continuous_distributions 41 | mu = y_train.mean() 42 | sd_prior = y_train.std() / 10 43 | prior_pseudo_observations = 10 44 | kappa = prior_pseudo_observations 45 | alpha = prior_pseudo_observations / 2 46 | var_prior = sd_prior**2 47 | tau_prior = 1/var_prior 48 | beta = alpha/tau_prior 49 | prior = np.array([mu, kappa, alpha, beta]) 50 | 51 | # model 52 | model = HyperplaneRegressionTree( 53 | partition_prior=0.9, 54 | prior=prior, 55 | delta=0, 56 | optimizer=SimulatedAnnealingOptimizer(10, 10, 0.9, 666)) 57 | 58 | # train 59 | model.fit(X_train, y_train) 60 | print(model) 61 | print() 62 | print('Tree depth and number of leaves: {}, {}'.format(model.get_depth(), model.get_n_leaves())) 63 | print('Feature importance:', model.feature_importance()) 64 | 65 | # compute RMSE 66 | rmse_train = np.sqrt(mean_squared_error(model.predict(X_train), y_train)) 67 | rmse_test = np.sqrt(mean_squared_error(model.predict(X_test), y_test)) 68 | info_train = 'RMSE train: {:.4f}'.format(rmse_train) 69 | info_test = 'RMSE test: {:.4f}'.format(rmse_test) 70 | print(info_train) 71 | print(info_test) 72 | 73 
| # plot if 2D 74 | dimensions = X_train.shape[1] 75 | if dimensions == 2: 76 | helper.plot_2d_hyperplane(model, X_train, y_train, info_train, X_test, y_test, info_test) 77 | -------------------------------------------------------------------------------- /examples/demo_regression_perpendicular.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import mean_squared_error 3 | 4 | from bayesian_decision_tree.regression import PerpendicularRegressionTree 5 | from examples import helper 6 | 7 | # demo script for regression using classic, axis-normal splits 8 | if __name__ == '__main__': 9 | # proxies (in case you're running this behind a firewall) 10 | args = helper.parse_args() 11 | proxies = { 12 | 'http': args.http_proxy, 13 | 'https': args.https_proxy 14 | } 15 | 16 | # data set: uncomment one of the following sections 17 | 18 | # # synthetic sine wave 19 | # X_train = np.linspace(0, 10, 100).reshape(-1, 1) 20 | # y_train = 1 * np.sin(np.linspace(0, 10, 100)).reshape(-1, 1) 21 | # train = np.hstack((X_train, y_train)) 22 | # test = train 23 | 24 | # or, alternatively, load a UCI dataset (where we *regress* on the class labels, i.e., class 1 = 0.0 and class 2 = 1.0) 25 | train, test = helper.load_ripley(proxies) 26 | 27 | n_dim = len(np.unique(train[:, -1])) 28 | 29 | if train is test: 30 | # perform a 50:50 train:test split if no test data is given 31 | train = train[0::2] 32 | test = test[1::2] 33 | 34 | X_train = train[:, :-1] 35 | y_train = train[:, -1] 36 | X_test = test[:, :-1] 37 | y_test = test[:, -1] 38 | 39 | # prior for regression: Normal-Gamma prior, see https://en.wikipedia.org/wiki/Conjugate_prior#Continuous_distributions 40 | mu = y_train.mean() 41 | sd_prior = y_train.std() / 10 42 | prior_pseudo_observations = 1 43 | kappa = prior_pseudo_observations 44 | alpha = prior_pseudo_observations / 2 45 | var_prior = sd_prior**2 46 | tau_prior = 1/var_prior 47 | beta = alpha/tau_prior 48 | prior = np.array([mu, kappa, alpha, beta]) 49 | 50 | # model 51 | model = PerpendicularRegressionTree( 52 | partition_prior=0.9, 53 | prior=prior, 54 | delta=0) 55 | 56 | # train 57 | model.fit(X_train, y_train) 58 | print(model) 59 | print() 60 | print('Tree depth and number of leaves: {}, {}'.format(model.get_depth(), model.get_n_leaves())) 61 | print('Feature importance:', model.feature_importance()) 62 | 63 | # compute RMSE 64 | rmse_train = np.sqrt(mean_squared_error(model.predict(X_train), y_train)) 65 | rmse_test = np.sqrt(mean_squared_error(model.predict(X_test), y_test)) 66 | info_train = 'RMSE train: {:.4f}'.format(rmse_train) 67 | info_test = 'RMSE test: {:.4f}'.format(rmse_test) 68 | print(info_train) 69 | print(info_test) 70 | 71 | # plot if 1D or 2D 72 | dimensions = X_train.shape[1] 73 | if dimensions == 1: 74 | helper.plot_1d_perpendicular(model, X_train, y_train, info_train, X_test, y_test, info_test) 75 | elif dimensions == 2: 76 | helper.plot_2d_perpendicular(model, X_train, y_train, info_train, X_test, y_test, info_test) 77 | -------------------------------------------------------------------------------- /examples/helper.py: -------------------------------------------------------------------------------- 1 | """ 2 | A collection of publicly available data sets to test classification models on, 3 | plus some helper functions for plotting. 
4 | """ 5 | import argparse 6 | import io 7 | from dataclasses import dataclass 8 | 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | import pandas as pd 12 | import requests 13 | from matplotlib import patches 14 | from sklearn.preprocessing import LabelBinarizer 15 | 16 | 17 | def parse_args(): 18 | """Parse input arguments from the command line 19 | :return: the result from the ArgumentParser 20 | """ 21 | parser = argparse.ArgumentParser( 22 | description="Run demo of binary classification") 23 | 24 | parser.add_argument( 25 | '--http_proxy', 26 | action='store', 27 | required=False, 28 | help='HTTP Proxy', 29 | default=None) 30 | 31 | parser.add_argument( 32 | '--https_proxy', 33 | action='store', required=False, 34 | help='HTTPS Proxy', 35 | default=None) 36 | 37 | return parser.parse_args() 38 | 39 | 40 | def one_hot_encode(data, columns): 41 | columns = sorted(set(columns))[::-1] 42 | 43 | def ensure_matrix(x): 44 | return x if x.ndim == 2 else np.array(x).reshape(-1, 1) 45 | 46 | for c in columns: 47 | one_hot = LabelBinarizer().fit_transform(data[:, c]) 48 | data = np.hstack(( 49 | ensure_matrix(data[:, :c]), 50 | ensure_matrix(one_hot), 51 | ensure_matrix(data[:, c+1:]) 52 | )) 53 | 54 | return data 55 | 56 | 57 | def load_credit(proxies): 58 | content = requests.get( 59 | 'https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls', 60 | proxies=proxies).content 61 | df = pd.read_excel(io.BytesIO(content)) 62 | train = df.iloc[1:, 1:].values.astype(np.float64) 63 | train = one_hot_encode(train, [2, 3]) # one-hot encode categorical features 64 | test = train 65 | return train, test 66 | 67 | 68 | def load_dermatology(proxies): 69 | # Dermatology 70 | text = requests.get('https://archive.ics.uci.edu/ml/machine-learning-databases/dermatology/dermatology.data', proxies=proxies).text 71 | lines = text.split('\n') 72 | lines = [line for line in lines if '?' 
not in line] 73 | train = np.vstack([np.fromstring(lines[i], sep=',') for i in range(len(lines)-1)]) 74 | train[:, -1] -= 1 75 | test = train 76 | return train, test 77 | 78 | 79 | def load_diabetic(proxies): 80 | # Diabetic Retinopathy 81 | text = requests.get('https://archive.ics.uci.edu/ml/machine-learning-databases/00329/messidor_features.arff', proxies=proxies).text 82 | text = text[text.index('@data'):] 83 | lines = text.split('\n')[1:] 84 | train = np.vstack([np.fromstring(lines[i], sep=',') for i in range(len(lines)-1)]) 85 | test = train 86 | return train, test 87 | 88 | 89 | def load_eeg(proxies): 90 | # load EEG eye data 91 | text = requests.get('https://archive.ics.uci.edu/ml/machine-learning-databases/00264/EEG%20Eye%20State.arff', proxies=proxies).text 92 | text = text[text.index('@DATA'):] 93 | lines = text.split('\n')[1:] 94 | train = np.vstack([np.fromstring(lines[i], sep=',') for i in range(len(lines)-1)]) 95 | test = train 96 | return train, test 97 | 98 | 99 | def load_gamma(proxies): 100 | # load MAGIC Gamma telescope data 101 | text = requests.get('https://archive.ics.uci.edu/ml/machine-learning-databases/magic/magic04.data', proxies=proxies).text 102 | text = text.replace('g', '0').replace('h', '1') 103 | lines = text.split('\n') 104 | train = np.vstack([np.fromstring(lines[i], sep=',') for i in range(len(lines)-1)]) 105 | test = train 106 | return train, test 107 | 108 | 109 | def load_glass(proxies): 110 | # load glass identificaion data 111 | text = requests.get('https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data', proxies=proxies).text 112 | lines = text.split('\n') 113 | train = np.vstack([np.fromstring(lines[i], sep=',') for i in range(len(lines)-1)]) 114 | train = train[:, 1:] # ignore ID row 115 | train[:, -1] -= 1 # convert 1..7 to 0..6 116 | train[np.where(train[:, -1] >= 4)[0], -1] -= 1 # skip missing class 117 | test = train 118 | return train, test 119 | 120 | 121 | def load_haberman(proxies): 122 | # load Haberman's dataset 123 | text = requests.get( 124 | 'https://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data', 125 | proxies=proxies).text 126 | lines = text.split('\n') 127 | train = np.vstack([np.fromstring(lines[i], sep=',') for i in range(len(lines)-1)]) 128 | train[:, -1] -= 1 129 | test = train 130 | return train, test 131 | 132 | 133 | def load_heart(proxies): 134 | text = requests.get( 135 | 'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/heart/heart.dat', proxies=proxies).text 136 | lines = text.split('\n') 137 | train = np.vstack([np.fromstring(lines[i], sep=' ') for i in range(len(lines)-1)]) 138 | train = one_hot_encode(train, [2, 6, 12]) # one-hot encode categorical features 139 | train[:, -1] -= 1 140 | test = train 141 | return train, test 142 | 143 | 144 | def load_ripley(proxies): 145 | # load Ripley's synthetic dataset 146 | def parse_ripley(text): 147 | lines = text.split('\n')[1:] 148 | return np.vstack([np.fromstring(lines[i], sep=' ') for i in range(len(lines)-1)]) 149 | train = parse_ripley(requests.get('https://www.stats.ox.ac.uk/pub/PRNN/synth.tr', proxies=proxies).text) 150 | test = parse_ripley(requests.get('https://www.stats.ox.ac.uk/pub/PRNN/synth.te', proxies=proxies).text) 151 | return train, test 152 | 153 | 154 | def load_seeds(proxies): 155 | # load wheat seeds dataset 156 | def parse_ripley(text): 157 | lines = text.split('\n') 158 | return np.vstack([np.fromstring(lines[i], sep=' ') for i in range(len(lines)-1)]) 159 | train = 
parse_ripley(requests.get('https://archive.ics.uci.edu/ml/machine-learning-databases/00236/seeds_dataset.txt', proxies=proxies).text) 160 | train[:, -1] -= 1 161 | test = train 162 | return train, test 163 | 164 | 165 | def load_seismic(proxies): 166 | # load seismic bumps dataset 167 | text = requests.get('https://archive.ics.uci.edu/ml/machine-learning-databases/00266/seismic-bumps.arff', proxies=proxies).text 168 | text = text[text.index('@data'):] 169 | text = text.replace('a', '0').replace('b', '1').replace('c', '2').replace('d', '3') 170 | text = text.replace('N', '0').replace('W', '1') 171 | lines = text.split('\n')[1:] 172 | train = np.vstack([np.fromstring(lines[i], sep=',') for i in range(len(lines)-1)]) 173 | test = train 174 | return train, test 175 | 176 | 177 | def plot_1d_perpendicular(root, X_train, y_train, info_train, X_test, y_test, info_test): 178 | plt.figure(figsize=[10, 16], dpi=75) 179 | plt.subplot(211) 180 | plt.plot(X_train[:, 0], y_train, 'o-') 181 | plt.title(info_train) 182 | draw_node_1d_perpendicular(root, bounds=(X_train[:, 0].min(), X_train[:, 0].max())) 183 | plt.xlabel('x0') 184 | plt.ylabel('x1') 185 | plt.legend() 186 | plt.gca().set_aspect(1) 187 | 188 | plt.subplot(212) 189 | plt.plot(X_test[:, 0], y_test, 'o-') 190 | draw_node_1d_perpendicular(root, bounds=(X_test[:, 0].min(), X_test[:, 0].max())) 191 | plt.title(info_test) 192 | plt.xlabel('x0') 193 | plt.ylabel('x1') 194 | plt.legend() 195 | plt.gca().set_aspect(1) 196 | 197 | plt.show() 198 | 199 | 200 | def plot_2d_perpendicular(root, X_train, y_train, info_train, X_test, y_test, info_test): 201 | plt.figure(figsize=[10, 16], dpi=75) 202 | 203 | n_classes = int(y_train.max()) + 1 204 | colormap = plt.get_cmap('gist_rainbow') 205 | 206 | def plot(X, y, info): 207 | for i in range(n_classes)[::-1]: 208 | class_i = y == i 209 | plt.plot(X[np.where(class_i)[0], 0], 210 | X[np.where(class_i)[0], 1], 211 | 'o', 212 | ms=4, 213 | c=colormap(i/n_classes), 214 | label='Class {}'.format(i), 215 | alpha=0.5) 216 | 217 | bounds = ((X[:, 0].min(), X[:, 0].max()), (X[:, 1].min(), X[:, 1].max())) 218 | draw_node_2d_perpendicular(root, bounds, colormap, n_classes) 219 | plt.title(info) 220 | plt.xlabel('x0') 221 | plt.ylabel('x1') 222 | plt.legend() 223 | 224 | plt.subplot(211) 225 | plot(X_train, y_train, info_train) 226 | plt.gca().set_aspect(1) 227 | 228 | plt.subplot(212) 229 | plot(X_test, y_test, info_test) 230 | plt.gca().set_aspect(1) 231 | 232 | plt.show() 233 | 234 | 235 | def draw_node_2d_perpendicular(node, bounds, colormap, n_classes): 236 | if node.is_leaf(): 237 | x = bounds[0][0] 238 | y = bounds[1][0] 239 | w = bounds[0][1] - x 240 | h = bounds[1][1] - y 241 | 242 | mean = node._compute_posterior_mean() 243 | if not node.is_regression: 244 | mean = (np.arange(len(mean)) * mean).sum() 245 | 246 | plt.gca().add_patch(patches.Rectangle((x, y), w, h, color=colormap(mean/n_classes), alpha=0.1, linewidth=0)) 247 | else: 248 | draw_node_2d_perpendicular(node.child1_, compute_child_bounds_2d_perpendicular(bounds, node, True), colormap, n_classes) 249 | draw_node_2d_perpendicular(node.child2_, compute_child_bounds_2d_perpendicular(bounds, node, False), colormap, n_classes) 250 | 251 | 252 | def compute_child_bounds_2d_perpendicular(bounds, parent, lower): 253 | b = bounds[parent.split_dimension_] 254 | b = (b[0], min(b[1], parent.split_value_)) if lower else (max(b[0], parent.split_value_), b[1]) 255 | return (b, bounds[1]) if parent.split_dimension_ == 0 else (bounds[0], b) 256 | 257 | 258 | def 
compute_child_bounds_1d_perpendicular(bounds, parent, lower): 259 | b = bounds 260 | b = (b[0], min(b[1], parent.split_value_)) if lower else (max(b[0], parent.split_value_), b[1]) 261 | return b 262 | 263 | 264 | def draw_node_1d_perpendicular(node, bounds): 265 | if node.is_leaf(): 266 | x0 = bounds[0] 267 | x1 = bounds[1] 268 | 269 | mean = node._compute_posterior_mean() 270 | # alpha = np.abs(mean-0.5) 271 | # alpha = max(0.1, alpha) # make sure very faint colors become visibly colored 272 | # color = color0 if mean < 0.5 else color1 273 | plt.plot([x0, x1], [mean, mean], 'r') 274 | else: 275 | draw_node_1d_perpendicular(node.child1_, compute_child_bounds_1d_perpendicular(bounds, node, True)) 276 | draw_node_1d_perpendicular(node.child2_, compute_child_bounds_1d_perpendicular(bounds, node, False)) 277 | 278 | 279 | class Line: 280 | def __init__(self, p0, p1): 281 | if p0[0] > p1[0]: 282 | p1, p0 = p0, p1 283 | 284 | self.p0 = np.asarray(p0) 285 | self.p1 = np.asarray(p1) 286 | 287 | def intersect(self, other): 288 | da = self.p1-self.p0 289 | ma = da[1]/da[0] 290 | 291 | db = other.p1-other.p0 292 | mb = db[1]/db[0] 293 | 294 | x0a = self.p0[0] 295 | x1a = self.p1[0] 296 | x0b = other.p0[0] 297 | x1b = other.p1[0] 298 | y0a = self.p0[1] 299 | y0b = other.p0[1] 300 | 301 | x = (y0a-y0b + mb*x0b-ma*x0a) / (mb-ma) 302 | y = y0a + ma*(x-x0a) 303 | 304 | if x0a <= x <= x1a and x0b <= x <= x1b: 305 | return np.array([x, y]) 306 | else: 307 | return None 308 | 309 | def plot(self, *args, **kwargs): 310 | plt.plot([self.p0[0], self.p1[0]], [self.p0[1], self.p1[1]], *args, **kwargs) 311 | 312 | def __str__(self): 313 | return f'{self.p0} -> {self.p1}' 314 | 315 | 316 | @dataclass 317 | class Parent: 318 | line: Line 319 | origin: np.ndarray 320 | normal: np.ndarray 321 | side: str 322 | 323 | 324 | # plots the root node split and all child nodes recursively 325 | def plot_root(root, X, y, title, cmap): 326 | plt.title(title) 327 | 328 | plt.plot(X[y == 0, 0], X[y == 0, 1], 'b.', ms=3) 329 | plt.plot(X[y == 1, 0], X[y == 1, 1], 'r.', ms=3) 330 | 331 | x_min = X[:, 0].min() 332 | x_max = X[:, 0].max() 333 | y_min = X[:, 1].min() 334 | y_max = X[:, 1].max() 335 | 336 | top = Line([x_min, y_max], [x_max, y_max]) 337 | bottom = Line([x_min, y_min], [x_max, y_min]) 338 | 339 | def plot_node(node, node_vs_color={}, level=0, parents=[], side=None): 340 | if node.best_hyperplane_origin_ is None: 341 | return 342 | 343 | # pick an arbitrary origin and get the normal 344 | origin = node.best_hyperplane_origin_ 345 | normal = node.best_hyperplane_normal_ 346 | 347 | # construct line segment 348 | m = -normal[0]/normal[1] 349 | y0 = origin[1] + m*(x_min-origin[0]) 350 | y1 = origin[1] + m*(x_max-origin[0]) 351 | 352 | # raw line without intersections 353 | line = Line([x_min, y0], [x_max, y1]) 354 | 355 | # intersect with parents 356 | for parent in parents: 357 | p = line.intersect(parent.line) 358 | if p is not None: 359 | # determine side of line to keep 360 | activation0 = np.dot(line.p0 - parent.origin, parent.normal) 361 | 362 | if (parent.side == 'L' and activation0 > 0) or (parent.side == 'R' and activation0 < 0): 363 | line = Line(line.p0, p) 364 | else: 365 | line = Line(p, line.p1) 366 | 367 | # intersect with top/bottom 368 | p = line.intersect(top) 369 | if p is not None: 370 | if y0 > y_max: 371 | line = Line(p, line.p1) 372 | else: 373 | line = Line(line.p0, p) 374 | 375 | p = line.intersect(bottom) 376 | if p is not None: 377 | if y0 < y_min: 378 | line = Line(p, line.p1) 379 | else: 380 | 
line = Line(line.p0, p) 381 | 382 | # generate line name 383 | if side is not None: 384 | side_name = ' - '.join(f'{parents[i].side}{level-len(parents)+i+1}' for i in range(len(parents))) 385 | else: 386 | side_name = '' 387 | 388 | side_name = 'Root' if len(side_name) == 0 else 'Root - ' + side_name 389 | 390 | # make sure node colors don't change 391 | if id(node) not in node_vs_color: 392 | color = cmap(len(node_vs_color)) 393 | node_vs_color[id(node)] = color 394 | else: 395 | color = node_vs_color[id(node)] 396 | 397 | # compute line width as a function of the stiffness 398 | stiffness = np.linalg.norm(normal) 399 | lw = 2 # 100/stiffness 400 | 401 | line.plot(color=color, label=side_name, lw=lw, alpha=0.7) 402 | 403 | if node.child1_: 404 | plot_node(node.child1_, node_vs_color, level+1, parents=parents + [Parent(line, origin, normal, 'L')], side='L') 405 | 406 | if node.child2_: 407 | plot_node(node.child2_, node_vs_color, level+1, parents=parents + [Parent(line, origin, normal, 'R')], side='R') 408 | 409 | plot_node(root) 410 | 411 | 412 | def plot_2d_hyperplane(root, X_train, y_train, info_train, X_test, y_test, info_test): 413 | plt.figure(figsize=[10, 16], dpi=75) 414 | 415 | n_classes = int(y_train.max()) + 1 416 | colormap = plt.get_cmap('gist_rainbow') 417 | 418 | x_min = min(X_train[:, 0].min(), X_test[:, 0].min()) 419 | x_max = max(X_train[:, 0].max(), X_test[:, 0].max()) 420 | y_min = min(X_train[:, 1].min(), X_test[:, 1].min()) 421 | y_max = max(X_train[:, 1].max(), X_test[:, 1].max()) 422 | 423 | def plot(X, y, info): 424 | for i in range(n_classes): 425 | class_i = y == i 426 | plt.plot(X[np.where(class_i)[0], 0], 427 | X[np.where(class_i)[0], 1], 428 | 'o', 429 | ms=4, 430 | c=colormap(i/n_classes), 431 | label='Class {}'.format(i)) 432 | 433 | plot_root(root, X, y, info, plt.get_cmap('tab20')) 434 | 435 | plt.title(info) 436 | plt.xlabel('x0') 437 | plt.ylabel('x1') 438 | plt.legend() 439 | 440 | plt.subplot(211) 441 | plot(X_train, y_train, info_train) 442 | plt.xlim((x_min, x_max)) 443 | plt.ylim((y_min, y_max)) 444 | plt.gca().set_aspect(1) 445 | 446 | plt.subplot(212) 447 | plot(X_test, y_test, info_test) 448 | plt.xlim((x_min, x_max)) 449 | plt.ylim((y_min, y_max)) 450 | plt.gca().set_aspect(1) 451 | 452 | plt.show() 453 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy-darts.hadoop_utils._version] 2 | ignore_errors=True 3 | [mypy-versioneer] 4 | ignore_errors=True 5 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 100 3 | ignore = E122,E123,E126,E127,E128,E731,E722 4 | exclude = build,bayesian_decision_tree/_version.py,tests,conda.recipe,.git,versioneer.py,benchmarks,.asv 5 | 6 | [tool:pytest] 7 | norecursedirs= .* *.egg* build dist conda.recipe 8 | addopts = 9 | --junitxml=junit.xml 10 | --ignore setup.py 11 | --ignore run_test.py 12 | --cov-report term-missing 13 | --tb native 14 | --strict 15 | --durations=20 16 | # --mypy 17 | --flake8 18 | --cov=bayesian_decision_tree 19 | env = 20 | PYTHONHASHSEED=0 21 | markers = 22 | serial: execute test serially (to avoid race conditions) 23 | 24 | [versioneer] 25 | VCS = git 26 | style = pep440-pre 27 | versionfile_source = bayesian_decision_tree/_version.py 28 | versionfile_build = bayesian_decision_tree/_version.py 29 | 
tag_prefix = ver- 30 | parentdir_prefix = bayesian_decision_tree 31 | 32 | [bdist_wheel] 33 | universal=1 34 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2018-2019 UBS AG 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from setuptools import setup, find_packages 18 | from os import path 19 | import versioneer 20 | 21 | 22 | here = path.abspath(path.dirname(__file__)) 23 | with open(path.join(here, "README.md")) as l: 24 | long_description = l.read() 25 | 26 | requirements = [ 27 | 'matplotlib>=2.2.*', 28 | 'scipy>=1.2.*', 29 | 'numpy>=1.13.*', 30 | 'pandas>=0.23.*', 31 | 'requests==2.21.0', 32 | 'scikit-learn>=0.19.*', 33 | ] 34 | 35 | setup( 36 | name='bayesian-decision-tree', 37 | version=versioneer.get_version(), 38 | description='An implementation of the paper: A Bayesian Tree Algorithm by Nuti et al.', 39 | long_description=long_description, 40 | long_description_content_type="text/markdown", 41 | url='https://github.com/UBS-IB/bayesian_tree', 42 | author='UBS SDL Data Science', 43 | author_email='dl-frc-sdl-datascience@ubs.com', 44 | classifiers=[ 45 | 'Development Status :: 3 - Alpha', 46 | 'Intended Audience :: Developers', 47 | 'Natural Language :: English', 48 | 'Programming Language :: Python :: 3.5', 49 | 'License :: OSI Approved :: Apache License Version 2.0', 50 | "Operating System :: OS Independent", 51 | ], 52 | packages=find_packages(exclude=['contrib', 'docs', 'tests']), 53 | install_requires=requirements, 54 | ) 55 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UBS-IB/bayesian_tree/718aecc68e7ea527380b8e299b4f7d69e86f7400/tests/__init__.py -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UBS-IB/bayesian_tree/718aecc68e7ea527380b8e299b4f7d69e86f7400/tests/unit/__init__.py -------------------------------------------------------------------------------- /tests/unit/helper.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from scipy.optimize._differentialevolution import DifferentialEvolutionSolver 3 | from scipy.sparse import csc_matrix, csr_matrix 4 | 5 | from bayesian_decision_tree.classification import PerpendicularClassificationTree, HyperplaneClassificationTree 6 | from bayesian_decision_tree.hyperplane_optimization import ScipyOptimizer, RandomTwoPointOptimizer 7 | from bayesian_decision_tree.hyperplane_optimization import SimulatedAnnealingOptimizer, RandomHyperplaneOptimizer 8 | from bayesian_decision_tree.regression import 
PerpendicularRegressionTree, HyperplaneRegressionTree 9 | 10 | # possible data matrix types/transforms that need to work for fit() 11 | data_matrix_transforms = [ 12 | lambda X: X, 13 | lambda X: csc_matrix(X), 14 | lambda X: csr_matrix(X), 15 | lambda X: pd.DataFrame(data=X, columns=['col-{}'.format(i) for i in range(len(X[0]))]), 16 | ] 17 | 18 | 19 | # classification tree models in all flavours 20 | def create_classification_trees(prior, partition_prior, prune=False): 21 | return [ 22 | PerpendicularClassificationTree(partition_prior, prior, prune=prune), 23 | HyperplaneClassificationTree(partition_prior, prior, delta=0, prune=prune), 24 | HyperplaneClassificationTree(partition_prior, prior, delta=0, prune=prune, optimizer=ScipyOptimizer(DifferentialEvolutionSolver, 666)), 25 | HyperplaneClassificationTree(partition_prior, prior, delta=0, prune=prune, optimizer=RandomTwoPointOptimizer(100, 666)), 26 | HyperplaneClassificationTree(partition_prior, prior, delta=0, prune=prune, optimizer=RandomHyperplaneOptimizer(100, 666)), 27 | HyperplaneClassificationTree(partition_prior, prior, delta=0, prune=prune, optimizer=SimulatedAnnealingOptimizer(10, 10, 0.9, 666)), 28 | ] 29 | 30 | 31 | # regression tree models in all flavours 32 | def create_regression_trees(prior, partition_prior): 33 | return [ 34 | PerpendicularRegressionTree(partition_prior, prior), 35 | HyperplaneRegressionTree(partition_prior, prior), 36 | HyperplaneRegressionTree(partition_prior, prior, optimizer=ScipyOptimizer(DifferentialEvolutionSolver, 666)), 37 | HyperplaneRegressionTree(partition_prior, prior, optimizer=RandomHyperplaneOptimizer(100, 666)), 38 | HyperplaneRegressionTree(partition_prior, prior, optimizer=SimulatedAnnealingOptimizer(10, 10, 0.9, 666)), 39 | ] 40 | -------------------------------------------------------------------------------- /tests/unit/test_classification.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from numpy.random import normal, randint 6 | from numpy.testing import assert_array_equal, assert_array_almost_equal 7 | 8 | from bayesian_decision_tree.classification import PerpendicularClassificationTree 9 | from tests.unit.helper import data_matrix_transforms, create_classification_trees 10 | 11 | 12 | class ClassificationTreeTest(TestCase): 13 | def test_cannot_fit_with_bad_dimensions(self): 14 | np.random.seed(6666) 15 | for good_X in [normal(0, 1, [10, 10])]: 16 | for bad_y in [randint(0, 2, []), randint(0, 2, [10, 10]), randint(0, 2, [11]), randint(0, 2, [10, 10, 10])]: 17 | for model in create_classification_trees(np.array([1, 1]), 0.5): 18 | try: 19 | model.fit(good_X, bad_y) 20 | self.fail() 21 | except ValueError: 22 | pass 23 | 24 | for bad_X in [normal(0, 1, [10, 10, 10])]: 25 | for good_y in [randint(0, 2, [10])]: 26 | for model in create_classification_trees(np.array([1, 1]), 0.5): 27 | try: 28 | model.fit(bad_X, good_y) 29 | self.fail() 30 | except ValueError: 31 | pass 32 | 33 | def test_cannot_predict_before_training(self): 34 | for model in create_classification_trees(np.array([1, 1]), 0.5): 35 | # can't predict yet 36 | try: 37 | model.predict([]) 38 | self.fail() 39 | except ValueError: 40 | pass 41 | 42 | # can't predict probability yet 43 | try: 44 | model.predict_proba([]) 45 | self.fail() 46 | except ValueError: 47 | pass 48 | 49 | def test_cannot_predict_with_bad_input_dimensions(self): 50 | for data_matrix_transform in data_matrix_transforms: 51 | 
for model in create_classification_trees(np.array([1, 1]), 0.5): 52 | Xy = np.array([ 53 | [0.0, 0.0, 0], 54 | [0.0, 1.0, 1], 55 | [1.0, 1.0, 0], 56 | [1.0, 0.0, 1], 57 | [1.0, 0.0, 0], 58 | ]) 59 | X = Xy[:, :-1] 60 | y = Xy[:, -1] 61 | 62 | X = data_matrix_transform(X) 63 | 64 | print('Testing {}'.format(type(model).__name__)) 65 | model.fit(X, y) 66 | print(model) 67 | 68 | model.predict([[0, 0]]) 69 | 70 | try: 71 | model.predict(0) 72 | self.fail() 73 | except ValueError: 74 | pass 75 | 76 | try: 77 | model.predict([0]) 78 | self.fail() 79 | except ValueError: 80 | pass 81 | 82 | try: 83 | model.predict([0, 0, 0]) 84 | self.fail() 85 | except ValueError: 86 | pass 87 | 88 | def test_print_empty_model(self): 89 | for model in create_classification_trees(np.array([1, 1]), 0.5): 90 | print(model) 91 | 92 | def test_no_split(self): 93 | for data_matrix_transform in data_matrix_transforms: 94 | for model in create_classification_trees(np.array([1, 1]), 0.5): 95 | Xy = np.array([ 96 | [0.0, 0, 0], 97 | [0.0, 1, 1], 98 | [1.0, 2, 0], 99 | [1.0, 3, 1], 100 | [1.0, 4, 0], 101 | ]) 102 | X = Xy[:, :-1] 103 | y = Xy[:, -1] 104 | 105 | X = data_matrix_transform(X) 106 | 107 | print('Testing {}'.format(type(model).__name__)) 108 | model.fit(X, y) 109 | print(model) 110 | 111 | self.assertEqual(model.get_depth(), 0) 112 | self.assertEqual(model.get_n_leaves(), 1) 113 | self.assertEqual(model.n_data_, 5) 114 | 115 | self.assertIsNone(model.child1_) 116 | self.assertIsNone(model.child2_) 117 | 118 | if isinstance(model, PerpendicularClassificationTree): 119 | self.assertEqual(model.split_dimension_, -1) 120 | self.assertEqual(model.split_value_, None) 121 | else: 122 | self.assertEqual(model.best_hyperplane_origin_, None) 123 | self.assertEqual(model.best_hyperplane_normal_, None) 124 | 125 | expected = np.array([0, 0, 0, 0]) 126 | self.assertEqual(model.predict([[0, 0]]), expected[0]) 127 | self.assertEqual(model.predict([[0, 1]]), expected[1]) 128 | self.assertEqual(model.predict([[1, 0]]), expected[2]) 129 | self.assertEqual(model.predict([[1, 1]]), expected[3]) 130 | 131 | for data_matrix_transform2 in data_matrix_transforms: 132 | assert_array_equal(model.predict(data_matrix_transform2([[0, 0], [0, 1], [1, 0], [1, 1]])), expected) 133 | 134 | expected = np.array([[4/7, 3/7], [4/7, 3/7], [4/7, 3/7], [4/7, 3/7], ]) 135 | assert_array_almost_equal(model.predict_proba([[0, 0]]), np.expand_dims(expected[0], 0)) 136 | assert_array_almost_equal(model.predict_proba([[0, 1]]), np.expand_dims(expected[1], 0)) 137 | assert_array_almost_equal(model.predict_proba([[1, 0]]), np.expand_dims(expected[2], 0)) 138 | assert_array_almost_equal(model.predict_proba([[1, 1]]), np.expand_dims(expected[3], 0)) 139 | 140 | for data_matrix_transform2 in data_matrix_transforms: 141 | assert_array_almost_equal(model.predict_proba(data_matrix_transform2([[0, 0], [0, 1], [1, 0], [1, 1]])), expected) 142 | 143 | if isinstance(model, PerpendicularClassificationTree): 144 | # TODO: also add for hyperplane version 145 | expected_paths = [ 146 | [], 147 | [], 148 | [], 149 | [], 150 | ] 151 | self.assertEqual(model.prediction_paths([[0, 0]]), [expected_paths[0]]) 152 | self.assertEqual(model.prediction_paths([[0, 1]]), [expected_paths[1]]) 153 | self.assertEqual(model.prediction_paths([[1, 0]]), [expected_paths[2]]) 154 | self.assertEqual(model.prediction_paths([[1, 1]]), [expected_paths[3]]) 155 | 156 | for data_matrix_transform2 in data_matrix_transforms: 157 | self.assertEqual(model.prediction_paths(data_matrix_transform2([[0, 
0], [0, 1], [1, 0], [1, 1]])), expected_paths) 158 | 159 | def test_one_split(self): 160 | for data_matrix_transform in data_matrix_transforms: 161 | for model in create_classification_trees(np.array([1, 1]), 0.7): 162 | Xy = np.array([ 163 | [0.0, 0, 0], 164 | [0.1, 1, 0], 165 | 166 | [0.9, 0, 1], 167 | [1.0, 1, 1], 168 | ]) 169 | X = Xy[:, :-1] 170 | y = Xy[:, -1] 171 | 172 | X = data_matrix_transform(X) 173 | 174 | print('Testing {}'.format(type(model).__name__)) 175 | model.fit(X, y) 176 | print(model) 177 | 178 | self.assertEqual(model.get_depth(), 1) 179 | self.assertEqual(model.get_n_leaves(), 2) 180 | self.assertEqual(model.n_data_, 4) 181 | 182 | self.assertIsNotNone(model.child1_) 183 | self.assertIsNone(model.child1_.child1_) 184 | self.assertIsNone(model.child1_.child2_) 185 | self.assertEqual(model.child1_.n_data_, 2) 186 | 187 | self.assertIsNotNone(model.child2_) 188 | self.assertIsNone(model.child2_.child1_) 189 | self.assertIsNone(model.child2_.child2_) 190 | self.assertEqual(model.child1_.n_data_, 2) 191 | 192 | if isinstance(model, PerpendicularClassificationTree): 193 | self.assertEqual(model.split_dimension_, 0) 194 | self.assertEqual(model.split_value_, 0.5) 195 | else: 196 | self.assertTrue(0.1 < model.best_hyperplane_origin_[0] < 0.9) 197 | 198 | expected = np.array([0, 0, 1, 1]) 199 | self.assertEqual(model.predict([[0, 0]]), expected[0]) 200 | self.assertEqual(model.predict([[0, 1]]), expected[1]) 201 | self.assertEqual(model.predict([[1, 0]]), expected[2]) 202 | self.assertEqual(model.predict([[1, 1]]), expected[3]) 203 | 204 | for data_matrix_transform2 in data_matrix_transforms: 205 | assert_array_equal(model.predict(data_matrix_transform2([[0, 0], [0, 1], [1, 0], [1, 0]])), expected) 206 | 207 | expected = np.array([[3/4, 1/4], [3/4, 1/4], [1/4, 3/4], [1/4, 3/4]]) 208 | assert_array_almost_equal(model.predict_proba([[0, 0]]), np.expand_dims(expected[0], 0)) 209 | assert_array_almost_equal(model.predict_proba([[0, 1]]), np.expand_dims(expected[1], 0)) 210 | assert_array_almost_equal(model.predict_proba([[1, 0]]), np.expand_dims(expected[2], 0)) 211 | assert_array_almost_equal(model.predict_proba([[1, 1]]), np.expand_dims(expected[3], 0)) 212 | 213 | for data_matrix_transform2 in data_matrix_transforms: 214 | assert_array_almost_equal(model.predict_proba(data_matrix_transform2([[0, 0], [0, 1], [1, 0], [1, 0]])), expected) 215 | 216 | def test_two_splits(self): 217 | for data_matrix_transform in data_matrix_transforms: 218 | for model in create_classification_trees(np.array([1, 1]), 0.9, prune=True): 219 | Xy = np.array([ 220 | [0.0, 0.0, 0], 221 | [0.1, 1.0, 0], 222 | [0.2, 0.01, 0], 223 | [0.3, 0.99, 0], 224 | 225 | [0.7, 0.02, 1], 226 | [0.8, 0.98, 1], 227 | [0.9, 0.03, 1], 228 | [1.0, 0.97, 1], 229 | 230 | [2.0, 0.04, 0], 231 | [2.1, 0.96, 0], 232 | ]) 233 | X = Xy[:, :-1] 234 | y = Xy[:, -1] 235 | 236 | X = data_matrix_transform(X) 237 | 238 | print('Testing {}'.format(type(model).__name__)) 239 | model.fit(X, y) 240 | print(model) 241 | 242 | if isinstance(model, PerpendicularClassificationTree): 243 | self.assertEqual(model.get_depth(), 2) 244 | self.assertEqual(model.get_n_leaves(), 3) 245 | self.assertEqual(model.n_data_, 10) 246 | 247 | self.assertIsNotNone(model.child1_) 248 | self.assertEqual(model.child1_.n_data_, 4) 249 | self.assertIsNone(model.child1_.child1_) 250 | self.assertIsNone(model.child1_.child2_) 251 | 252 | self.assertIsNotNone(model.child2_) 253 | self.assertEqual(model.child2_.n_data_, 6) 254 | 
self.assertIsNotNone(model.child2_.child1_) 255 | self.assertEqual(model.child2_.child1_.n_data_, 4) 256 | self.assertIsNotNone(model.child2_.child2_) 257 | self.assertEqual(model.child2_.child2_.n_data_, 2) 258 | 259 | self.assertIsNone(model.child2_.child1_.child1_) 260 | self.assertIsNone(model.child2_.child1_.child2_) 261 | self.assertIsNone(model.child2_.child2_.child1_) 262 | self.assertIsNone(model.child2_.child2_.child2_) 263 | 264 | self.assertEqual(model.split_dimension_, 0) 265 | self.assertEqual(model.split_value_, 0.5) 266 | 267 | self.assertEqual(model.child2_.split_dimension_, 0) 268 | self.assertEqual(model.child2_.split_value_, 1.5) 269 | else: 270 | self.assertEqual(model.get_depth(), 2) 271 | self.assertEqual(model.get_n_leaves(), 3) 272 | self.assertEqual(model.n_data_, 10) 273 | 274 | self.assertTrue(0.3 < model.best_hyperplane_origin_[0] < 0.7) 275 | if model.child1_.best_hyperplane_origin_ is not None: 276 | self.assertTrue(1.0 < model.child1_.best_hyperplane_origin_[0] < 2.0) 277 | self.assertEqual(model.child1_.n_data_, 6) 278 | self.assertEqual(model.child2_.n_data_, 4) 279 | else: 280 | self.assertTrue(1.0 < model.child2_.best_hyperplane_origin_[0] < 2.0) 281 | self.assertEqual(model.child1_.n_data_, 4) 282 | self.assertEqual(model.child2_.n_data_, 6) 283 | 284 | expected = np.array([0, 0, 1, 1, 0, 0]) 285 | self.assertEqual(model.predict([[0, 0.5]]), expected[0]) 286 | self.assertEqual(model.predict([[0.4, 0.5]]), expected[1]) 287 | self.assertEqual(model.predict([[0.6, 0.5]]), expected[2]) 288 | self.assertEqual(model.predict([[1.4, 0.5]]), expected[3]) 289 | self.assertEqual(model.predict([[1.6, 0.5]]), expected[4]) 290 | self.assertEqual(model.predict([[100, 0.5]]), expected[5]) 291 | 292 | for data_matrix_transform2 in data_matrix_transforms: 293 | assert_array_equal(model.predict(data_matrix_transform2( 294 | [[0.0, 0.5], [0.4, 0.5], [0.6, 0.5], [1.4, 0.5], [1.6, 0.5], [100, 0.5]]) 295 | ), expected) 296 | 297 | expected = np.array([[5/6, 1/6], [5/6, 1/6], [1/6, 5/6], [1/6, 5/6], [3/4, 1/4], [3/4, 1/4]]) 298 | assert_array_almost_equal(model.predict_proba([[0, 0.5]]), np.expand_dims(expected[0], 0)) 299 | assert_array_almost_equal(model.predict_proba([[0.4, 0.5]]), np.expand_dims(expected[1], 0)) 300 | assert_array_almost_equal(model.predict_proba([[0.6, 0.5]]), np.expand_dims(expected[2], 0)) 301 | assert_array_almost_equal(model.predict_proba([[1.4, 0.5]]), np.expand_dims(expected[3], 0)) 302 | assert_array_almost_equal(model.predict_proba([[1.6, 0.5]]), np.expand_dims(expected[4], 0)) 303 | assert_array_almost_equal(model.predict_proba([[100, 0.5]]), np.expand_dims(expected[5], 0)) 304 | 305 | for data_matrix_transform2 in data_matrix_transforms: 306 | assert_array_equal(model.predict_proba(data_matrix_transform2( 307 | [[0.0, 0.5], [0.4, 0.5], [0.6, 0.5], [1.4, 0.5], [1.6, 0.5], [100, 0.5]]) 308 | ), expected) 309 | 310 | if isinstance(model, PerpendicularClassificationTree): 311 | # TODO: also add for hyperplane version 312 | feature_names = X.columns if isinstance(X, pd.DataFrame) else ['x{}'.format(i) for i in range(X.shape[1])] 313 | expected_paths = [ 314 | [(0, feature_names[0], 0.5, False)], 315 | [(0, feature_names[0], 0.5, False)], 316 | [(0, feature_names[0], 0.5, True), (0, feature_names[0], 1.5, False)], 317 | [(0, feature_names[0], 0.5, True), (0, feature_names[0], 1.5, False)], 318 | [(0, feature_names[0], 0.5, True), (0, feature_names[0], 1.5, True)], 319 | [(0, feature_names[0], 0.5, True), (0, feature_names[0], 1.5, True)], 320 | ] 321 
| self.assertEqual(model.prediction_paths([[0, 0.5]]), [expected_paths[0]]) 322 | self.assertEqual(model.prediction_paths([[0.4, 0.5]]), [expected_paths[1]]) 323 | self.assertEqual(model.prediction_paths([[0.6, 0.5]]), [expected_paths[2]]) 324 | self.assertEqual(model.prediction_paths([[1.4, 0.5]]), [expected_paths[3]]) 325 | self.assertEqual(model.prediction_paths([[1.6, 0.5]]), [expected_paths[4]]) 326 | self.assertEqual(model.prediction_paths([[100, 0.5]]), [expected_paths[5]]) 327 | 328 | for data_matrix_transform2 in data_matrix_transforms: 329 | self.assertEqual(model.prediction_paths(data_matrix_transform2( 330 | [[0.0, 0.5], [0.4, 0.5], [0.6, 0.5], [1.4, 0.5], [1.6, 0.5], [100, 0.5]]) 331 | ), expected_paths) 332 | 333 | def test_prune(self): 334 | for model_no_prune, model_prune in zip( 335 | create_classification_trees(np.array([10, 10]), 0.9, prune=False), 336 | create_classification_trees(np.array([10, 10]), 0.9, prune=True)): 337 | np.random.seed(666) 338 | 339 | X = np.vstack([ 340 | normal(0, 1, [100, 2]), 341 | normal(10, 1, [100, 2]), 342 | normal(14, 1, [100, 2]), 343 | ]) 344 | y = np.hstack([ 345 | 0 * np.ones(100), 346 | 1 * np.ones(100), 347 | np.minimum(1, randint(0, 3, 100)), # about two thirds should be 1's 348 | ]) 349 | 350 | # make sure model_no_prune finds two splits at 5 and 12 and that model_prune 351 | # only finds one (because everything >= 5 has target 1) 352 | model_no_prune.fit(X, y) 353 | model_prune.fit(X, y) 354 | self.assertEqual(model_no_prune.get_depth(), 2) 355 | self.assertEqual(model_no_prune.get_n_leaves(), 3) 356 | self.assertEqual(model_prune.get_depth(), 1) 357 | self.assertEqual(model_prune.get_n_leaves(), 2) 358 | 359 | # now make sure the node that is the result of pruning two children is consistent 360 | c1 = model_no_prune.child2_.child1_ 361 | c2 = model_no_prune.child2_.child2_ 362 | c12 = model_prune.child2_ 363 | assert_array_equal(c12.posterior_, c1.posterior_ + c2.posterior_ - c12.prior) 364 | 365 | def test_feature_importance_consistency_when_mirroring_along_axes(self): 366 | np.random.seed(42) 367 | 368 | n = 200 369 | X0 = np.zeros((n, 2)) 370 | sd = 3 371 | X0[0*n//4:1*n//4] = np.random.normal([2, 2], sd, (n//4, 2)) 372 | X0[1*n//4:2*n//4] = np.random.normal([-2, 1], sd, (n//4, 2)) 373 | X0[2*n//4:3*n//4] = np.random.normal([-2, -1], sd, (n//4, 2)) 374 | X0[3*n//4:4*n//4] = np.random.normal([-2, -2], sd, (n//4, 2)) 375 | 376 | y = np.zeros(n) 377 | y[0*n//4:1*n//4] = 1 378 | y[2*n//4:3*n//4] = 1 379 | 380 | for m1, m2, m3, m4 in zip( 381 | create_classification_trees(np.array([1, 1]), 0.99, prune=True), 382 | create_classification_trees(np.array([1, 1]), 0.99, prune=True), 383 | create_classification_trees(np.array([1, 1]), 0.99, prune=True), 384 | create_classification_trees(np.array([1, 1]), 0.99, prune=True)): 385 | 386 | X1 = np.vstack((+X0[:, 0], +X0[:, 1])).T 387 | X2 = np.vstack((+X0[:, 0], -X0[:, 1])).T 388 | X3 = np.vstack((-X0[:, 0], +X0[:, 1])).T 389 | X4 = np.vstack((-X0[:, 0], -X0[:, 1])).T 390 | 391 | print('Testing {}'.format(type(m1).__name__)) 392 | 393 | m1.fit(X1, y) 394 | m2.fit(X2, y) 395 | m3.fit(X3, y) 396 | m4.fit(X4, y) 397 | 398 | fi1 = m1.feature_importance() 399 | fi2 = m2.feature_importance() 400 | fi3 = m3.feature_importance() 401 | fi4 = m4.feature_importance() 402 | 403 | self.assertTrue(np.all(fi1 != 0)) 404 | assert_array_almost_equal(fi1, fi2, decimal=1) 405 | assert_array_almost_equal(fi1, fi3, decimal=1) 406 | assert_array_almost_equal(fi1, fi4, decimal=1) 407 | 
assert_array_almost_equal(fi2, fi3, decimal=1) 408 | assert_array_almost_equal(fi2, fi4, decimal=1) 409 | assert_array_almost_equal(fi3, fi4, decimal=1) 410 | -------------------------------------------------------------------------------- /tests/unit/test_regression.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | import numpy as np 4 | from numpy.testing import assert_array_equal 5 | from sklearn.metrics import mean_squared_error 6 | 7 | from bayesian_decision_tree.regression import PerpendicularRegressionTree 8 | from tests.unit.helper import data_matrix_transforms, create_regression_trees 9 | 10 | 11 | class RegressionTreeTest(TestCase): 12 | def test_cannot_predict_before_training(self): 13 | mu = 0 14 | sd_prior = 1 15 | prior_obs = 0.01 16 | kappa = prior_obs 17 | alpha = prior_obs/2 18 | var_prior = sd_prior**2 19 | tau_prior = 1/var_prior 20 | beta = alpha/tau_prior 21 | 22 | prior = np.array([mu, kappa, alpha, beta]) 23 | 24 | for model in create_regression_trees(prior, 0.5): 25 | # can't predict yet 26 | try: 27 | model.predict([]) 28 | self.fail() 29 | except ValueError: 30 | pass 31 | 32 | def test_cannot_predict_with_bad_input_dimensions(self): 33 | mu = 0 34 | sd_prior = 1 35 | prior_obs = 0.01 36 | kappa = prior_obs 37 | alpha = prior_obs/2 38 | var_prior = sd_prior**2 39 | tau_prior = 1/var_prior 40 | beta = alpha/tau_prior 41 | 42 | prior = np.array([mu, kappa, alpha, beta]) 43 | 44 | for data_matrix_transform in data_matrix_transforms: 45 | for model in create_regression_trees(prior, 0.5): 46 | Xy = np.array([ 47 | [0.0, 0.0, 0.1], 48 | [0.0, 1.0, 1.0], 49 | [1.0, 1.0, 0.1], 50 | [1.0, 0.0, 1.0], 51 | [1.0, 0.0, 0.1], 52 | ]) 53 | X = Xy[:, :-1] 54 | y = Xy[:, -1] 55 | 56 | X = data_matrix_transform(X) 57 | 58 | print('Testing {}'.format(type(model).__name__)) 59 | model.fit(X, y) 60 | print(model) 61 | 62 | model.predict([[0, 0]]) 63 | 64 | try: 65 | model.predict(0) 66 | self.fail() 67 | except ValueError: 68 | pass 69 | 70 | try: 71 | model.predict([0]) 72 | self.fail() 73 | except ValueError: 74 | pass 75 | 76 | try: 77 | model.predict([0, 0, 0]) 78 | self.fail() 79 | except ValueError: 80 | pass 81 | 82 | def test_print_empty_model(self): 83 | for model in create_regression_trees(np.array([1, 1]), 0.5): 84 | print(model) 85 | 86 | def test_no_split(self): 87 | for data_matrix_transform in data_matrix_transforms: 88 | mu = 0 89 | sd_prior = 1 90 | prior_obs = 0.01 91 | kappa = prior_obs 92 | alpha = prior_obs/2 93 | var_prior = sd_prior**2 94 | tau_prior = 1/var_prior 95 | beta = alpha/tau_prior 96 | 97 | prior = np.array([mu, kappa, alpha, beta]) 98 | 99 | for model in create_regression_trees(prior, 0.5): 100 | Xy = np.array([ 101 | [0.0, 0.0, 0], 102 | [0.1, 0.1, 1.3], 103 | [0.9, 0.9, 0], 104 | [1.0, 1.0, 1.2], 105 | [1.0, 1.0, 0], 106 | ]) 107 | X = Xy[:, :-1] 108 | y = Xy[:, -1] 109 | 110 | X = data_matrix_transform(X) 111 | 112 | print('Testing {}'.format(type(model).__name__)) 113 | model.fit(X, y) 114 | print(model) 115 | 116 | self.assertEqual(model.get_depth(), 0) 117 | self.assertEqual(model.get_n_leaves(), 1) 118 | self.assertEqual(model.n_data_, 5) 119 | 120 | self.assertIsNone(model.child1_) 121 | self.assertIsNone(model.child2_) 122 | 123 | if isinstance(model, PerpendicularRegressionTree): 124 | self.assertEqual(model.split_dimension_, -1) 125 | self.assertEqual(model.split_value_, None) 126 | else: 127 | self.assertEqual(model.best_hyperplane_origin_, None) 128 | 129 | n = 
len(y) 130 | mean = y.mean() 131 | mu, kappa, alpha, beta = prior 132 | kappa_post = kappa + n 133 | mu_post = (kappa*mu + n*mean) / kappa_post 134 | 135 | expected = np.array([mu_post, mu_post, mu_post, mu_post]) 136 | self.assertEqual(model.predict([[0.0, 0.5]]), np.expand_dims(expected[0], 0)) 137 | self.assertEqual(model.predict([[0.49, 0.5]]), np.expand_dims(expected[1], 0)) 138 | self.assertEqual(model.predict([[0.51, 0.5]]), np.expand_dims(expected[2], 0)) 139 | self.assertEqual(model.predict([[1.0, 0.5]]), np.expand_dims(expected[3], 0)) 140 | 141 | for data_matrix_transform2 in data_matrix_transforms: 142 | assert_array_equal(model.predict(data_matrix_transform2([[0.0, 0.5], [0.49, 0.5], [0.51, 0.5], [1.0, 0.5]])), expected) 143 | 144 | def test_decreasing_mse_for_increased_partition_prior(self): 145 | for data_matrix_transform in data_matrix_transforms: 146 | mu = 0 147 | sd_prior = 1 148 | prior_obs = 0.01 149 | kappa = prior_obs 150 | alpha = prior_obs/2 151 | var_prior = sd_prior**2 152 | tau_prior = 1/var_prior 153 | beta = alpha/tau_prior 154 | 155 | prior = np.array([mu, kappa, alpha, beta]) 156 | 157 | x = np.linspace(-np.pi/2, np.pi/2, 20) 158 | y = np.linspace(-np.pi/2, np.pi/2, 20) 159 | X = np.array([x, y]).T 160 | y = np.sin(x) + 3*np.cos(y) 161 | 162 | X = data_matrix_transform(X) 163 | 164 | for i_model in range(len(create_regression_trees(prior, 0.5))): 165 | mse_list = [] 166 | for partition_prior in [0.1, 0.5, 0.9, 0.99]: 167 | model = create_regression_trees(prior, partition_prior)[i_model] 168 | print('Testing {}'.format(type(model).__name__)) 169 | model.fit(X, y) 170 | print(model) 171 | mse = mean_squared_error(y, model.predict(X)) 172 | mse_list.append(mse) 173 | 174 | self.assertTrue(mse_list[-1] < mse_list[0]) 175 | for i in range(0, len(mse_list)-1): 176 | self.assertTrue(mse_list[i+1] <= mse_list[i]) 177 | -------------------------------------------------------------------------------- /tests/unit/test_utils.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from unittest import TestCase 3 | 4 | import numpy as np 5 | from numpy.testing import assert_almost_equal 6 | 7 | from bayesian_decision_tree.utils import hypercube_to_hypersphere_surface 8 | 9 | 10 | class UtilsTest(TestCase): 11 | def test_hypercube_to_hypersphere_surface_2D_full_single_point(self): 12 | hc = np.array([0.2, 0.9]) 13 | hs = hypercube_to_hypersphere_surface(hc, half_hypersphere=False) 14 | 15 | # check dimensionality and norms 16 | self.assertEqual(hs.ndim, 1) 17 | self.assertEqual(hs.shape, (3,)) 18 | assert_almost_equal(np.linalg.norm(hs), 1) 19 | 20 | def test_hypercube_to_hypersphere_surface_1D_full(self): 21 | n_points = 11 22 | hc = np.linspace(0, 1, n_points).reshape(-1, 1) 23 | hs = hypercube_to_hypersphere_surface(hc, half_hypersphere=False) 24 | 25 | # check dimensionality and norms 26 | self.assertEqual(hs.ndim, 2) 27 | self.assertEqual(hs.shape, (n_points, 2)) 28 | assert_almost_equal(np.linalg.norm(hs, axis=1), 1) 29 | 30 | # check uniformity 31 | expected_cos = np.dot(hs[0], hs[1]) 32 | for i in range(1, n_points): 33 | cos = np.dot(hs[i-1], hs[i]) 34 | assert_almost_equal(cos, expected_cos) 35 | 36 | cos = np.dot(hs[0], hs[-2]) 37 | assert_almost_equal(cos, expected_cos) 38 | 39 | cos = np.dot(hs[0], hs[-1]) 40 | assert_almost_equal(cos, 1.0) 41 | 42 | def test_hypercube_to_hypersphere_surface_1D_half(self): 43 | n_points = 11 44 | hc = np.linspace(0, 1, n_points).reshape(-1, 1) 45 | hs = 
hypercube_to_hypersphere_surface(hc, half_hypersphere=True) 46 | 47 | # check dimensionality and norms 48 | self.assertEqual(hs.ndim, 2) 49 | self.assertEqual(hs.shape, (n_points, 2)) 50 | assert_almost_equal(np.linalg.norm(hs, axis=1), 1) 51 | 52 | # check uniformity 53 | expected_cos = np.dot(hs[0], hs[1]) 54 | for i in range(1, n_points): 55 | cos = np.dot(hs[i-1], hs[i]) 56 | assert_almost_equal(cos, expected_cos) 57 | 58 | cos = np.dot(hs[0], hs[-2]) 59 | assert_almost_equal(cos, -expected_cos) 60 | 61 | cos = np.dot(hs[0], hs[-1]) 62 | assert_almost_equal(cos, -1.0) 63 | 64 | def test_hypercube_to_hypersphere_surface_2D_full(self): 65 | n_points_per_dim = 1000 66 | n_points = n_points_per_dim**2 67 | grid = np.linspace(0, 1, n_points_per_dim) 68 | x, y = np.meshgrid(grid, grid) 69 | hc = np.array([x.flatten(), y.flatten()]).T 70 | hs = hypercube_to_hypersphere_surface(hc, half_hypersphere=False) 71 | 72 | # check dimensionality and norms 73 | self.assertEqual(hs.ndim, 2) 74 | self.assertEqual(hs.shape, (n_points, 2+1)) 75 | assert_almost_equal(np.linalg.norm(hs, axis=1), 1) 76 | 77 | # make sure all quadrants contain approximately the same number of data points 78 | tolerance_fraction = 0.01 79 | for quadrant_signs in itertools.product([-1, 1], [-1, 1], [-1, 1]): 80 | in_quadrant = np.all(hs * quadrant_signs > 0, axis=1).sum() 81 | min = n_points / 2**(2+1) * (1-tolerance_fraction) 82 | max = n_points / 2**(2+1) * (1+tolerance_fraction) 83 | msg = f'Expected a value between {min:.0f} and {max:.0f}, but was {in_quadrant}' 84 | self.assertTrue(min <= np.sum(in_quadrant) <= max, msg=msg) 85 | 86 | def test_hypercube_to_hypersphere_surface_2D_half(self): 87 | n_points_per_dim = 1000 88 | n_points = n_points_per_dim**2 89 | grid = np.linspace(0, 1, n_points_per_dim) 90 | x, y = np.meshgrid(grid, grid) 91 | hc = np.array([x.flatten(), y.flatten()]).T 92 | hs = hypercube_to_hypersphere_surface(hc, half_hypersphere=True) 93 | 94 | # check dimensionality and norms 95 | self.assertEqual(hs.ndim, 2) 96 | self.assertEqual(hs.shape, (n_points, 2+1)) 97 | assert_almost_equal(np.linalg.norm(hs, axis=1), 1) 98 | 99 | # make sure all quadrants contain approximately the same number of data points 100 | tolerance_fraction = 0.01 101 | for quadrant_signs in itertools.product([-1, 1], [-1, 1], [-1, 1]): 102 | in_quadrant = np.all(hs * quadrant_signs > 0, axis=1).sum() 103 | if quadrant_signs[0] == -1: 104 | self.assertEqual(np.sum(in_quadrant), 0) 105 | else: 106 | min = n_points / 2**2 * (1-tolerance_fraction) 107 | max = n_points / 2**2 * (1+tolerance_fraction) 108 | msg = f'Expected a value between {min:.0f} and {max:.0f} in quadrant {quadrant_signs}, but was {in_quadrant}' 109 | self.assertTrue(min <= np.sum(in_quadrant) <= max, msg) 110 | 111 | def test_hypercube_to_hypersphere_surface_5D_full(self): 112 | n_points = 1_000_000 113 | np.random.seed(666) 114 | hc = np.random.uniform(0, 1, (n_points, 5)) 115 | hs = hypercube_to_hypersphere_surface(hc, half_hypersphere=False) 116 | # hs = np.random.normal(0, 1, hs.shape) 117 | 118 | # check dimensionality and norms 119 | self.assertEqual(hs.ndim, 2) 120 | self.assertEqual(hs.shape, (n_points, 5+1)) 121 | assert_almost_equal(np.linalg.norm(hs, axis=1), 1) 122 | 123 | # make sure all quadrants contain approximately the same number of data points 124 | tolerance_fraction = 0.02 125 | for quadrant_signs in itertools.product(*list(np.tile([-1, 1], (5+1, 1)))): 126 | in_quadrant = np.all(hs * quadrant_signs > 0, axis=1).sum() 127 | min = n_points / 2**(5+1) * 
(1-tolerance_fraction) 128 | max = n_points / 2**(5+1) * (1+tolerance_fraction) 129 | msg = f'Expected a value between {min:.0f} and {max:.0f}, but was {in_quadrant}' 130 | self.assertTrue(min <= np.sum(in_quadrant) <= max, msg=msg) 131 | 132 | def test_hypercube_to_hypersphere_surface_5D_half(self): 133 | n_points = 1_000_000 134 | np.random.seed(666) 135 | hc = np.random.uniform(0, 1, (n_points, 5)) 136 | hs = hypercube_to_hypersphere_surface(hc, half_hypersphere=True) 137 | 138 | # check dimensionality and norms 139 | self.assertEqual(hs.ndim, 2) 140 | self.assertEqual(hs.shape, (n_points, 5+1)) 141 | assert_almost_equal(np.linalg.norm(hs, axis=1), 1) 142 | 143 | # make sure all quadrants contain approximately the same number of data points 144 | tolerance_fraction = 0.01 145 | for quadrant_signs in itertools.product(*list(np.tile([-1, 1], (5+1, 1)))): 146 | in_quadrant = np.all(hs * quadrant_signs > 0, axis=1).sum() 147 | if quadrant_signs[0] == -1: 148 | self.assertEqual(np.sum(in_quadrant), 0) 149 | else: 150 | min = n_points / 2**5 * (1-tolerance_fraction) 151 | max = n_points / 2**5 * (1+tolerance_fraction) 152 | msg = f'Expected a value between {min:.0f} and {max:.0f} in quadrant {quadrant_signs}, but was {in_quadrant}' 153 | self.assertTrue(min <= np.sum(in_quadrant) <= max, msg) 154 | 155 | def test_hypercube_to_hypersphere_surface_6D_full(self): 156 | n_points = 1_000_000 157 | np.random.seed(666) 158 | hc = np.random.uniform(0, 1, (n_points, 6)) 159 | hs = hypercube_to_hypersphere_surface(hc, half_hypersphere=False) 160 | # hs = np.random.normal(0, 1, hs.shape) 161 | 162 | # check dimensionality and norms 163 | self.assertEqual(hs.ndim, 2) 164 | self.assertEqual(hs.shape, (n_points, 6+1)) 165 | assert_almost_equal(np.linalg.norm(hs, axis=1), 1) 166 | 167 | # make sure all quadrants contain approximately the same number of data points 168 | tolerance_fraction = 0.03 169 | for quadrant_signs in itertools.product(*list(np.tile([-1, 1], (6+1, 1)))): 170 | in_quadrant = np.all(hs * quadrant_signs > 0, axis=1).sum() 171 | min = n_points / 2**(6+1) * (1-tolerance_fraction) 172 | max = n_points / 2**(6+1) * (1+tolerance_fraction) 173 | msg = f'Expected a value between {min:.0f} and {max:.0f}, but was {in_quadrant}' 174 | self.assertTrue(min <= np.sum(in_quadrant) <= max, msg=msg) 175 | 176 | def test_hypercube_to_hypersphere_surface_6D_half(self): 177 | n_points = 1_000_000 178 | np.random.seed(666) 179 | hc = np.random.uniform(0, 1, (n_points, 6)) 180 | hs = hypercube_to_hypersphere_surface(hc, half_hypersphere=True) 181 | 182 | # check dimensionality and norms 183 | self.assertEqual(hs.ndim, 2) 184 | self.assertEqual(hs.shape, (n_points, 6+1)) 185 | assert_almost_equal(np.linalg.norm(hs, axis=1), 1) 186 | 187 | # make sure all quadrants contain approximately the same number of data points 188 | tolerance_fraction = 0.03 189 | for quadrant_signs in itertools.product(*list(np.tile([-1, 1], (6+1, 1)))): 190 | in_quadrant = np.all(hs * quadrant_signs > 0, axis=1).sum() 191 | if quadrant_signs[0] == -1: 192 | self.assertEqual(np.sum(in_quadrant), 0) 193 | else: 194 | min = n_points / 2**6 * (1-tolerance_fraction) 195 | max = n_points / 2**6 * (1+tolerance_fraction) 196 | msg = f'Expected a value between {min:.0f} and {max:.0f} in quadrant {quadrant_signs}, but was {in_quadrant}' 197 | self.assertTrue(min <= np.sum(in_quadrant) <= max, msg) 198 | --------------------------------------------------------------------------------
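A minimal sketch, not part of the repository, showing how the regression tests above build the Normal-Gamma prior array [mu, kappa, alpha, beta] and the closed-form posterior mean that test_no_split asserts against. The helper names (make_normal_gamma_prior, posterior_mean) are hypothetical; the arithmetic is copied from the test setup.

import numpy as np

def make_normal_gamma_prior(mu=0.0, sd_prior=1.0, prior_obs=0.01):
    # hypothetical helper mirroring the setup repeated in test_regression.py
    kappa = prior_obs              # pseudo-observation count for the mean
    alpha = prior_obs / 2          # pseudo-observation count for the variance (halved)
    var_prior = sd_prior ** 2
    tau_prior = 1 / var_prior
    beta = alpha / tau_prior       # chosen so the prior variance equals sd_prior**2
    return np.array([mu, kappa, alpha, beta])

def posterior_mean(prior, y):
    # closed-form update used in test_no_split:
    #   kappa_post = kappa + n
    #   mu_post    = (kappa*mu + n*mean(y)) / kappa_post
    mu, kappa, alpha, beta = prior
    n = len(y)
    kappa_post = kappa + n
    return (kappa * mu + n * y.mean()) / kappa_post

prior = make_normal_gamma_prior()
y = np.array([0.0, 1.3, 0.0, 1.2, 0.0])   # targets from test_no_split
print(posterior_mean(prior, y))           # the value all four predictions are expected to equal

With the weak prior (prior_obs = 0.01) this lands very close to the sample mean 0.5, which is why the test can check all four query points against the same expected value.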
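A small usage sketch for hypercube_to_hypersphere_surface, assuming only what the tests above exercise: points in the unit hypercube [0, 1]^d are mapped to unit vectors on the surface of a (d+1)-dimensional hypersphere, and with half_hypersphere=True the half with a negative first coordinate stays empty. The seed and point count here are illustrative, not taken from the tests.

import numpy as np
from bayesian_decision_tree.utils import hypercube_to_hypersphere_surface

np.random.seed(0)
hc = np.random.uniform(0, 1, (1000, 5))                       # points in [0, 1]^5
hs = hypercube_to_hypersphere_surface(hc, half_hypersphere=False)

print(hs.shape)                                               # expected: (1000, 6)
print(np.allclose(np.linalg.norm(hs, axis=1), 1.0))           # every row should be a unit vector

# with half_hypersphere=True no point should fall in a quadrant whose first
# sign is -1, which is what the *_half tests above verify quadrant by quadrant
hs_half = hypercube_to_hypersphere_surface(hc, half_hypersphere=True)
print((hs_half[:, 0] >= 0).all())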