├── .gitignore
├── LICENSE
├── README.md
├── examples
├── Dirichlet.ipynb
├── MVN (Cholesky).ipynb
├── MVN (LRA).ipynb
└── MVT (Cholesky).ipynb
├── pyboostlss
├── __init__.py
├── datasets
│ ├── __init__.py
│ ├── arcticlake.csv
│ ├── data_loader.py
│ ├── sim_triv_gaussian.csv
│ └── sim_triv_student.csv
├── distributions
│ ├── DIRICHLET.py
│ ├── MVN.py
│ ├── MVN_LRA.py
│ ├── MVT.py
│ ├── __init__.py
│ └── distribution_loss_metric.py
├── model.py
└── utils.py
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Pycharm
2 | .idea/
3 | dist/
4 | latex_distributions
5 |
6 | # Byte-compiled / optimized / DLL files
7 | __pycache__/
8 |
9 |
10 | # Jupyter Notebook
11 | .ipynb_checkpoints
12 |
13 | # General
14 | Lib/
15 | Scripts/
16 | pyvenv.cfg
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Py-BoostLSS: An extension of Py-Boost to probabilistic modelling
2 |
3 | We present a probabilistic extension of the recently introduced [Py-Boost](https://github.com/sb-ai-lab/Py-Boost) approach and model all moments of a parametric multivariate distribution as functions of covariates. This allows us to create probabilistic predictions from which intervals and quantiles of interest can be derived.
4 |
5 | ## Motivation
6 |
7 | Existing implementations of Gradient Boosting Machines, such as [XGBoost](https://github.com/dmlc/xgboost) and [LightGBM](https://github.com/microsoft/LightGBM), are mostly designed for single-target regression tasks. While efficient for low to medium target-dimensions, the computational cost of estimating them becomes prohibitive in high-dimensional settings.
8 |
9 | As an example, consider modelling a multivariate Gaussian distribution with `D=100` target variables, where the covariance matrix is approximated using the Cholesky-Decomposition. Modelling all conditional moments (i.e., means, standard-deviations and all pairwise correlations) requires estimation of `D(D + 3)/2 = 5,150` parameters. Because most GBM implementations are based on a *one vs. all estimation strategy*, where a separate tree is grown for each parameter, estimating this many parameters for a large dataset can become computationally extremely expensive.
10 |
11 | The recently introduced [Py-Boost](https://github.com/sb-ai-lab/Py-Boost) approach provides a more runtime efficient GBM implementation, making it a good candidate for estimating high-dimensional target variables in a probabilistic setting. Borrowing from the original paper [SketchBoost: Fast Gradient Boosted Decision Tree for Multioutput Problems](https://openreview.net/forum?id=WSxarC8t-T), the following figure illustrates the runtime-efficiency of the Py-Boost model.
12 |
13 |
14 |
15 |
16 |
17 | Even though the original implementation of Py-Boost also supports estimation of univariate responses, Py-BoostLSS focuses on multi-target probabilistic regression settings. For univariate probabilistic GBMs, we refer to our implementations of [XGBoostLSS](https://github.com/StatMixedML/XGBoostLSS) and [LightGBMLSS](https://github.com/StatMixedML/LightGBMLSS).
18 |
19 | ## Installation
20 |
21 | Since Py-BoostLSS is entirely GPU-based, we first need to install the corresponding PyTorch and CuPy packages. If you are on Windows, it is preferable to install CuPy via conda. On all other operating systems, you can use pip. You can check your CUDA version with `nvcc --version`.
22 |
23 | ```bash
24 | # CuPy (replace with your cuda version)
25 | # Windows only
26 | conda install -c conda-forge cupy cudatoolkit=11.x
27 | # Others
28 | pip install cupy-cuda11x
29 |
30 | # PyTorch (replace with your cuda version)
31 | pip3 install torch --extra-index-url https://download.pytorch.org/whl/cu11x
32 | ```
33 |
34 | Next, you can install Py-BoostLSS.
35 |
36 | ```bash
37 | pip install git+https://github.com/StatMixedML/Py-BoostLSS.git
38 | ```
39 |
40 | ## How to use
41 | We refer to the [examples section](https://github.com/StatMixedML/Py-BoostLSS/tree/main/examples) for example notebooks.
42 |
43 | ## Available Distributions
44 | Py-BoostLSS currently supports the following distributions. More distributions will follow soon.
45 |
46 | | Distribution | Usage | Type | Support |
47 | | :----------------------------------------------------------: |:--------------: |:--------------------------------: | :-----------------------: |
48 | | Multivariate Normal (Cholesky) | `MVN()` | Continuous (Multivariate) | $y \in (-\infty,\infty)$ |
49 | | Multivariate Normal (Low-Rank Approximation) | `MVN_LRA()` | Continuous (Multivariate) | $y \in (-\infty,\infty)$ |
50 | | Multivariate Student-T (Cholesky) | `MVT()` | Continuous (Multivariate) | $y \in (-\infty,\infty)$ |
51 | | Dirichlet | `DIRICHLET()` | Continuous (Multivariate) | $y \in [0,1]$ |
52 |
53 |
54 |
60 |
61 |
62 |
63 |
64 | ## Feedback
65 | Please provide feedback on how to improve Py-BoostLSS, or request additional distributions, by opening a new issue or via the discussion section.
66 |
67 |
68 | ## Acknowledgements
69 |
70 | The implementation of Py-BoostLSS relies on the following resources:
71 |
72 | - [Py-boost: a research tool for exploring GBDTs](https://github.com/sb-ai-lab/Py-Boost)
73 | - [SketchBoost: Fast Gradient Boosted Decision Tree for Multioutput Problems](https://openreview.net/forum?id=WSxarC8t-T)
74 |
75 | We genuinely thank the original authors [Anton Vakhrushev](https://www.kaggle.com/btbpanda) and [Leonid Iosipoi](http://iosipoi.com/) for making their work publicly available.
76 |
77 | ## Reference Paper
78 | [arXiv:2210.06831](https://arxiv.org/abs/2210.06831)
79 | [arXiv:2204.00778](https://arxiv.org/abs/2204.00778)
80 | [arXiv:1907.03178](https://arxiv.org/abs/1907.03178)
81 |
82 |
89 |
90 |
91 |
92 |
--------------------------------------------------------------------------------
/examples/Dirichlet.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "22aca876-6dee-4cd3-a528-d2561a7ad301",
6 | "metadata": {},
7 | "source": [
8 | "\n",
9 | "# Dirichlet Example\n",
10 | "\n",
11 | "\n",
12 | " \n",
13 | " "
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "id": "c6cdf104-7c02-4888-968e-12b55187ef39",
19 | "metadata": {},
20 | "source": [
21 | "The Dirichlet distribution is commonly used for modelling non-negative compositional data, i.e., data that consist of sub-sets that are fractions of some total. Compositional data are typically represented as proportions or percentages summing to 100\\%, so that the Dirichlet extends the univariate beta-distribution to the multivariate case. Compositional data analysis (CoDa) is a branch of statistics that deals with multivariate observations carrying relative information and finds widespread use in ecology, economics or political science. As a result of the unit-sum constraint, models that use distributions designed for unconstrained data typically suffer from the problem of spurious correlation when applied to compositional data. "
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "id": "538d04c1-08b3-488b-984d-312bc9dad21a",
27 | "metadata": {},
28 | "source": [
29 | "In this example, we model and predict all parameters of a Dirichlet distribution with $Y_{D}=3$ target variables using the famous Arctic-Lake dataset. The density of the Dirichlet distribution with parameters $\\mathbf{\\alpha}_{\\mathbf{x}} = (\\alpha_{\\mathbf{x},1}, \\ldots, \\alpha_{\\mathbf{x},D}) \\in \\mathbb{R}^{D}_{+}$ with $\\sum^{D}_{d=1}y_{d}=1$ for all $y_{d}\\in \\left[0,1\\right]$ is given by"
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "id": "247dc33d-ac22-4e51-b500-d4b61fae21bf",
35 | "metadata": {},
36 | "source": [
37 | "$$\n",
38 | "f\\big(\\mathbf{y}|\\mathbf{\\theta}_{\\mathbf{x}}\\big) = \\frac{1}{\\mathrm{B}(\\mathbf{\\alpha}_{\\mathbf{x}})} \\prod_{d=1}^{D}y_{d}^{\\alpha_{\\mathbf{x},d}-1}\n",
39 | "$$"
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "id": "86229dd0-bccf-41a1-800f-18892f451a21",
45 | "metadata": {},
46 | "source": [
47 | "To ensure positivity, we use $\\exp(\\alpha_{\\mathbf{x},d})$ for all $d=1,\\ldots, D$. The estimated parameters have the interpretation of providing the probability of an event falling into category $d$, i.e., $\\mathbb{E}(y_{d}) = \\frac{\\alpha_{d}}{\\alpha_{0}}$, with $\\alpha_{0} = \\sum^{D}_{d=1}\\alpha_{d}$. For more details, we refer to our related paper **[März, Alexander (2022), *Multi-Target XGBoostLSS Regression*](https://arxiv.org/abs/2210.06831)**.\n",
48 | "\n",
49 | " \n",
50 | " "
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "id": "d7b4eea0-96b6-482a-afc9-7ec867bf2193",
56 | "metadata": {},
57 | "source": [
58 | "# Imports"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 1,
64 | "id": "0de768b2-ef44-434d-89e4-402170de0eb9",
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "import os\n",
69 | "import numpy as np\n",
70 | "import pandas as pd\n",
71 | "# Optional: set the device to run\n",
72 | "os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\"\n",
73 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
74 | "\n",
75 | "from pyboostlss.model import *\n",
76 | "from pyboostlss.distributions.DIRICHLET import *\n",
77 | "from pyboostlss.distributions.distribution_loss_metric import *\n",
78 | "from pyboostlss.utils import *\n",
79 | "from pyboostlss.datasets.data_loader import load_example_data\n",
80 | "\n",
81 | "import plotnine\n",
82 | "from plotnine import *\n",
83 | "plotnine.options.figure_size = (20, 10)"
84 | ]
85 | },
86 | {
87 | "cell_type": "markdown",
88 | "id": "8e90e9e4-5902-4299-9236-d31af4e29b2b",
89 | "metadata": {},
90 | "source": [
91 | "# Specify distribution and initialize model"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": 2,
97 | "id": "75224245-9b11-4730-a4c6-d7fe6b3a272c",
98 | "metadata": {},
99 | "outputs": [],
100 | "source": [
101 | "distribution = DIRICHLET(D=3) # Dirichlet distribution, where D specifies the number of target variables\n",
102 | "pyblss = PyBoostLSS(distribution) # Initializes model with specified distribution"
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "id": "43c36cd5-11f4-4c03-8297-c2555924bd4a",
108 | "metadata": {},
109 | "source": [
110 | "# Data"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": 3,
116 | "id": "78118dfc-6b05-4634-ae15-9fbf3d30b47b",
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "data_df = load_example_data(\"arcticlake.csv\")\n",
121 | "\n",
122 | "# Create 80%, 10%, 10% split for train, validation and test \n",
123 | "train, validate, test = np.split(data_df.sample(frac=1,random_state=123), [int(0.8*len(data_df)), int(0.9*len(data_df))])\n",
124 | "\n",
125 | "# Train\n",
126 | "x_train = train[\"depth\"].values.reshape(-1,1)\n",
127 | "y_train = train.drop(columns=\"depth\").values\n",
128 | "dtrain = {\"X\": x_train, \"y\": y_train}\n",
129 | "\n",
130 | "# Validation\n",
131 | "x_eval = validate[\"depth\"].values.reshape(-1,1)\n",
132 | "y_eval = validate.drop(columns=\"depth\").values\n",
133 | "eval_sets = [{'X': x_eval, 'y': y_eval}] # Specifies the eval_sets on which the model is evaluated\n",
134 | "\n",
135 | "# Test\n",
136 | "x_test = test[\"depth\"].values.reshape(-1,1)\n",
137 | "y_test = test.drop(columns=\"depth\").values"
138 | ]
139 | },
140 | {
141 | "cell_type": "markdown",
142 | "id": "e742234a-ad21-4a32-aa80-08c59d318503",
143 | "metadata": {},
144 | "source": [
145 | "# Hyper-Parameter Optimization via Optuna"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": 4,
151 | "id": "88d8403d-ea56-4d44-ba26-1f86632b7b78",
152 | "metadata": {},
153 | "outputs": [
154 | {
155 | "name": "stderr",
156 | "output_type": "stream",
157 | "text": [
158 | "\u001b[32m[I 2022-12-08 11:33:16,505]\u001b[0m A new study created in memory with name: Py-BoostLSS Hyper-Parameter Optimization\u001b[0m\n",
159 | "C:\\Users\\Alexander\\.julia\\v0.6\\Conda\\deps\\usr\\envs\\pyboost\\lib\\site-packages\\optuna\\progress_bar.py:49: ExperimentalWarning: Progress bar is experimental (supported from v1.2.0). The interface can change in the future.\n"
160 | ]
161 | },
162 | {
163 | "data": {
164 | "application/vnd.jupyter.widget-view+json": {
165 | "model_id": "08991189c4d8418e86d16f8f877de7a6",
166 | "version_major": 2,
167 | "version_minor": 0
168 | },
169 | "text/plain": [
170 | " 0%| | 0/10 [00:00, ?it/s]"
171 | ]
172 | },
173 | "metadata": {},
174 | "output_type": "display_data"
175 | },
176 | {
177 | "name": "stdout",
178 | "output_type": "stream",
179 | "text": [
180 | "[11:33:25] Stdout logging level is INFO.\n",
181 | "[11:33:25] GDBT train starts. Max iter 500, early stopping rounds 20\n",
182 | "[11:33:31] Iter 0; Sample 0, NLL-score = -5.127476677854759; \n",
183 | "[11:33:35] Early stopping at iter 190, best iter 170, best_score -10.250120833771959\n",
184 | "\u001b[32m[I 2022-12-08 11:33:36,535]\u001b[0m Trial 0 finished with value: -10.2501292414331 and parameters: {'lr': 0.2261559634886368, 'max_depth': 1, 'sketch_outputs': 7, 'lambda_l2': 30.88877600549751, 'colsample': 1.0, 'subsample': 1.0, 'min_gain_to_split': 71.47732298654863}. Best is trial 0 with value: -10.2501292414331.\u001b[0m\n",
185 | "[11:33:36] Stdout logging level is INFO.\n",
186 | "[11:33:36] GDBT train starts. Max iter 500, early stopping rounds 20\n",
187 | "[11:33:36] Iter 0; Sample 0, NLL-score = -4.7779778049753485; \n",
188 | "[11:33:38] Early stopping at iter 62, best iter 42, best_score -9.890179422452471\n",
189 | "\u001b[32m[I 2022-12-08 11:33:38,303]\u001b[0m Trial 1 finished with value: -9.890167654880095 and parameters: {'lr': 0.5377869141302691, 'max_depth': 2, 'sketch_outputs': 10, 'lambda_l2': 22.04734477729385, 'colsample': 1.0, 'subsample': 1.0, 'min_gain_to_split': 242.25332392584508}. Best is trial 0 with value: -10.2501292414331.\u001b[0m\n",
190 | "[11:33:38] Stdout logging level is INFO.\n",
191 | "[11:33:38] GDBT train starts. Max iter 500, early stopping rounds 20\n",
192 | "[11:33:38] Iter 0; Sample 0, NLL-score = -4.7779771517360095; \n",
193 | "[11:33:45] Early stopping at iter 297, best iter 277, best_score -9.79587912904152\n",
194 | "\u001b[32m[I 2022-12-08 11:33:46,318]\u001b[0m Trial 2 finished with value: -9.795877693087373 and parameters: {'lr': 0.13006341497658322, 'max_depth': 4, 'sketch_outputs': 9, 'lambda_l2': 29.47977136537219, 'colsample': 1.0, 'subsample': 1.0, 'min_gain_to_split': 411.4277549678564}. Best is trial 0 with value: -10.2501292414331.\u001b[0m\n",
195 | "[11:33:46] Stdout logging level is INFO.\n",
196 | "[11:33:46] GDBT train starts. Max iter 500, early stopping rounds 20\n",
197 | "[11:33:46] Iter 0; Sample 0, NLL-score = -4.777977646926431; \n",
198 | "[11:33:51] Early stopping at iter 229, best iter 209, best_score -9.617372769127643\n",
199 | "\u001b[32m[I 2022-12-08 11:33:51,969]\u001b[0m Trial 3 finished with value: -9.617365965817168 and parameters: {'lr': 0.3865616908849664, 'max_depth': 2, 'sketch_outputs': 2, 'lambda_l2': 14.504028702160326, 'colsample': 1.0, 'subsample': 1.0, 'min_gain_to_split': 214.03968800602453}. Best is trial 0 with value: -10.2501292414331.\u001b[0m\n",
200 | "[11:33:52] Stdout logging level is INFO.\n",
201 | "[11:33:52] GDBT train starts. Max iter 500, early stopping rounds 20\n",
202 | "[11:33:52] Iter 0; Sample 0, NLL-score = -4.777978848671246; \n",
203 | "[11:33:53] Early stopping at iter 59, best iter 39, best_score -10.45860715688298\n",
204 | "\u001b[32m[I 2022-12-08 11:33:53,525]\u001b[0m Trial 4 finished with value: -10.458604924345618 and parameters: {'lr': 0.919274500979721, 'max_depth': 3, 'sketch_outputs': 3, 'lambda_l2': 4.634818879126481, 'colsample': 1.0, 'subsample': 1.0, 'min_gain_to_split': 375.82206166522025}. Best is trial 4 with value: -10.458604924345618.\u001b[0m\n",
205 | "[11:33:53] Stdout logging level is INFO.\n",
206 | "[11:33:53] GDBT train starts. Max iter 500, early stopping rounds 20\n",
207 | "[11:33:53] Iter 0; Sample 0, NLL-score = -5.208636852165513; \n",
208 | "[11:33:57] Early stopping at iter 151, best iter 131, best_score -9.509026979370176\n",
209 | "\u001b[32m[I 2022-12-08 11:33:57,534]\u001b[0m Trial 5 finished with value: -9.50902935905508 and parameters: {'lr': 0.16105845253307519, 'max_depth': 2, 'sketch_outputs': 9, 'lambda_l2': 10.693996516830357, 'colsample': 1.0, 'subsample': 1.0, 'min_gain_to_split': 393.0733498821459}. Best is trial 4 with value: -10.458604924345618.\u001b[0m\n",
210 | "[11:33:57] Stdout logging level is INFO.\n",
211 | "[11:33:57] GDBT train starts. Max iter 500, early stopping rounds 20\n",
212 | "[11:33:57] Iter 0; Sample 0, NLL-score = -4.777977950293839; \n",
213 | "[11:33:58] Early stopping at iter 63, best iter 43, best_score -8.59497991615099\n",
214 | "\u001b[32m[I 2022-12-08 11:33:59,107]\u001b[0m Trial 6 finished with value: -8.594978287635527 and parameters: {'lr': 0.505295942635048, 'max_depth': 1, 'sketch_outputs': 4, 'lambda_l2': 8.756934127722555, 'colsample': 1.0, 'subsample': 1.0, 'min_gain_to_split': 460.6811644415763}. Best is trial 4 with value: -10.458604924345618.\u001b[0m\n",
215 | "[11:33:59] Stdout logging level is INFO.\n",
216 | "[11:33:59] GDBT train starts. Max iter 500, early stopping rounds 20\n",
217 | "[11:33:59] Iter 0; Sample 0, NLL-score = -4.777978030971868; \n",
218 | "[11:34:01] Early stopping at iter 85, best iter 65, best_score -9.566335474922479\n",
219 | "\u001b[32m[I 2022-12-08 11:34:01,309]\u001b[0m Trial 7 finished with value: -9.566331399467114 and parameters: {'lr': 0.6563189495925507, 'max_depth': 1, 'sketch_outputs': 3, 'lambda_l2': 19.24910924466795, 'colsample': 1.0, 'subsample': 1.0, 'min_gain_to_split': 183.3348016026964}. Best is trial 4 with value: -10.458604924345618.\u001b[0m\n",
220 | "[11:34:01] Stdout logging level is INFO.\n",
221 | "[11:34:01] GDBT train starts. Max iter 500, early stopping rounds 20\n",
222 | "[11:34:01] Iter 0; Sample 0, NLL-score = -4.777978130290063; \n",
223 | "[11:34:04] Early stopping at iter 145, best iter 125, best_score -8.423402068697614\n",
224 | "\u001b[32m[I 2022-12-08 11:34:04,995]\u001b[0m Trial 8 finished with value: -8.423397062040415 and parameters: {'lr': 0.5244406900404303, 'max_depth': 3, 'sketch_outputs': 1, 'lambda_l2': 2.200791726816238, 'colsample': 1.0, 'subsample': 1.0, 'min_gain_to_split': 432.4407876376129}. Best is trial 4 with value: -10.458604924345618.\u001b[0m\n",
225 | "[11:34:05] Stdout logging level is INFO.\n",
226 | "[11:34:05] GDBT train starts. Max iter 500, early stopping rounds 20\n",
227 | "[11:34:05] Iter 0; Sample 0, NLL-score = -4.777977557240683; \n",
228 | "[11:34:08] Early stopping at iter 157, best iter 137, best_score -8.691995518775983\n",
229 | "\u001b[32m[I 2022-12-08 11:34:09,090]\u001b[0m Trial 9 finished with value: -8.691999712136926 and parameters: {'lr': 0.3743890448500777, 'max_depth': 3, 'sketch_outputs': 4, 'lambda_l2': 21.375505387520626, 'colsample': 1.0, 'subsample': 1.0, 'min_gain_to_split': 363.2284392550362}. Best is trial 4 with value: -10.458604924345618.\u001b[0m\n",
230 | "\n",
231 | "Hyper-Parameter Optimization successfully finished.\n",
232 | " Number of finished trials: 10\n",
233 | " Best trial:\n",
234 | " Value: -10.458604924345618\n",
235 | " Params: \n",
236 | " lr: 0.919274500979721\n",
237 | " max_depth: 3\n",
238 | " sketch_outputs: 3\n",
239 | " lambda_l2: 4.634818879126481\n",
240 | " colsample: 1.0\n",
241 | " subsample: 1.0\n",
242 | " min_gain_to_split: 375.82206166522025\n",
243 | " opt_rounds: 39\n"
244 | ]
245 | }
246 | ],
247 | "source": [
248 | "np.random.seed(123)\n",
249 | "\n",
250 | "# Specifies the hyper-parameters and their value range\n",
251 | " # The structure is as follows: \"hyper-parameter\": [lower_bound, upper_bound]\n",
252 | " # Currently, only the following hyper-parameters can be optimized\n",
253 | " \n",
254 | "hp_dict = {\"lr\": [1e-3, 1], \n",
255 | " \"max_depth\": [1, 4],\n",
256 | " \"sketch_outputs\": [1,10],\n",
257 | " \"lambda_l2\": [0, 40], \n",
258 | " \"colsample\": [1.0, 1.0], # increased to max due to small size of dataset\n",
259 | " \"subsample\": [1.0, 1.0], # increased to max due to small size of dataset\n",
260 | " \"min_gain_to_split\": [0, 500] \n",
261 | " } \n",
262 | "\n",
263 | "opt_param = pyblss.hyper_opt(params=hp_dict,\n",
264 | " dtrain=dtrain,\n",
265 | " eval_sets=eval_sets,\n",
266 | " use_hess=True, \n",
267 | " sketch_method=\"proj\",\n",
268 | " hp_seed=123, # Seed for random number generator used in the Bayesian hyper-parameter search.\n",
269 | " ntrees=500, # Number of boosting iterations.\n",
270 | " es=20, # Early stopping rounds\n",
271 | " n_trials=10, # The number of trials. If this argument is set to None, there is no limitation on the number of trials.\n",
272 | " max_minutes=120, # Time budget in minutes, i.e., stop study after the given number of minutes.\n",
273 | " silence=False) # Controls the verbosity of the trial, i.e., user can silence the outputs of the trial."
274 | ]
275 | },
276 | {
277 | "cell_type": "markdown",
278 | "id": "53213379-e7b1-4a4f-8d85-dda5a729f608",
279 | "metadata": {},
280 | "source": [
281 | "# Model Training"
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": 5,
287 | "id": "11b9d38e-fee1-45fd-96cf-4ade74849593",
288 | "metadata": {},
289 | "outputs": [
290 | {
291 | "name": "stdout",
292 | "output_type": "stream",
293 | "text": [
294 | "[11:34:09] Stdout logging level is INFO.\n",
295 | "[11:34:09] GDBT train starts. Max iter 39, early stopping rounds 100\n",
296 | "[11:34:09] Iter 0; \n",
297 | "[11:34:09] Iter 38; \n"
298 | ]
299 | }
300 | ],
301 | "source": [
302 | "opt_params = opt_param.copy()\n",
303 | "\n",
304 | "pyboostlss_model = pyblss.train(dtrain=dtrain,\n",
305 | " lr=opt_params[\"lr\"], \n",
306 | " lambda_l2=opt_params[\"lambda_l2\"],\n",
307 | " max_depth=opt_params[\"max_depth\"],\n",
308 | " sketch_outputs=opt_params[\"sketch_outputs\"],\n",
309 | " colsample=opt_params[\"colsample\"],\n",
310 | " subsample=opt_params[\"subsample\"],\n",
311 | " min_gain_to_split=opt_params[\"min_gain_to_split\"],\n",
312 | " ntrees=opt_params[\"opt_rounds\"],\n",
313 | " use_hess=True,\n",
314 | " verbose=100, \n",
315 | " sketch_method=\"proj\",\n",
316 | " seed=123)"
317 | ]
318 | },
319 | {
320 | "cell_type": "markdown",
321 | "id": "48228365-d49a-4481-868a-35b7e97535f5",
322 | "metadata": {},
323 | "source": [
324 | "# Predict"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": 6,
330 | "id": "2c126862-1732-4033-a24f-c9e5287fc1ea",
331 | "metadata": {},
332 | "outputs": [
333 | {
334 | "data": {
335 | "text/html": [
336 | "
\n",
337 | "\n",
350 | "
\n",
351 | " \n",
352 | "
\n",
353 | "
\n",
354 | "
alpha_1
\n",
355 | "
alpha_2
\n",
356 | "
alpha_3
\n",
357 | "
\n",
358 | " \n",
359 | " \n",
360 | "
\n",
361 | "
0
\n",
362 | "
8.925608
\n",
363 | "
5.909983
\n",
364 | "
1.225174
\n",
365 | "
\n",
366 | "
\n",
367 | "
1
\n",
368 | "
8.925608
\n",
369 | "
5.909983
\n",
370 | "
1.225174
\n",
371 | "
\n",
372 | "
\n",
373 | "
2
\n",
374 | "
1.241373
\n",
375 | "
10.116554
\n",
376 | "
4.594018
\n",
377 | "
\n",
378 | "
\n",
379 | "
3
\n",
380 | "
1.782198
\n",
381 | "
11.356755
\n",
382 | "
9.836135
\n",
383 | "
\n",
384 | "
\n",
385 | "
4
\n",
386 | "
8.925608
\n",
387 | "
5.909983
\n",
388 | "
1.225174
\n",
389 | "
\n",
390 | " \n",
391 | "
\n",
392 | "
"
393 | ],
394 | "text/plain": [
395 | " alpha_1 alpha_2 alpha_3\n",
396 | "0 8.925608 5.909983 1.225174\n",
397 | "1 8.925608 5.909983 1.225174\n",
398 | "2 1.241373 10.116554 4.594018\n",
399 | "3 1.782198 11.356755 9.836135\n",
400 | "4 8.925608 5.909983 1.225174"
401 | ]
402 | },
403 | "execution_count": 6,
404 | "metadata": {},
405 | "output_type": "execute_result"
406 | }
407 | ],
408 | "source": [
409 | "# Predicts transformed parameters of the specified distribution. \n",
410 | "predt_params = distribution.predict(model=pyboostlss_model,\n",
411 | " X_test=x_train, # Here we use the train dataset to later infer the partial dependence of the parameters on x\n",
412 | " pred_type=\"parameters\")\n",
413 | "predt_params.head()"
414 | ]
415 | },
416 | {
417 | "cell_type": "markdown",
418 | "id": "6dc11354-6241-4f53-b860-e12747660b75",
419 | "metadata": {},
420 | "source": [
421 | "Please note that the predicted parameters are not yet on the response scale. However, they can easily be transformed as described above: after dividing by their sum, the estimated parameters can be interpreted as the probability of an event falling into category $d$, i.e., $\\mathbb{E}(y_{d}) = \\frac{\\alpha_{d}}{\\alpha_{0}}$, with $\\alpha_{0} = \\sum^{D}_{d=1}\\alpha_{d}$."
422 | ]
423 | },
424 | {
425 | "cell_type": "code",
426 | "execution_count": 10,
427 | "id": "9f21270c-a600-4104-8c99-60bcfa0e802f",
428 | "metadata": {},
429 | "outputs": [
430 | {
431 | "data": {
432 | "text/html": [
433 | "
\n",
434 | "\n",
447 | "
\n",
448 | " \n",
449 | "
\n",
450 | "
\n",
451 | "
sand
\n",
452 | "
silt
\n",
453 | "
clay
\n",
454 | "
\n",
455 | " \n",
456 | " \n",
457 | "
\n",
458 | "
0
\n",
459 | "
0.555740
\n",
460 | "
0.367976
\n",
461 | "
0.076284
\n",
462 | "
\n",
463 | "
\n",
464 | "
1
\n",
465 | "
0.555740
\n",
466 | "
0.367976
\n",
467 | "
0.076284
\n",
468 | "
\n",
469 | "
\n",
470 | "
2
\n",
471 | "
0.077820
\n",
472 | "
0.634189
\n",
473 | "
0.287991
\n",
474 | "
\n",
475 | "
\n",
476 | "
3
\n",
477 | "
0.077571
\n",
478 | "
0.494307
\n",
479 | "
0.428122
\n",
480 | "
\n",
481 | "
\n",
482 | "
4
\n",
483 | "
0.555740
\n",
484 | "
0.367976
\n",
485 | "
0.076284
\n",
486 | "
\n",
487 | " \n",
488 | "
\n",
489 | "
"
490 | ],
491 | "text/plain": [
492 | " sand silt clay\n",
493 | "0 0.555740 0.367976 0.076284\n",
494 | "1 0.555740 0.367976 0.076284\n",
495 | "2 0.077820 0.634189 0.287991\n",
496 | "3 0.077571 0.494307 0.428122\n",
497 | "4 0.555740 0.367976 0.076284"
498 | ]
499 | },
500 | "execution_count": 10,
501 | "metadata": {},
502 | "output_type": "execute_result"
503 | }
504 | ],
505 | "source": [
506 | "# Transform to response scale\n",
507 | "predt_params_transf = predt_params / predt_params.sum(axis=1)\n",
508 | "predt_params_transf = predt_params.div(predt_params.sum(axis=1), axis=0)\n",
509 | "predt_params_transf.columns = data_df.iloc[:,:3].columns\n",
510 | "\n",
511 | "predt_params_transf.head()"
512 | ]
513 | },
514 | {
515 | "cell_type": "code",
516 | "execution_count": 8,
517 | "id": "c7e1a565-b8a6-4864-861e-75a81d558528",
518 | "metadata": {},
519 | "outputs": [
520 | {
521 | "data": {
522 | "text/plain": [
523 | "(10000, 4, 3)"
524 | ]
525 | },
526 | "execution_count": 8,
527 | "metadata": {},
528 | "output_type": "execute_result"
529 | }
530 | ],
531 | "source": [
532 | "# Draws random samples from the predicted distribution\n",
533 | "torch.manual_seed(123)\n",
534 | "n_samples = 10000\n",
535 | "predt_samples = distribution.predict(model=pyboostlss_model,\n",
536 | " X_test=x_test, \n",
537 | " pred_type=\"samples\", \n",
538 | " n_samples=n_samples)\n",
539 | "\n",
540 | "predt_samples.shape # Output-shape is (n_samples, n_obs, n_target)"
541 | ]
542 | },
543 | {
544 | "cell_type": "markdown",
545 | "id": "6d32e5fd-ada9-41a8-896e-b446e606a3d2",
546 | "metadata": {},
547 | "source": [
548 | "# Partial Dependence Plot"
549 | ]
550 | },
551 | {
552 | "cell_type": "markdown",
553 | "id": "178932c6-32b8-4b45-a264-db6844e62d7d",
554 | "metadata": {},
555 | "source": [
556 | "Since there is only one covariate in the dataset, we can infer the effect of depth (in meters) on the sediment composition using a scatter-smooth estimate. The figure shows that with increasing depth, the relative frequency of sand decreases while the proportion of silt and clay increases."
557 | ]
558 | },
559 | {
560 | "cell_type": "code",
561 | "execution_count": 9,
562 | "id": "2e79d446-6617-46ce-ae8b-c5209f26abb8",
563 | "metadata": {},
564 | "outputs": [
565 | {
566 | "data": {
567 | "image/png": "\n",
568 | "text/plain": [
569 | ""
570 | ]
571 | },
572 | "metadata": {},
573 | "output_type": "display_data"
574 | },
575 | {
576 | "name": "stdout",
577 | "output_type": "stream",
578 | "text": [
579 | "\n"
580 | ]
581 | }
582 | ],
583 | "source": [
584 | "plot_df = predt_params_transf\n",
585 | "plot_df[\"depth\"] = x_train\n",
586 | "\n",
587 | "plot_df = pd.melt(plot_df,\n",
588 | " id_vars=\"depth\")\n",
589 | "\n",
590 | "param_plot = (ggplot(plot_df,\n",
591 | " aes(x=\"depth\",\n",
592 | " y=\"value\")) + \n",
593 | " geom_point() + \n",
594 | " geom_smooth(span=0.7, se=False) + \n",
595 | " facet_wrap(\"variable\",\n",
596 | " scales=\"free\") + \n",
597 | " theme_bw() + \n",
598 | " theme(subplots_adjust={\"wspace\": 0.25}) + \n",
599 | " labs(title = \"Partial-Dependence-Plot of Dirichlet-Parameters estimated via Py-BoostLSS\\n\",\n",
600 | " y=\"Parameter Estimate\",\n",
601 | " x=\"Depth (in meters)\") \n",
602 | " )\n",
603 | "\n",
604 | "print(param_plot)"
605 | ]
606 | }
607 | ],
608 | "metadata": {
609 | "kernelspec": {
610 | "display_name": "Python 3 (ipykernel)",
611 | "language": "python",
612 | "name": "python3"
613 | },
614 | "language_info": {
615 | "codemirror_mode": {
616 | "name": "ipython",
617 | "version": 3
618 | },
619 | "file_extension": ".py",
620 | "mimetype": "text/x-python",
621 | "name": "python",
622 | "nbconvert_exporter": "python",
623 | "pygments_lexer": "ipython3",
624 | "version": "3.9.15"
625 | }
626 | },
627 | "nbformat": 4,
628 | "nbformat_minor": 5
629 | }
630 |
--------------------------------------------------------------------------------
/pyboostlss/__init__.py:
--------------------------------------------------------------------------------
1 | """Py-BoostLSS - An extension of Py-Boost to probabilistic modelling"""
--------------------------------------------------------------------------------
/pyboostlss/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | """Py-BoostLSS - An extension of Py-Boost to probabilistic modelling"""
--------------------------------------------------------------------------------
/pyboostlss/datasets/arcticlake.csv:
--------------------------------------------------------------------------------
1 | sand,silt,clay,depth
2 | 0.775,0.195,0.03,10.4
3 | 0.719,0.249,0.032,11.7
4 | 0.507,0.361,0.132,12.8
5 | 0.52357,0.41023,0.0662,13
6 | 0.7,0.265,0.035,15.7
7 | 0.665,0.322,0.013,16.3
8 | 0.431,0.553,0.016,18
9 | 0.534,0.368,0.098,18.7
10 | 0.155,0.544,0.301,20.7
11 | 0.317,0.415,0.268,22.1
12 | 0.657,0.278,0.065,22.4
13 | 0.704,0.29,0.006,24.4
14 | 0.174,0.536,0.29,25.8
15 | 0.106,0.698,0.196,32.5
16 | 0.382,0.431,0.187,33.6
17 | 0.108,0.527,0.365,36.8
18 | 0.184,0.507,0.309,37.8
19 | 0.046,0.474,0.48,36.9
20 | 0.156,0.504,0.34,42.2
21 | 0.319,0.451,0.23,47
22 | 0.095,0.535,0.37,47.1
23 | 0.171,0.48,0.349,48.4
24 | 0.105,0.554,0.341,49.4
25 | 0.04776,0.54428,0.40796,49.5
26 | 0.026,0.452,0.522,59.2
27 | 0.114,0.527,0.359,60.1
28 | 0.067,0.469,0.464,61.7
29 | 0.069,0.497,0.434,62.4
30 | 0.04,0.449,0.511,69.3
31 | 0.07407,0.51652,0.40941,73.6
32 | 0.048,0.495,0.457,74.4
33 | 0.045,0.485,0.47,78.5
34 | 0.066,0.521,0.413,82.9
35 | 0.06707,0.47347,0.45946,87.7
36 | 0.07407,0.45646,0.46947,88.1
37 | 0.06,0.489,0.451,90.4
38 | 0.063,0.538,0.399,90.6
39 | 0.025,0.48,0.495,97.7
40 | 0.02,0.478,0.502,103.7
41 |
--------------------------------------------------------------------------------
/pyboostlss/datasets/data_loader.py:
--------------------------------------------------------------------------------
1 | import pkg_resources
2 | import pandas as pd
3 |
4 |
def load_example_data(dta_name: str) -> pd.DataFrame:
    """Return one of the example datasets shipped with the package.

    Args:
        dta_name: File name of the CSV resource, resolved relative to this
            module through ``pkg_resources`` (e.g. ``"arcticlake.csv"``).

    Returns:
        The parsed CSV contents as a DataFrame.
    """
    # ``resource_stream`` hands back an open binary handle. Close it as soon as
    # pandas has consumed it instead of leaving that to the garbage collector.
    with pkg_resources.resource_stream(__name__, dta_name) as data_stream:
        return pd.read_csv(data_stream)
--------------------------------------------------------------------------------
/pyboostlss/distributions/DIRICHLET.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.distributions.dirichlet import Dirichlet
3 | from dirichlet.dirichlet import mle as dirichlet_mle
4 | import cupy as cp
5 | import numpy as np
6 | import pandas as pd
7 | from pyboostlss.utils import *
8 |
9 |
10 |
11 | ########################################################################################################################
12 | ############################################### Dirichlet Distribution #########################################
13 | ########################################################################################################################
14 |
class DIRICHLET:
    """Dirichlet distribution class.

    One concentration parameter ``alpha_d`` is estimated per target dimension.
    Raw model outputs are passed through the response function registered in
    ``create_param_dict`` (``exp_fn``) before they are handed to
    ``torch.distributions.Dirichlet``.
    """

    def __init__(self, D: int):
        self.D = D  # specifies target dimension

    def initialize(self, y_true: cp.ndarray, n_target: int) -> cp.ndarray:
        """Calculate the starting values of the distributional parameters.

        Args:
            y_true: Data from which the starting values are calculated.
            n_target: Number of target variables. Not used here; it is part of
                the signature shared by the distribution classes.

        Returns:
            Log of the unconditional Dirichlet maximum-likelihood estimates,
            as a cupy array.
        """
        # The external MLE routine works on numpy data, hence the round trip.
        start_values = np.log(dirichlet_mle(cp.asnumpy(y_true)))

        return cp.array(start_values)

    def n_dist_param(self, n_targets: int) -> int:
        """Infer the number of distributional parameters from the target dimension.

        There is one ``alpha`` per target, so the value is returned unchanged.
        """
        return n_targets

    def target_append(self, y_true: np.ndarray, n_param: int) -> cp.ndarray:
        """Return the target as a cupy array.

        The number of parameters equals the number of targets, so nothing has
        to be appended; ``n_param`` is accepted for interface compatibility.
        """
        return cp.array(y_true)

    def create_param_dict(self, n_target):
        """Map each parameter name to its response function.

        Returns:
            Dict with keys ``alpha_1`` ... ``alpha_<n_target>`` (in that
            order), each mapped to ``exp_fn``.
        """
        return {"alpha_" + str(i + 1): exp_fn for i in range(n_target)}

    def get_params_nll(self, y_true: cp.ndarray, y_pred: cp.ndarray, requires_grad=False) -> tuple:
        """Return the estimated parameters and the negative log-likelihood.

        Args:
            y_true: cp.ndarray, input target variables.
            y_pred: cp.ndarray, predictions with one column per parameter.
            requires_grad: Whether the parameter tensors require gradients for
                automatic differentiation.

        Returns:
            ``(predt, nll)``: ``predt`` is a list with one ``(n_obs, 1)`` CUDA
            tensor of raw (untransformed) predictions per parameter, ``nll``
            is the NaN-ignoring sum of the negative log-likelihood.
        """
        n_target = n_param = self.D
        param_dict = self.create_param_dict(n_target)

        # Target
        target = torch.as_tensor(y_true, device="cuda").reshape(-1, n_target)

        # Parameters: one leaf tensor per parameter, so that gradients can be
        # taken with respect to each column separately.
        predt = [
            torch.tensor(
                y_pred[:, i].reshape(-1, 1), device="cuda", requires_grad=requires_grad
            )
            for i in range(n_param)
        ]

        # Alpha: apply the response function to each raw prediction.
        predt_alpha = torch.concat(
            [response_fun(predt[i]) for i, response_fun in enumerate(param_dict.values())],
            axis=1,
        )

        # NLL
        dist_fit = Dirichlet(predt_alpha)
        nll = -torch.nansum(dist_fit.log_prob(target))

        return predt, nll

    def predict(self,
                model,
                X_test: np.array,
                pred_type: str = "parameters",
                n_samples: int = 100
                ):
        """Predict distributional parameters or draw samples.

        Args:
            model: Fitted model; its ``predict(X_test)`` output is used as the
                raw parameter predictions.
            X_test: Test data features.
            pred_type: What is to be predicted:
                "parameters": the predicted distributional parameters.
                "samples": ``n_samples`` draws from the predicted distribution.
            n_samples: Number of draws if ``pred_type="samples"``.

        Returns:
            For "parameters", a pd.DataFrame with one ``alpha_*`` column per
            target. For "samples", a numpy array of shape
            ``(n_samples, n_obs, n_target)``.

        Raises:
            ValueError: If ``pred_type`` is not one of the supported values.
        """
        # Fail loudly instead of silently returning None for a misspelled type.
        if pred_type not in ("parameters", "samples"):
            raise ValueError(
                f"pred_type must be 'parameters' or 'samples', got {pred_type!r}"
            )

        n_target = self.D
        param_dict = self.create_param_dict(n_target)
        dist_params = list(param_dict.keys())

        # Predicted parameters, transformed by their response functions
        params_predt = torch.tensor(model.predict(X_test), device="cuda")
        params_predt = torch.cat(
            [
                response_fun(params_predt[:, i]).reshape(-1, 1)
                for i, response_fun in enumerate(param_dict.values())
            ],
            axis=1,
        )

        # Predicted distribution
        dirichlet_pred = Dirichlet(params_predt)

        # Output DataFrame
        predt_params = pd.DataFrame(params_predt.cpu().detach().numpy(), columns=dist_params)

        if pred_type == "parameters":
            return predt_params

        # NOTE: this re-seeds the global torch RNG, so repeated calls return
        # identical draws.
        torch.manual_seed(123)
        y_samples = dirichlet_pred.sample((n_samples,)).cpu().detach().numpy()
        return y_samples
168 |
--------------------------------------------------------------------------------
/pyboostlss/distributions/MVN.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.distributions.multivariate_normal import MultivariateNormal
3 | import cupy as cp
4 | import numpy as np
5 | import pandas as pd
6 | from pyboostlss.utils import *
7 |
8 |
9 |
10 | ########################################################################################################################
11 | ############################################### Multivariate Normal ##########################################
12 | ########################################################################################################################
13 |
class MVN:
    r"""Multivariate Normal distribution class.

    The covariance matrix \Sigma is estimated via its Cholesky decomposition
    \Sigma = L L^T, where L is lower triangular.
    """

    def __init__(self, D: int):
        self.D = D  # specifies target dimension

    def initialize(self, y_true: cp.ndarray, n_target: int) -> cp.ndarray:
        """Calculate the starting values of the distributional parameters.

        Args:
            y_true: Data from which the starting values are calculated; only
                the first ``n_target`` columns are used.
            n_target: Number of target variables.

        Returns:
            Column means followed by the lower-triangular entries of the
            Cholesky factor of the sample covariance, with the factor's
            diagonal stored on the log scale.
        """
        # Indices
        tril_indices = cp.asarray(np.tril_indices(n_target))

        # Target
        target = y_true[:, :n_target]

        # Location
        loc_init = cp.mean(target, axis=0)

        # Tril
        tril_init = cp.cov(target, rowvar=False)
        tril_init = cp.linalg.cholesky(tril_init)
        # The diagonal is later mapped through exp_fn, so start on the log scale.
        cp.fill_diagonal(tril_init, cp.log(cp.diagonal(tril_init)))
        tril_init = tril_init[tril_indices[0], tril_indices[1]]
        start_values = cp.concatenate([loc_init, tril_init])

        return start_values

    def n_dist_param(self, n_targets: int) -> int:
        """Infer the number of distributional parameters from the target dimension.

        This is ``n_targets`` locations plus ``n_targets * (n_targets + 1) / 2``
        lower-triangular entries.
        """
        n_param = int((n_targets * (n_targets + 3)) / 2)

        return n_param

    def target_append(self, y_true: np.ndarray, n_param: int) -> np.ndarray:
        """Pad the target with columns of ones up to ``n_param`` columns."""
        n_obs = y_true.shape[0]
        n_target = y_true.shape[1]
        n_fill = n_param - n_target
        np_fill = np.ones((n_obs, n_fill))
        y_append = np.concatenate([y_true, np_fill], axis=1)

        return y_append

    def tril_dim(self, n_target: int) -> int:
        """Infer the number of lower-triangular elements from the number of targets."""
        n_tril = int((n_target * (n_target + 1)) / 2)

        return n_tril

    def rho_dim(self, n_target: int) -> int:
        """Infer the number of correlations from the number of targets."""
        n_rho = int((n_target * (n_target - 1)) / 2)
        return n_rho

    def create_param_dict(self, n_target, tril_indices):
        """Map every distributional parameter name to its response function.

        Returns:
            Dict with the ``location_*`` entries (``identity_fn``) first,
            followed by the entries of ``create_tril_dict``.
        """
        # Location
        param_dict = {"location_" + str(i + 1): identity_fn for i in range(n_target)}

        # Tril: reuse the dedicated builder instead of duplicating its loop.
        param_dict.update(self.create_tril_dict(n_target, tril_indices))

        return param_dict

    def create_tril_dict(self, n_target, tril_indices):
        """Map the lower-triangular parameter names to their response functions.

        Args:
            n_target: Number of target variables.
            tril_indices: Torch tensor of zero-based (row, col) indices of the
                lower triangle, e.g. from ``torch.tril_indices``.

        Returns:
            Dict in the order of ``tril_indices``: diagonal entries are named
            ``scale_<col>`` and use ``exp_fn``; off-diagonal entries are named
            ``rho_<row><col>`` and use ``identity_fn``. Indices in the names
            are one-based.
        """
        n_tril = self.tril_dim(n_target)
        tril_idx = tril_indices.detach().numpy() + 1  # one-based for naming

        tril_dict = {}
        for i in range(n_tril):
            row, col = tril_idx[:, i]
            if row == col:
                tril_dict["scale_" + str(col)] = exp_fn
            else:
                tril_dict["rho_" + str(row) + str(col)] = identity_fn

        return tril_dict

    def get_params_nll(self, y_true: cp.ndarray, y_pred: cp.ndarray, requires_grad=False) -> tuple:
        """Return the estimated parameters and the negative log-likelihood.

        Args:
            y_true: cp.ndarray, input target variables. The number of columns
                is taken as the number of parameters; the first ``self.D``
                columns hold the actual target.
            y_pred: cp.ndarray, predictions with one column per parameter.
            requires_grad: Whether the parameter tensors require gradients for
                automatic differentiation.

        Returns:
            ``(predt, nll)``: ``predt`` is a list with one ``(n_obs, 1)`` CUDA
            tensor of raw predictions per parameter, ``nll`` is the
            NaN-ignoring sum of the negative log-likelihood.
        """
        # Initialize
        n_obs = y_true.shape[0]
        n_param = y_true.shape[1]
        n_target = self.D
        tril_indices = torch.tril_indices(row=n_target, col=n_target, offset=0)
        tril_param_dict = self.create_tril_dict(n_target, tril_indices)

        # Target
        target = torch.as_tensor(y_true[:, :n_target], device="cuda").reshape(-1, n_target)

        # Parameters: one leaf tensor per parameter, so that gradients can be
        # taken with respect to each column separately.
        predt = [
            torch.tensor(y_pred[:, i].reshape(-1, 1), device="cuda", requires_grad=requires_grad)
            for i in range(n_param)
        ]

        # Location
        predt_location = torch.concat(predt[:n_target], axis=1)

        # Tril: transform each entry, then scatter into (n_obs, D, D) matrices.
        tril_predt = predt[n_target:]
        tril_predt = [
            response_fun(tril_predt[i])
            for i, response_fun in enumerate(tril_param_dict.values())
        ]
        tril_predt = torch.concat(tril_predt, axis=1)
        predt_tril = torch.zeros(n_obs, n_target, n_target, dtype=tril_predt.dtype, device="cuda")
        predt_tril[:, tril_indices[0], tril_indices[1]] = tril_predt

        # NLL
        dist_fit = MultivariateNormal(loc=predt_location, scale_tril=predt_tril)
        nll = -torch.nansum(dist_fit.log_prob(target))

        return predt, nll

    def predict(self,
                model,
                X_test: np.array,
                pred_type: str = "parameters",
                n_samples: int = 100
                ):
        """Predict distributional parameters or draw samples.

        Args:
            model: Fitted model; its ``predict(X_test)`` output is used as the
                raw parameter predictions.
            X_test: Test data features.
            pred_type: What is to be predicted:
                "parameters": the predicted distributional parameters.
                "samples": ``n_samples`` draws from the predicted distribution.
            n_samples: Number of draws if ``pred_type="samples"``.

        Returns:
            For "parameters", a pd.DataFrame with the ``location_*`` columns,
            the ``scale_*`` columns (standard deviations of the predicted
            distribution) and the ``rho_*`` columns (obtained by applying
            ``calc_corr`` to each covariance matrix). For "samples", a numpy
            array of shape ``(n_samples, n_obs, n_target)``.

        Raises:
            ValueError: If ``pred_type`` is not one of the supported values.
        """
        # Fail loudly instead of silently returning None for a misspelled type.
        if pred_type not in ("parameters", "samples"):
            raise ValueError(
                f"pred_type must be 'parameters' or 'samples', got {pred_type!r}"
            )

        n_target = self.D
        n_tril = self.tril_dim(n_target)
        n_rho = self.rho_dim(n_target)
        tril_indices = torch.tril_indices(row=n_target, col=n_target, offset=0)
        param_dict = self.create_param_dict(n_target, tril_indices)
        dist_params = list(param_dict.keys())

        # Predicted parameters, transformed by their response functions
        params_predt = torch.tensor(model.predict(X_test), device="cuda")
        params_predt = [
            response_fun(params_predt[:, i]).reshape(-1, 1)
            for i, response_fun in enumerate(param_dict.values())
        ]

        # Location
        predt_location = torch.cat(params_predt[:n_target], axis=1)
        predt_location_df = pd.DataFrame(predt_location.cpu().detach().numpy())
        predt_location_df.columns = [param for param in dist_params if "location_" in param]

        # Tril
        n_obs = X_test.shape[0]
        tril_predt = torch.cat(params_predt[n_target:], axis=1).reshape(-1, n_tril)
        predt_tril = torch.zeros(n_obs, n_target, n_target, dtype=tril_predt.dtype, device="cuda")
        predt_tril[:, tril_indices[0], tril_indices[1]] = tril_predt

        # Predicted distribution
        mvn_pred = MultivariateNormal(loc=predt_location, scale_tril=predt_tril)

        # Sigma
        predt_sigma = mvn_pred.stddev.cpu().detach().numpy()
        predt_sigma_df = pd.DataFrame(predt_sigma)
        predt_sigma_df.columns = [param for param in dist_params if "scale_" in param]

        # Rho
        cov_mat = mvn_pred.covariance_matrix
        predt_rho = torch.cat(
            [calc_corr(cov_mat[i]).reshape(-1, n_rho) for i in range(n_obs)], axis=0
        )
        predt_rho_df = pd.DataFrame(predt_rho.cpu().detach().numpy())
        predt_rho_df.columns = [param for param in dist_params if "rho_" in param]

        # Output DataFrame
        predt_params = pd.concat([predt_location_df, predt_sigma_df, predt_rho_df], axis=1)

        if pred_type == "parameters":
            return predt_params

        # NOTE: this re-seeds the global torch RNG, so repeated calls return
        # identical draws.
        torch.manual_seed(123)
        y_samples = mvn_pred.sample((n_samples,)).cpu().detach().numpy()
        return y_samples
273 |
--------------------------------------------------------------------------------
/pyboostlss/distributions/MVN_LRA.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.distributions.lowrank_multivariate_normal import LowRankMultivariateNormal
3 | import torch.optim as optim
4 | import cupy as cp
5 | import numpy as np
6 | import pandas as pd
7 | from pyboostlss.utils import *
8 |
9 |
10 |
11 | ########################################################################################################################
12 | ############################################### Multivariate Normal ##########################################
13 | ########################################################################################################################
14 |
15 | class MVN_LRA:
16 | """Multivariate Normal Distribution Class, where covariance matrix \Sigma is estimated via a low-rank approximation (LRA).
17 | """
18 |
19 | def __init__(self,
20 | r:int,
21 | D:int):
22 | self.r = r # specifies rank
23 | self.D = D # specifies target dimension
24 | self.dtype = torch.float32
25 |
26 |
27 |
def initialize(self, y_true: cp.ndarray, n_target: list) -> cp.ndarray:
    """Estimate unconditional starting values with L-BFGS.

    A single parameter vector shared by all observations is fitted by
    minimising the negative log-likelihood of a low-rank multivariate normal.

    Args:
        y_true: Data from which the starting values are calculated; only the
            first ``self.D`` columns are used.
        n_target: List holding the number of targets and the rank parameter;
            it is only forwarded to ``n_dist_param``.

    Returns:
        Flat cupy array with one starting value per distributional parameter.
    """
    # The seed is fixed before the random initialisation of the parameters.
    torch.manual_seed(123)

    n_init = self.n_dist_param(n_target)
    dim = self.D
    theta = torch.nn.init.xavier_uniform_(
        torch.ones(1, n_init, device="cuda", dtype=self.dtype)
    )
    theta.requires_grad = True
    observed = torch.tensor(y_true[:, :dim], device="cuda", dtype=self.dtype)

    def neg_loglik(data, params):
        """Negative log-likelihood of ``data`` under the parameter row ``params``."""
        d = self.D
        n_theta = self.n_dist_param([self.D, self.r])
        rank = self.r

        # One (1, 1) column per parameter: location first, then the low-rank
        # factor, then the diagonal.
        columns = [params[:, j].reshape(-1, 1) for j in range(n_theta)]

        location = torch.concat(columns[:d], axis=1)

        # Low-rank factor, shaped (n_obs, n_target, rank)
        cov_factor = torch.concat(columns[d:(n_theta - d)], axis=1).reshape(-1, d, rank)

        # Diagonal part must be positive, hence the exp response function.
        cov_diag = torch.concat([exp_fn(col) for col in columns[-d:]], axis=1)

        fitted = LowRankMultivariateNormal(
            loc=location,
            cov_factor=cov_factor,
            cov_diag=cov_diag,
            validate_args=False,
        )
        return -torch.nansum(fitted.log_prob(data[:, :d]))

    optimizer = optim.LBFGS(
        params=[theta],
        lr=1e-03,
        history_size=10,
        max_iter=4,
        line_search_fn="strong_wolfe",
    )

    def closure():
        # Re-evaluate the objective and its gradient for the optimizer.
        optimizer.zero_grad()
        loss = neg_loglik(observed, theta)
        loss.backward()
        return loss

    for _ in range(20):
        optimizer.step(closure)

    # ``theta`` is the tensor registered with the optimizer and is updated in place.
    return cp.array(theta.cpu().detach()).reshape(-1,)
106 |
107 |
108 |
109 |
110 | # def initialize(self, y_true: cp.ndarray, n_target: list) -> cp.ndarray:
111 | # """ Function that initializes each distributional parameter with ones. Compared to the LBFGS, this is more runtime efficient.
112 | # y_true: cp.ndarray
113 | # Data from which starting values are calculated.
114 | # n_target: list
115 | # List that holds number of targets and rank-parameter.
116 | # """
117 | # n_param = self.n_dist_param(n_target)
118 | # start_values = cp.ones((n_param,))
119 |
120 | # return start_values
121 |
122 |
123 |
124 |
def n_dist_param(self, n_targets: list) -> int:
    """Return the number of distributional parameters.

    ``n_targets[0]`` is multiplied by ``2 + n_targets[1]`` and the product is
    truncated to an int.
    """
    dim = n_targets[0]
    rank = n_targets[1]
    return int(dim * (2 + rank))
131 |
132 |
def target_append(self, y_true: np.ndarray, n_param: int) -> np.ndarray:
    """Pad the target with columns of ones so that it has ``n_param`` columns."""
    rows, cols = y_true.shape[0], y_true.shape[1]
    padding = np.ones((rows, n_param - cols))
    return np.concatenate((y_true, padding), axis=1)
143 |
144 |
145 |
def tril_dim(self, n_target: int) -> int:
    """Return the number of lower-triangular elements (diagonal included)
    of an ``n_target`` x ``n_target`` matrix.
    """
    return int(n_target * (n_target + 1) / 2)
152 |
153 |
def rho_dim(self, n_target: int) -> int:
    """Return the number of pairwise correlations among ``n_target`` targets."""
    return int(n_target * (n_target - 1) / 2)
159 |
160 |
def create_param_dict(self, n_target):
    """Map every distributional parameter name to its response function.

    The ``n_target`` argument is overridden by ``self.D``. Keys are inserted
    in the order location, low-rank factor, low-rank diagonal.
    """
    n_target = self.D
    n_factor = n_target * self.r

    param_dict = {}

    # Location
    for k in range(1, n_target + 1):
        param_dict[f"location_{k}"] = identity_fn

    # Low Rank Factor
    for k in range(1, n_factor + 1):
        param_dict[f"lrf_{k}"] = identity_fn

    # Low Rank Diagonal
    for k in range(1, n_target + 1):
        param_dict[f"lrd_{k}"] = exp_fn

    return param_dict
179 |
180 |
def param_names(self, n_target):
    """Build the list of reported parameter names.

    The list holds ``location_<i>`` for every target, followed by one entry per
    lower-triangular position in the order given by ``torch.tril_indices``:
    ``scale_<i>`` on the diagonal and ``rho_<row><col>`` off the diagonal
    (indices are 1-based).
    """
    n_tril = self.tril_dim(n_target)

    # Location
    names = ["location_" + str(k + 1) for k in range(n_target)]

    # Tril: shift to 1-based indices so names start at 1
    lower = torch.tril_indices(row=n_target, col=n_target, offset=0)
    rows, cols = lower.detach().numpy() + 1

    for pos in range(n_tril):
        row, col = rows[pos], cols[pos]
        if row == col:
            names.append("scale_" + str(col))
        else:
            names.append("rho_" + str(row) + str(col))

    return names
204 |
205 |
206 |
207 |
def get_params_nll(self, y_true: cp.ndarray, y_pred: cp.ndarray, requires_grad=False) -> torch.tensor:
    """Turn raw predictions into distribution parameters and evaluate the NLL.

    Args:
        y_true: cp.ndarray, targets in the first ``self.D`` columns; its column
            count is used as the number of distributional parameters.
        y_pred: cp.ndarray, raw predictions, one column per parameter.
        requires_grad: bool, whether the parameter tensors track gradients.

    Returns:
        predt, nll: the list of per-parameter column tensors (before the
        response functions are applied) and the summed negative log-likelihood.
    """

    ###
    # Initialize
    ###
    n_param = y_true.shape[1]
    n_target = self.D
    rank = self.r

    ###
    # Target
    ###
    target = torch.as_tensor(y_true[:, :n_target], device="cuda", dtype=self.dtype).reshape(-1, n_target)

    ###
    # Parameters
    ###
    predt = []
    for col in range(n_param):
        raw = y_pred[:, col]
        # NaN predictions are replaced by the mean of the non-NaN values of that column.
        filled = np.nan_to_num(raw, nan=float(np.nanmean(raw)))
        column = torch.tensor(filled, device="cuda", requires_grad=requires_grad, dtype=self.dtype)
        predt.append(column.reshape(-1, 1))

    # Location
    predt_location = torch.cat(predt[:n_target], dim=1)

    # Low Rank Factor, reshaped to (n_obs, n_target, rank)
    predt_covfactor = torch.cat(predt[n_target:(n_param - n_target)], dim=1).reshape(-1, n_target, rank)

    # Low Rank Diagonal (must be positive)
    predt_covdiag = torch.cat([exp_fn(raw_diag) for raw_diag in predt[-n_target:]], dim=1)

    ###
    # NLL
    ###
    dist_fit = LowRankMultivariateNormal(
        loc=predt_location,
        cov_factor=predt_covfactor,
        cov_diag=predt_covdiag,
        validate_args=False,
    )
    nll = -torch.nansum(dist_fit.log_prob(target))

    return predt, nll
260 |
261 |
262 |
263 |
def predict(self,
            model,
            X_test: np.array,
            pred_type: str = "parameters",
            n_samples: int = 100
            ):
    """
    Predict function.

    model:
        Instance of pyboostlss
    X_test: np.array
        Test data features
    pred_type: str
        Specifies what is to be predicted:
            "samples": draws n_samples from the predicted response distribution. Output shape is (n_samples, n_obs, n_target)
            "parameters": returns the predicted distributional parameters.
    n_samples: int
        If pred_type="samples" specifies how many samples are drawn from the predicted response distribution.

    Returns
    -------
    pd.DataFrame with the predicted locations, standard deviations and correlations if pred_type="parameters",
    np.ndarray with the drawn samples if pred_type="samples".

    Raises
    ------
    ValueError if pred_type is neither "parameters" nor "samples".
    """

    # Fail early instead of silently returning None for an unknown prediction type.
    if pred_type not in ("parameters", "samples"):
        raise ValueError(
            f"Invalid pred_type {pred_type!r}; expected 'parameters' or 'samples'."
        )

    n_obs = X_test.shape[0]
    n_target = self.D
    rank = self.r
    n_param = self.n_dist_param([n_target, rank])
    param_dict = self.create_param_dict(n_target)

    # Predicted parameters: apply each parameter's response function to its column
    params_predt = torch.tensor(model.predict(X_test), device="cuda")
    params_predt = [response_fun(params_predt[:, i]).reshape(-1, 1)
                    for i, response_fun in enumerate(param_dict.values())]

    # Location
    predt_location = torch.cat(params_predt[:n_target], axis=1)

    # Low Rank Factor, reshaped to (n_obs, n_target, rank)
    predt_covfactor = torch.cat(params_predt[n_target:(n_param - n_target)], axis=1).reshape(-1, n_target, rank)

    # Low Rank Diagonal
    predt_covdiag = torch.cat(params_predt[-n_target:], axis=1)

    # Predicted Distribution
    mvn_lra_pred = LowRankMultivariateNormal(loc=predt_location,
                                             cov_factor=predt_covfactor,
                                             cov_diag=predt_covdiag,
                                             validate_args=False)

    if pred_type == "samples":
        # Sampling needs none of the summary tables below, so skip the
        # per-observation correlation loop. The seed is fixed for reproducibility.
        torch.manual_seed(123)
        y_samples = mvn_lra_pred.sample((n_samples,)).cpu().detach().numpy()
        return y_samples

    dist_params = self.param_names(n_target)
    n_rho = self.rho_dim(n_target)

    # Location
    predt_location_df = pd.DataFrame(predt_location.cpu().detach().numpy())
    predt_location_df.columns = [param for param in dist_params if "location_" in param]

    # Sigma
    predt_sigma_df = pd.DataFrame(mvn_lra_pred.stddev.cpu().detach().numpy())
    predt_sigma_df.columns = [param for param in dist_params if "scale_" in param]

    # Rho: one row of lower-triangular correlations per observation
    cov_mat = mvn_lra_pred.covariance_matrix
    predt_rho = torch.cat([calc_corr(cov_mat[i]).reshape(-1, n_rho) for i in range(n_obs)], axis=0)
    predt_rho_df = pd.DataFrame(predt_rho.cpu().detach().numpy())
    predt_rho_df.columns = [param for param in dist_params if "rho_" in param]

    # Output DataFrame
    params_df = pd.concat([predt_location_df, predt_sigma_df, predt_rho_df], axis=1)

    return params_df
337 |
--------------------------------------------------------------------------------
/pyboostlss/distributions/MVT.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from pyro.distributions import MultivariateStudentT
3 | from scipy.stats import t
4 | import cupy as cp
5 | import numpy as np
6 | import pandas as pd
7 | from pyboostlss.utils import *
8 |
9 |
10 |
11 | ########################################################################################################################
12 | ############################################### Multivariate Student-T ##########################################
13 | ########################################################################################################################
14 |
class MVT:
    r"""Multivariate Student-T Distribution Class, where covariance matrix \Sigma is estimated via Cholesky-decomposition
    \Sigma = LL`.
    """

    def __init__(self, D: int):
        self.D = D  # specifies target dimension

    def initialize(self, y_true: cp.ndarray, n_target: int) -> cp.ndarray:
        """Calculate the starting values for each distributional parameter.

        y_true: cp.ndarray
            Data from which starting values are calculated; targets are the first n_target columns.
        n_target: int
            Number of target variables

        Returns a 1-d array ordered as [df, locations, lower-triangular entries].
        """
        # Target
        target = y_true[:, :n_target]

        # Fitted Student-T Parameters (scipy's t.fit returns df first, then loc and scale)
        student_param = cp.array([t.fit(cp.asnumpy(target[:, i])) for i in range(n_target)])

        # Df
        df_init = (cp.log(2) + cp.log(cp.median(student_param[:, 0]))).reshape(-1,)

        # Location
        loc_init = cp.mean(target, axis=0)

        # Tril: Cholesky factor of the sample covariance. The diagonal is stored
        # on the log scale because the scale_ parameters are mapped through exp_fn.
        tril_indices = cp.asarray(np.tril_indices(n_target))
        tril_init = cp.cov(target, rowvar=False)
        tril_init = cp.linalg.cholesky(tril_init)
        cp.fill_diagonal(tril_init, cp.log(cp.diagonal(tril_init)))
        tril_init = tril_init[tril_indices[0], tril_indices[1]]

        start_values = cp.concatenate([df_init, loc_init, tril_init])

        return start_values

    def n_dist_param(self, n_targets: int) -> int:
        """Infers the number of distributional parameters from target dimension:
        one df, n_targets locations and n_targets*(n_targets+1)/2 lower-triangular entries.
        """
        n_param = int(1 + ((n_targets * (n_targets + 3)) / 2))

        return n_param

    def target_append(self, y_true: np.ndarray, n_param: int) -> np.ndarray:
        """Pads the target with columns of ones so that it has n_param columns.
        """
        n_obs = y_true.shape[0]
        n_target = y_true.shape[1]
        n_fill = n_param - n_target
        np_fill = np.ones((n_obs, n_fill))
        y_append = np.concatenate([y_true, np_fill], axis=1)

        return y_append

    def tril_dim(self, n_target: int) -> int:
        """Infers the number of lower diagonal elements from number of targets.
        """
        n_tril = int((n_target * (n_target + 1)) / 2)

        return n_tril

    def rho_dim(self, n_target: int) -> int:
        """Infers the number of correlations from number of targets.
        """
        n_rho = int((n_target * (n_target - 1)) / 2)
        return n_rho

    def create_param_dict(self, n_target, tril_indices):
        """Dictionary that holds the names of all distributional parameters and their corresponding
        response functions, ordered as df, location_*, then the lower-triangular entries.
        """
        # Df
        param_dict = {"df": exp_fn_df}

        # Location
        param_dict.update({"location_" + str(i + 1): identity_fn for i in range(n_target)})

        # Tril (shared with create_tril_dict so both stay in sync)
        param_dict.update(self.create_tril_dict(n_target, tril_indices))

        return param_dict

    def create_tril_dict(self, n_target, tril_indices):
        """Dictionary that holds the names of the lower-triangular parameters and their corresponding
        response functions: scale_<i> (exp) on the diagonal, rho_<row><col> (identity) off the diagonal.
        Indices in the names are 1-based.
        """
        n_tril = self.tril_dim(n_target)

        # Shift to 1-based indices for naming
        rows, cols = tril_indices.detach().numpy() + 1

        tril_dict = {}
        for i in range(n_tril):
            row, col = rows[i], cols[i]
            if row == col:
                tril_dict["scale_" + str(col)] = exp_fn
            else:
                tril_dict["rho_" + str(row) + str(col)] = identity_fn

        return tril_dict

    def get_params_nll(self, y_true: cp.ndarray, y_pred: cp.ndarray, requires_grad=False) -> torch.tensor:
        """ Returns estimated parameters and nll.

        Args:
            y_true: cp.ndarray, Input target variables; its column count is used as the number of parameters
            y_pred: cp.ndarray, predictions, one column per distributional parameter
            requires_grad: bool(), Whether or not tensor requires gradient for automatic differentiation

        Returns:
            predt, nll
        """

        ###
        # Initialize
        ###
        n_obs = y_true.shape[0]
        n_param = y_true.shape[1]
        n_target = self.D
        tril_indices = torch.tril_indices(row=n_target, col=n_target, offset=0)
        tril_param_dict = self.create_tril_dict(n_target, tril_indices)

        ###
        # Target
        ###
        target = torch.as_tensor(y_true[:, :n_target], device="cuda").reshape(-1, n_target)

        ###
        # Parameters
        ###
        predt = [torch.tensor(y_pred[:, i].reshape(-1, 1), device="cuda", requires_grad=requires_grad)
                 for i in range(n_param)]

        # Df
        predt_df = exp_fn_df(predt[0]).reshape(-1,)

        # Location
        predt_location = torch.concat(predt[1:(n_target + 1)], axis=1)

        # Tril: apply the response functions, then scatter into (n_obs, n_target, n_target)
        tril_predt = predt[(n_target + 1):]
        tril_predt = [response_fun(tril_predt[i]) for i, response_fun in enumerate(tril_param_dict.values())]
        tril_predt = torch.concat(tril_predt, axis=1)
        predt_tril = torch.zeros(n_obs, n_target, n_target, dtype=tril_predt.dtype, device="cuda")
        predt_tril[:, tril_indices[0], tril_indices[1]] = tril_predt

        ###
        # NLL
        ###
        dist_fit = MultivariateStudentT(predt_df, predt_location, predt_tril)
        nll = -torch.nansum(dist_fit.log_prob(target))

        return predt, nll

    def predict(self,
                model,
                X_test: np.array,
                pred_type: str = "parameters",
                n_samples: int = 100
                ):
        """
        Predict function.

        model:
            Instance of pyboostlss
        X_test: np.array
            Test data features
        pred_type: str
            Specifies what is to be predicted:
                "samples": draws n_samples from the predicted response distribution. Output shape is (n_samples, n_obs, n_target)
                "parameters": returns the predicted distributional parameters.
        n_samples: int
            If pred_type="samples" specifies how many samples are drawn from the predicted response distribution.

        Returns
        -------
        pd.DataFrame with the predicted df, locations, standard deviations and correlations if pred_type="parameters",
        np.ndarray with the drawn samples if pred_type="samples".

        Raises
        ------
        ValueError if pred_type is neither "parameters" nor "samples".
        """

        # Fail early instead of silently returning None for an unknown prediction type.
        if pred_type not in ("parameters", "samples"):
            raise ValueError(
                f"Invalid pred_type {pred_type!r}; expected 'parameters' or 'samples'."
            )

        n_obs = X_test.shape[0]
        n_target = self.D
        n_tril = self.tril_dim(n_target)
        tril_indices = torch.tril_indices(row=n_target, col=n_target, offset=0)
        param_dict = self.create_param_dict(n_target, tril_indices)

        # Predicted parameters: apply each parameter's response function to its column
        params_predt = torch.tensor(model.predict(X_test), device="cuda")
        params_predt = [response_fun(params_predt[:, i]).reshape(-1, 1)
                        for i, response_fun in enumerate(param_dict.values())]

        # Df
        predt_df = params_predt[0].reshape(-1,)

        # Location
        predt_location = torch.cat(params_predt[1:(n_target + 1)], axis=1)

        # Tril
        tril_predt = torch.cat(params_predt[(n_target + 1):], axis=1).reshape(-1, n_tril)
        predt_tril = torch.zeros(n_obs, n_target, n_target, dtype=tril_predt.dtype, device="cuda")
        predt_tril[:, tril_indices[0], tril_indices[1]] = tril_predt

        # Predicted Distribution
        mvt_pred = MultivariateStudentT(predt_df, predt_location, predt_tril)

        if pred_type == "samples":
            # Sampling needs none of the summary tables below, so skip the
            # per-observation correlation loop. The seed is fixed for reproducibility.
            torch.manual_seed(123)
            y_samples = mvt_pred.sample((n_samples,)).cpu().detach().numpy()
            return y_samples

        dist_params = list(param_dict.keys())
        n_rho = self.rho_dim(n_target)

        # Df
        predt_df_pd = pd.DataFrame(predt_df.cpu().detach().numpy())
        predt_df_pd.columns = ["df"]

        # Location
        predt_location_df = pd.DataFrame(predt_location.cpu().detach().numpy())
        predt_location_df.columns = [param for param in dist_params if "location_" in param]

        # Sigma
        predt_sigma_df = pd.DataFrame(mvt_pred.stddev.cpu().detach().numpy())
        predt_sigma_df.columns = [param for param in dist_params if "scale_" in param]

        # Rho: one row of lower-triangular correlations per observation
        cov_mat = mvt_pred.covariance_matrix
        predt_rho = torch.cat([calc_corr(cov_mat[i]).reshape(-1, n_rho) for i in range(n_obs)], axis=0)
        predt_rho_df = pd.DataFrame(predt_rho.cpu().detach().numpy())
        predt_rho_df.columns = [param for param in dist_params if "rho_" in param]

        # Output DataFrame
        predt_params = pd.concat([predt_df_pd, predt_location_df, predt_sigma_df, predt_rho_df], axis=1)

        return predt_params
286 |
287 |
--------------------------------------------------------------------------------
/pyboostlss/distributions/__init__.py:
--------------------------------------------------------------------------------
1 | """Py-BoostLSS - An extension of Py-Boost to probabilistic modelling"""
2 |
3 | from pyboostlss.distributions.distribution_loss_metric import *
4 | from pyboostlss.distributions.MVN import *
5 | from pyboostlss.distributions.MVN_LRA import *
6 | from pyboostlss.distributions.MVT import *
7 | from pyboostlss.distributions.DIRICHLET import *
8 |
9 |
--------------------------------------------------------------------------------
/pyboostlss/distributions/distribution_loss_metric.py:
--------------------------------------------------------------------------------
1 | import cupy as cp
2 | from pyboostlss.utils import *
3 | from py_boost.gpu.losses import Loss, Metric
4 |
5 |
6 |
class Distribution_Metric(Metric):
    """Evaluation metric that scores predictions by the negative log-likelihood
    of the wrapped distribution.
    """

    alias = "NLL-score"

    def __init__(self, dist):
        self.dist = dist

    def error(self, y_true, y_pred):
        """Compute the negative log-likelihood for the given predictions.

        Args:
            y_true: cp.array, targets
            y_pred: cp.array, predictions
        Returns:
            the NLL returned by ``dist.get_params_nll``, converted with ``cp.asarray``
        """
        nll = self.dist.get_params_nll(y_true, y_pred)[1]

        return cp.asarray(nll)

    def compare(self, v0, v1):
        """Return True when metric value v0 is better (i.e. smaller) than v1, False otherwise.
        """
        return v0 < v1

    def __call__(self, y_true, y_pred, sample_weight=None):
        """Evaluate the metric.

        Args:
            y_true: cp.array, targets
            y_pred: cp.array, predictions
            sample_weight: None or cp.ndarray, accepted for interface compatibility but not used
        Returns:
            metric value as computed by ``error``
        """
        return self.error(y_true, y_pred)
53 |
54 |
55 |
56 |
class Distribution_Loss(Loss):
    """Loss that derives gradients, hessians and starting values from the
    negative log-likelihood of the wrapped distribution.
    """

    def __init__(self, dist):
        self.dist = dist

    def get_grad_hess(self, y_true, y_pred):
        """Calculate gradients and hessians of the NLL.

        Args:
            y_true: cp.array, targets
            y_pred: cp.array, predictions
        Returns:
            grad, hess as returned by ``get_derivs``
        """
        # Parameters and NLL, with gradient tracking switched on
        params, nll = self.dist.get_params_nll(y_true, y_pred, requires_grad=True)

        # Derivatives
        gradients, hessians = get_derivs(nll, params)

        return gradients, hessians

    def base_score(self, y_true):
        """Initialize the parameter estimates via the distribution's ``initialize``.

        Args:
            y_true: cp.array, targets
        Returns:
            base margins
        """
        dist = self.dist

        # Distributions exposing a rank ``r`` expect [D, r] instead of the plain D.
        target_spec = [dist.D, dist.r] if hasattr(dist, "r") else dist.D

        return dist.initialize(y_true, target_spec)
--------------------------------------------------------------------------------
/pyboostlss/model.py:
--------------------------------------------------------------------------------
1 | from pyboostlss.distributions.distribution_loss_metric import *
2 | from pyboostlss.utils import *
3 | from py_boost import SketchBoost
4 |
5 | import optuna
6 | from optuna.samplers import TPESampler
7 |
class PyBoostLSS:
    """
    Py-BoostLSS model class. Currently only supports SketchBoost algorithm.

    """

    def __init__(self, dist):
        self.dist = dist             # pyboostlss.distributions class. Specifies distribution

    def _target_spec(self):
        """Return the target specification the distribution helpers expect: [D, r] when the
        distribution exposes a rank ``r``, otherwise D.
        """
        if hasattr(self.dist, "r"):
            return [self.dist.D, self.dist.r]
        return self.dist.D

    def _append_target(self, y):
        """Return ``y`` padded by the distribution's ``target_append`` to the number of distributional parameters."""
        return self.dist.target_append(y, self.dist.n_dist_param(self._target_spec()))

    def train(self,
              dtrain=None,
              eval_sets=None,
              ntrees=100,
              lr=0.05,
              min_gain_to_split=0,
              lambda_l2=1,
              gd_steps=1,
              max_depth=6,
              min_data_in_leaf=10,
              colsample=1.,
              subsample=1.,

              quantization='Quantile',
              quant_sample=2000000,
              max_bin=256,
              min_data_in_bin=3,

              es=100,
              seed=123,
              verbose=10,

              sketch_outputs=1,
              sketch_method="proj",
              use_hess=True,

              callbacks=None,
              sketch_params=None):

        """Train a pyboostlss model with given parameters.

        Parameters
        ----------
        dtrain: dict, Dataset used for training of the form {'X': X_train, 'y': y_train}
        eval_sets: list used to evaluate model during training, e.g., [{'X': X_train, 'y': y_train}].
            The passed dictionaries are not modified; padded copies are handed to the booster.
        ntrees: int, maximum number of trees
        lr: float, learning rate
        min_gain_to_split: float >=0, minimal gain to split
        lambda_l2: float > 0, l2 leaf regularization
        gd_steps: int > 0, number of gradient steps
        max_depth: int > 0, maximum tree depth. Setting it to large values (>12) may cause OOM for wide datasets
        min_data_in_leaf: int, minimal leaf size. Note - for some loss fn leaf size is approximated
            with hessian values to speed up training
        colsample: float or Callable, subsample of columns to construct trees or callable - custom sampling
        subsample: float or Callable, subsample of rows to construct trees or callable - custom sampling
        quantization: str or Quantizer, method for quantization. One of 'Quantile', 'Uniform',
            'Uniquant' or custom implementation
        quant_sample: int, subsample to quantize features
        max_bin: int in [2, 256] maximum number of bins to quantize features
        min_data_in_bin: int in [2, 256] minimal bin size. NOTE: currently ignored
        es: int, early stopping rounds. If 0, no early stopping
        seed: int, random state
        verbose: int, verbosity freq
        sketch_outputs: int, number of outputs to keep
        sketch_method: str, name of the sketching strategy. Currently the following options are available: "topk", "rand", "proj".
        use_hess: bool, use hessians in multioutput training
        callbacks: list of Callback, callbacks to customize training are passed here
        sketch_params: dict, optional kwargs for sketching strategy

        Returns
        -------
        The fitted SketchBoost model.
        """

        bstLSS_init = SketchBoost(loss=Distribution_Loss(self.dist),
                                  metric=Distribution_Metric(self.dist),
                                  ntrees=ntrees,
                                  lr=lr,
                                  min_gain_to_split=min_gain_to_split,
                                  lambda_l2=lambda_l2,
                                  gd_steps=gd_steps,
                                  max_depth=max_depth,
                                  min_data_in_leaf=min_data_in_leaf,
                                  colsample=colsample,
                                  subsample=subsample,

                                  quantization=quantization,
                                  quant_sample=quant_sample,
                                  max_bin=max_bin,
                                  min_data_in_bin=min_data_in_bin,

                                  es=es,
                                  seed=seed,
                                  verbose=verbose,

                                  sketch_outputs=sketch_outputs,
                                  sketch_method=sketch_method,
                                  use_hess=use_hess,

                                  callbacks=callbacks,
                                  sketch_params=sketch_params
                                  )

        # Append Target
        y_train_append = self._append_target(dtrain["y"])

        if eval_sets is not None:
            # Pad every evaluation set (not only the first one) and build new
            # dictionaries so the caller's eval_sets are left untouched.
            eval_sets_append = [{**eval_set, "y": self._append_target(eval_set["y"])}
                                for eval_set in eval_sets]
        else:
            eval_sets_append = None

        bstLSS_train = bstLSS_init.fit(dtrain["X"], y_train_append, eval_sets=eval_sets_append)

        return bstLSS_train

    def hyper_opt(self,
                  params=None,
                  dtrain=None,
                  eval_sets=None,
                  ntrees=100,
                  lr=0.05,
                  min_gain_to_split=0,
                  lambda_l2=1,
                  gd_steps=1,
                  max_depth=6,
                  min_data_in_leaf=10,
                  colsample=1.,
                  subsample=1.,

                  quantization='Quantile',
                  quant_sample=2000000,
                  max_bin=256,
                  min_data_in_bin=3,

                  es=100,
                  seed=123,
                  hp_seed=None,
                  verbose=int(1e04),

                  sketch_outputs=1,
                  sketch_method="proj",
                  use_hess=True,

                  callbacks=None,
                  sketch_params=None,

                  max_minutes=120,
                  n_trials=None,
                  study_name=None,
                  silence=False
                  ):

        """Function to tune hyper-parameters using Optuna.

        Parameters
        ----------
        params: dict, tunable hyper-parameters and their ranges. Must contain (low, high) entries for
            "lr", "max_depth", "sketch_outputs", "lambda_l2", "colsample", "subsample" and "min_gain_to_split".
        dtrain: dict, Dataset used for training of the form {'X': X_train, 'y': y_train}
        eval_sets: list used to evaluate model during training, e.g., [{'X': X_train, 'y': y_train}].
            The first entry is used to score each trial.
        ntrees: int, maximum number of trees
        lr: float, learning rate
        min_gain_to_split: float >=0, minimal gain to split
        lambda_l2: float > 0, l2 leaf regularization
        gd_steps: int > 0, number of gradient steps
        max_depth: int > 0, maximum tree depth. Setting it to large values (>12) may cause OOM for wide datasets
        min_data_in_leaf: int, minimal leaf size. Note - for some loss fn leaf size is approximated
            with hessian values to speed up training
        colsample: float or Callable, subsample of columns to construct trees or callable - custom sampling
        subsample: float or Callable, subsample of rows to construct trees or callable - custom sampling
        quantization: str or Quantizer, method for quantization. One of 'Quantile', 'Uniform',
            'Uniquant' or custom implementation
        quant_sample: int, subsample to quantize features
        max_bin: int in [2, 256] maximum number of bins to quantize features
        min_data_in_bin: int in [2, 256] minimal bin size. NOTE: currently ignored
        es: int, early stopping rounds. If 0, no early stopping
        seed: int, random state
        hp_seed: int, Random state for random number generator used in the Bayesian hyper-parameter search
        verbose: int, verbosity freq
        sketch_outputs: int, number of outputs to keep
        sketch_method: str, name of the sketching strategy. Currently the following options are available: "topk", "rand", "proj".
        use_hess: bool, use hessians in multioutput training
        callbacks: list of Callback, callbacks to customize training are passed here
        sketch_params: dict, optional kwargs for sketching strategy
        max_minutes: int, Time budget in minutes, i.e., stop study after the given number of minutes.
        n_trials: int, The number of trials. If this argument is set to None, there is no limitation on the number of trials.
        study_name : str, Name of the hyperparameter study.
        silence: bool, Controls the verbosity of the trial, i.e., user can silence the outputs of the trial.

        Returns
        -------
        opt_params : Dict() with optimal parameters, including the optimal number of rounds under "opt_rounds".

        Raises
        ------
        ValueError if params is None or eval_sets is missing/empty.
        """

        # Both are required by the objective; report that up front rather than inside a trial.
        if params is None:
            raise ValueError("params must be a dict with the ranges of the tunable hyper-parameters.")
        if not eval_sets:
            raise ValueError("eval_sets must contain at least one evaluation set.")

        def objective(trial):

            hyper_params = {
                name: suggest(name, params[name][0], params[name][1])
                for name, suggest in (
                    ("lr", trial.suggest_float),
                    ("max_depth", trial.suggest_int),
                    ("sketch_outputs", trial.suggest_int),
                    ("lambda_l2", trial.suggest_float),
                    ("colsample", trial.suggest_float),
                    ("subsample", trial.suggest_float),
                    ("min_gain_to_split", trial.suggest_float),
                )
            }

            bstLSS_cv = self.train(dtrain=dtrain,
                                   eval_sets=eval_sets,
                                   ntrees=ntrees,
                                   lr=hyper_params["lr"],
                                   min_gain_to_split=hyper_params["min_gain_to_split"],
                                   lambda_l2=hyper_params["lambda_l2"],
                                   gd_steps=gd_steps,
                                   max_depth=hyper_params["max_depth"],
                                   min_data_in_leaf=min_data_in_leaf,
                                   colsample=hyper_params["colsample"],
                                   subsample=hyper_params["subsample"],

                                   quantization=quantization,
                                   quant_sample=quant_sample,
                                   max_bin=max_bin,
                                   min_data_in_bin=min_data_in_bin,

                                   es=es,
                                   seed=seed,
                                   verbose=verbose,

                                   sketch_outputs=hyper_params["sketch_outputs"],
                                   sketch_method=sketch_method,
                                   use_hess=use_hess,

                                   callbacks=callbacks,
                                   sketch_params=sketch_params
                                   )

            # Add optimal rounds
            opt_rounds = bstLSS_cv.best_round
            trial.set_user_attr("opt_round", int(opt_rounds))

            # Extract the best score. get_params_nll reads the number of parameters
            # from the width of y_true, so the evaluation target is padded explicitly.
            y_true = self._append_target(eval_sets[0]["y"])
            y_pred = bstLSS_cv.predict(eval_sets[0]["X"])
            _, nll = self.dist.get_params_nll(y_true, y_pred)
            best_score = cp.asarray(nll)

            # Replace 0 value with a large penalty
            best_score = cp.where(best_score == -0.0, 1e08, best_score)

            return best_score

        if silence:
            optuna.logging.set_verbosity(optuna.logging.WARNING)

        if study_name is None:
            study_name = "Py-BoostLSS Hyper-Parameter Optimization"

        if hp_seed is not None:
            sampler = TPESampler(seed=hp_seed)
        else:
            sampler = TPESampler()

        pruner = optuna.pruners.MedianPruner(n_startup_trials=10, n_warmup_steps=20)
        study = optuna.create_study(sampler=sampler, pruner=pruner, direction="minimize", study_name=study_name)
        study.optimize(objective, n_trials=n_trials, timeout=60 * max_minutes, show_progress_bar=True)

        print("\nHyper-Parameter Optimization successfully finished.")
        print(" Number of finished trials: ", len(study.trials))
        print(" Best trial:")
        opt_param = study.best_trial

        # Add optimal stopping round, stored on the best trial by the objective
        opt_params = dict(opt_param.params)
        opt_params["opt_rounds"] = int(opt_param.user_attrs["opt_round"])

        print(" Value: {}".format(opt_param.value))
        print(" Params: ")
        for key, value in opt_params.items():
            print(" {}: {}".format(key, value))

        return opt_params
303 |
--------------------------------------------------------------------------------
/pyboostlss/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.autograd import grad as autograd
3 | import cupy as cp
4 | import numpy as np
5 |
6 | ###
7 | # Response Functions
8 | ###
9 |
def identity_fn(predt: torch.tensor) -> torch.tensor:
    """Response function for unconstrained parameters: hands back ``predt`` itself, unchanged.
    """
    return predt
14 |
15 |
def exp_fn(predt: torch.tensor) -> torch.tensor:
    """Exp() response function used to ensure predt is strictly positive.

    NaN values produced by the exponential are replaced by the mean of the
    non-NaN values, and a small constant (1e-6) is added to keep the result
    away from zero. The result stays on the device and in the floating dtype
    of ``torch.exp(predt)``, so the function works for CPU and CUDA tensors.
    """
    predt_adj = torch.exp(predt)

    # The fill value is a plain float, i.e. it is treated as a constant by autograd.
    fill_value = float(torch.nanmean(predt_adj))

    # Adding a Python scalar keeps dtype and device of predt_adj (no hard-coded "cuda").
    predt_adj = torch.nan_to_num(predt_adj, nan=fill_value) + 1e-6

    return predt_adj
23 |
24 |
def exp_fn_df(predt: torch.tensor) -> torch.tensor:
    """Exp() response function for the StudentT df-parameter.

    Computes ``exp(predt) + 2`` so the result is larger than 2, replaces NaN
    values by the mean of the non-NaN values and adds a small constant (1e-6).
    The result stays on the device and in the floating dtype of
    ``torch.exp(predt)``, so the function works for CPU and CUDA tensors.
    """
    # Python scalars keep dtype and device of the input (no hard-coded "cuda").
    predt_adj = torch.exp(predt) + 2.0

    # The fill value is a plain float, i.e. it is treated as a constant by autograd.
    fill_value = float(torch.nanmean(predt_adj))
    predt_adj = torch.nan_to_num(predt_adj, nan=fill_value) + 1e-6

    return predt_adj
32 |
33 |
34 |
35 |
36 | ###
37 | # Autograd Function
38 | ###
def get_derivs(nll: torch.tensor, predt: torch.tensor) -> cp.ndarray:
    """Differentiate the NLL with respect to every predicted parameter.

    Args:
        nll: torch.tensor, calculated NLL
        predt: list of torch.tensor, predicted parameters

    Returns:
        grad, hess: cupy arrays obtained by concatenating the per-parameter
        results along axis 1.
    """
    # First-order derivatives; the graph is kept so they can be differentiated again.
    first_order = autograd(nll, inputs=predt, create_graph=True)

    # Second-order term per parameter: derivative of the summed i-th gradient
    # with respect to the i-th parameter tensor only.
    second_order = []
    for grad_i, param_i in zip(first_order, predt):
        second_order.append(autograd(grad_i.nansum(), inputs=param_i, retain_graph=True)[0])

    # Reshape
    grad = cp.asarray(torch.concat(first_order, axis=1).detach())
    hess = cp.asarray(torch.concat(second_order, axis=1).detach())

    return grad, hess
61 |
62 |
63 |
64 |
65 | ###
66 | # Misc
67 | ###
68 |
def response_dim(y_true: int) -> int:
    """Count the target columns of a padded target matrix.

    Despite the annotation, ``y_true`` is used as a two-dimensional array. A
    column counts as a target when its sum differs from the number of rows,
    i.e. when it is not one of the all-ones filler columns.
    """
    n_rows = y_true.shape[0]
    is_target = y_true.sum(axis=0) != n_rows

    return len(is_target[is_target])
78 |
79 |
def calc_corr(cov_mat: torch.tensor) -> torch.tensor:
    """Convert a covariance matrix into correlations and return the entries
    strictly below the diagonal as a 1-d tensor (row by row).
    """
    # Diagonal matrix of standard deviations and its inverse
    std_mat = torch.sqrt(torch.diag(torch.diag(cov_mat)))
    std_inv = torch.linalg.inv(std_mat)

    # corr = D^-1 * cov * D^-1
    corr_full = std_inv @ cov_mat @ std_inv

    lower_rows, lower_cols = np.tril_indices_from(corr_full, k=-1)

    return corr_full[lower_rows, lower_cols]
89 |
90 |
91 |
92 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages

# Read the long description with an explicit encoding (independent of the
# platform default) and close the file handle deterministically.
with open("README.md", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

setup(
    name="pyboostlss",
    version="0.1.0",
    description="Py-BoostLSS: An extension of Py-Boost to probabilistic modelling",
    long_description=long_description,
    long_description_content_type="text/markdown",
    author="Alexander März",
    author_email="alex.maerz@gmx.net",
    url="https://github.com/StatMixedML/Py-BoostLSS",
    license="Apache License 2.0",
    packages=find_packages(exclude=["tests"]),
    include_package_data=True,
    package_data={'': ['datasets/*.csv']},
    zip_safe=True,
    python_requires=">=3.8, <3.10",
    install_requires=[
        "py-boost~=0.3.0",
        "optuna~=3.0.3",
        "pyro-ppl~=1.8.3",
        "dirichlet~=0.9",
        "scikit-learn~=1.1.3",
        "numpy~=1.23.5",
        "pandas~=1.5.2",
        "plotnine~=0.10.1",
        "scipy~=1.8.1",
        "tqdm~=4.64.1",
        "matplotlib~=3.6.2",
        "ipywidgets~=8.0.2",
    ],
    test_suite="tests",
    tests_require=["flake8", "pytest"],
)
--------------------------------------------------------------------------------