├── .gitignore ├── LICENSE ├── README.md ├── examples ├── Dirichlet.ipynb ├── MVN (Cholesky).ipynb ├── MVN (LRA).ipynb └── MVT (Cholesky).ipynb ├── pyboostlss ├── __init__.py ├── datasets │ ├── __init__.py │ ├── arcticlake.csv │ ├── data_loader.py │ ├── sim_triv_gaussian.csv │ └── sim_triv_student.csv ├── distributions │ ├── DIRICHLET.py │ ├── MVN.py │ ├── MVN_LRA.py │ ├── MVT.py │ ├── __init__.py │ └── distribution_loss_metric.py ├── model.py └── utils.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Pycharm 2 | .idea/ 3 | dist/ 4 | latex_distributions 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | 9 | 10 | # Jupyter Notebook 11 | .ipynb_checkpoints 12 | 13 | # General 14 | Lib/ 15 | Scripts/ 16 | pyvenv.cfg -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Py-BoostLSS: An extension of Py-Boost to probabilistic modelling 2 | 3 | We present a probabilistic extension of the recently introduced [Py-Boost](https://github.com/sb-ai-lab/Py-Boost) approach and model all moments of a parametric multivariate distribution as functions of covariates. This allows us to create probabilistic predictions from which intervals and quantiles of interest can be derived. 4 | 5 | ## Motivation 6 | 7 | Existing implementations of Gradient Boosting Machines, such as [XGBoost](https://github.com/dmlc/xgboost) and [LightGBM](https://github.com/microsoft/LightGBM), are mostly designed for single-target regression tasks. While efficient for low to medium target-dimensions, the computational cost of estimating them becomes prohibitive in high-dimensional settings. 
8 | 9 | As an example, consider modelling a multivariate Gaussian distribution with `D=100` target variables, where the covariance matrix is approximated using the Cholesky-Decomposition. Modelling all conditional moments (i.e., means, standard-deviations and all pairwise correlations) requires estimation of `D(D + 3)/2 = 5,150` parameters. Because most GBM implementations are based on a *one vs. all estimation strategy*, where a separate tree is grown for each parameter, estimating this many parameters for a large dataset can become computationally extremely expensive. 10 | 11 | The recently introduced [Py-Boost](https://github.com/sb-ai-lab/Py-Boost) approach provides a more runtime efficient GBM implementation, making it a good candidate for estimating high-dimensional target variables in a probabilistic setting. Borrowing from the original paper [SketchBoost: Fast Gradient Boosted Decision Tree for Multioutput Problems](https://openreview.net/forum?id=WSxarC8t-T), the following figure illustrates the runtime-efficiency of the Py-Boost model. 12 | 13 |

14 | 15 |

16 | 17 | Even though the original implementation of Py-Boost also supports estimation of univariate responses, Py-BoostLSS focuses on multi-target probabilistic regression settings. For univariate probabilistic GBMs, we refer to our implementations of [XGBoostLSS](https://github.com/StatMixedML/XGBoostLSS) and [LightGBMLSS](https://github.com/StatMixedML/LightGBMLSS). 18 | 19 | ## Installation 20 | 21 | Since Py-BoostLSS is entirely GPU-based, we first need to install the corresponding PyTorch and CuPy packages. If you are on Windows, it is preferable to install CuPy via conda. All other OS can use pip. You can check your cuda version with `nvcc --version`. 22 | 23 | ```python 24 | # CuPy (replace with your cuda version) 25 | # Windows only 26 | conda install -c conda-forge cupy cudatoolkit=11.x 27 | # Others 28 | pip install cupy-cuda11x 29 | 30 | # PyTorch (replace with your cuda version) 31 | pip3 install torch --extra-index-url https://download.pytorch.org/whl/cu11x 32 | ``` 33 | 34 | Next, you can install Py-BoostLSS. 35 | 36 | ```python 37 | pip install git+https://github.com/StatMixedML/Py-BoostLSS.git 38 | ``` 39 | 40 | ## How to use 41 | We refer to the [examples section](https://github.com/StatMixedML/Py-BoostLSS/tree/main/examples) for example notebooks. 42 | 43 | ## Available Distributions 44 | Py-BoostLSS currently supports the following distributions. More distributions will follow soon. 45 | 46 | | Distribution | Usage |Type | Support 47 | | :----------------------------------------------------------: |:--------------: |:--------------------------------: | :-----------------------: | 48 | | Multivariate Normal
(Cholesky) | `MVN()` | Continuous
(Multivariate) | $y \in (-\infty,\infty)$ | 49 | | Multivariate Normal
(Low-Rank Approximation) | `MVN_LRA()` | Continuous
(Multivariate) | $y \in (-\infty,\infty)$ | 50 | | Multivariate Student-T
(Cholesky) | `MVT()` | Continuous
(Multivariate) | $y \in (-\infty,\infty)$ | 51 | | Dirichlet | `DIRICHLET()` | Continuous
(Multivariate) | $y \in [0,1]$ | 52 | 53 | 54 | 60 | 61 | 62 | 63 | 64 | ## Feedback 65 | Please provide feedback on how to improve Py-BoostLSS, or if you request additional distributions to be implemented, by opening a new issue or via the discussion section. 66 | 67 | 68 | ## Acknowledgements 69 | 70 | The implementation of Py-BoostLSS relies on the following resources: 71 | 72 | - [Py-boost: a research tool for exploring GBDTs](https://github.com/sb-ai-lab/Py-Boost) 73 | - [SketchBoost: Fast Gradient Boosted Decision Tree for Multioutput Problems](https://openreview.net/forum?id=WSxarC8t-T) 74 | 75 | We genuinely thank the original authors [Anton Vakhrushev](https://www.kaggle.com/btbpanda) and [Leonid Iosipoi](http://iosipoi.com/) for making their work publicly available. 76 | 77 | ## Reference Paper 78 | [![Arxiv link](https://img.shields.io/badge/arXiv-Multi%20Target%20XGBoostLSS%20Regression-color=brightgreen)](https://arxiv.org/abs/2210.06831)
79 | [![Arxiv link](https://img.shields.io/badge/arXiv-Distributional%20Gradient%20Boosting%20Machines-color=brightgreen)](https://arxiv.org/abs/2204.00778)
80 | [![Arxiv link](https://img.shields.io/badge/arXiv-XGBoostLSS%3A%20An%20extension%20of%20XGBoost%20to%20probabilistic%20forecasting-color=brightgreen)](https://arxiv.org/abs/1907.03178)
81 | 82 | 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /examples/Dirichlet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "22aca876-6dee-4cd3-a528-d2561a7ad301", 6 | "metadata": {}, 7 | "source": [ 8 | "
\n", 9 | "

Dirichlet Example

\n", 10 | "
\n", 11 | "\n", 12 | "
\n", 13 | "
" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "id": "c6cdf104-7c02-4888-968e-12b55187ef39", 19 | "metadata": {}, 20 | "source": [ 21 | "The Dirichlet distribution is commonly used for modelling non-negative compositional data, i.e., data that consist of sub-sets that are fractions of some total. Compositional data are typically represented as proportions or percentages summing to 100\\%, so that the Dirichlet extends the univariate beta-distribution to the multivariate case. Compositional data analysis (CoDa) is a branch of statistics that deals with multivariate observations carrying relative information and finds widespread use in ecology, economics or political science. As a result of the unit-sum constraint, models that use distributions designed for unconstrained data typically suffer from the problem of spurious correlation when applied to compositional data. " 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "id": "538d04c1-08b3-488b-984d-312bc9dad21a", 27 | "metadata": {}, 28 | "source": [ 29 | "In this example, we model and predict all parameters of a Dirichlet distribution with $Y_{D}=3$ target variables using the famous Arctic-Lake dataset. 
The density of the Dirichlet distribution with parameters $\\mathbf{\\alpha}_{\\mathbf{x}} = (\\alpha_{\\mathbf{x},1}, \\ldots, \\alpha_{\\mathbf{x},D}) \\in \\mathbb{R}^{D}_{+}$ with $\\sum^{D}_{d=1}y_{d}=1$ for all $y_{d}\\in \\left[0,1\\right]$ is given by" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "id": "247dc33d-ac22-4e51-b500-d4b61fae21bf", 35 | "metadata": {}, 36 | "source": [ 37 | "$$\n", 38 | "f\\big(\\mathbf{y}|\\mathbf{\\theta}_{\\mathbf{x}}\\big) = \\frac{1}{\\mathrm{B}(\\mathbf{\\alpha}_{\\mathbf{x}})} \\prod_{d=1}^{D}y_{d}^{\\alpha_{\\mathbf{x},d}-1}\n", 39 | "$$" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "id": "86229dd0-bccf-41a1-800f-18892f451a21", 45 | "metadata": {}, 46 | "source": [ 47 | "To ensure positivity, we use $\\exp(\\alpha_{\\mathbf{x},d})$ for all $d=1,\\ldots, D$. The estimated parameters have the interpretation of providing the probability of an event falling into category $d$, i.e., $\\mathbb{E}(y_{d}) = \\frac{\\alpha_{d}}{\\alpha_{0}}$, with $\\alpha_{0} = \\sum^{D}_{d=1}\\alpha_{d}$. For more details, we refer to our related paper **[März, Alexander (2022), *Multi-Target XGBoostLSS Regression*](https://arxiv.org/abs/2210.06831)**.\n", 48 | "\n", 49 | "
\n", 50 | "
" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "id": "d7b4eea0-96b6-482a-afc9-7ec867bf2193", 56 | "metadata": {}, 57 | "source": [ 58 | "# Imports" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 1, 64 | "id": "0de768b2-ef44-434d-89e4-402170de0eb9", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "import os\n", 69 | "import numpy as np\n", 70 | "import pandas as pd\n", 71 | "# Optional: set the device to run\n", 72 | "os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\"\n", 73 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n", 74 | "\n", 75 | "from pyboostlss.model import *\n", 76 | "from pyboostlss.distributions.DIRICHLET import *\n", 77 | "from pyboostlss.distributions.distribution_loss_metric import *\n", 78 | "from pyboostlss.utils import *\n", 79 | "from pyboostlss.datasets.data_loader import load_example_data\n", 80 | "\n", 81 | "import plotnine\n", 82 | "from plotnine import *\n", 83 | "plotnine.options.figure_size = (20, 10)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "id": "8e90e9e4-5902-4299-9236-d31af4e29b2b", 89 | "metadata": {}, 90 | "source": [ 91 | "# Specifiy distribution and initialize model" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 2, 97 | "id": "75224245-9b11-4730-a4c6-d7fe6b3a272c", 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "distribution = DIRICHLET(D=3) # Dirichlet distribution, where D specifies the number of target variables\n", 102 | "pyblss = PyBoostLSS(distribution) # Initializes model with specified distribution" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "id": "43c36cd5-11f4-4c03-8297-c2555924bd4a", 108 | "metadata": {}, 109 | "source": [ 110 | "# Data" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 3, 116 | "id": "78118dfc-6b05-4634-ae15-9fbf3d30b47b", 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "data_df = load_example_data(\"arcticlake.csv\")\n", 
121 | "\n", 122 | "# Create 80%, 10%, 10% split for train, validation and test \n", 123 | "train, validate, test = np.split(data_df.sample(frac=1,random_state=123), [int(0.8*len(data_df)), int(0.9*len(data_df))])\n", 124 | "\n", 125 | "# Train\n", 126 | "x_train = train[\"depth\"].values.reshape(-1,1)\n", 127 | "y_train = train.drop(columns=\"depth\").values\n", 128 | "dtrain = {\"X\": x_train, \"y\": y_train}\n", 129 | "\n", 130 | "# Validation\n", 131 | "x_eval = validate[\"depth\"].values.reshape(-1,1)\n", 132 | "y_eval = validate.drop(columns=\"depth\").values\n", 133 | "eval_sets = [{'X': x_eval, 'y': y_eval}] # Specifies eval_sets on which the model is evaluated on\n", 134 | "\n", 135 | "# Test\n", 136 | "x_test = test[\"depth\"].values.reshape(-1,1)\n", 137 | "y_test = test.drop(columns=\"depth\").values" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "id": "e742234a-ad21-4a32-aa80-08c59d318503", 143 | "metadata": {}, 144 | "source": [ 145 | "# Hyper-Parameter Optimization via Optuna" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 4, 151 | "id": "88d8403d-ea56-4d44-ba26-1f86632b7b78", 152 | "metadata": {}, 153 | "outputs": [ 154 | { 155 | "name": "stderr", 156 | "output_type": "stream", 157 | "text": [ 158 | "\u001b[32m[I 2022-12-08 11:33:16,505]\u001b[0m A new study created in memory with name: Py-BoostLSS Hyper-Parameter Optimization\u001b[0m\n", 159 | "C:\\Users\\Alexander\\.julia\\v0.6\\Conda\\deps\\usr\\envs\\pyboost\\lib\\site-packages\\optuna\\progress_bar.py:49: ExperimentalWarning: Progress bar is experimental (supported from v1.2.0). 
The interface can change in the future.\n" 160 | ] 161 | }, 162 | { 163 | "data": { 164 | "application/vnd.jupyter.widget-view+json": { 165 | "model_id": "08991189c4d8418e86d16f8f877de7a6", 166 | "version_major": 2, 167 | "version_minor": 0 168 | }, 169 | "text/plain": [ 170 | " 0%| | 0/10 [00:00\n", 337 | "\n", 350 | "\n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | "
alpha_1alpha_2alpha_3
08.9256085.9099831.225174
18.9256085.9099831.225174
21.24137310.1165544.594018
31.78219811.3567559.836135
48.9256085.9099831.225174
\n", 392 | "" 393 | ], 394 | "text/plain": [ 395 | " alpha_1 alpha_2 alpha_3\n", 396 | "0 8.925608 5.909983 1.225174\n", 397 | "1 8.925608 5.909983 1.225174\n", 398 | "2 1.241373 10.116554 4.594018\n", 399 | "3 1.782198 11.356755 9.836135\n", 400 | "4 8.925608 5.909983 1.225174" 401 | ] 402 | }, 403 | "execution_count": 6, 404 | "metadata": {}, 405 | "output_type": "execute_result" 406 | } 407 | ], 408 | "source": [ 409 | "# Predicts transformed parameters of the specified distribution. \n", 410 | "predt_params = distribution.predict(model=pyboostlss_model,\n", 411 | " X_test=x_train, # Here we use the train dataset to later infer the partial dependence of the parameters on x\n", 412 | " pred_type=\"parameters\")\n", 413 | "predt_params.head()" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "id": "6dc11354-6241-4f53-b860-e12747660b75", 419 | "metadata": {}, 420 | "source": [ 421 | "Please note that the predicted parameters are not yet on the response-scale. Yet we can transform them easily as described above: the estimated parameters have the interpretation of providing the probability of an event falling into category $d$, i.e., $\\mathbb{E}(y_{d}) = \\frac{\\alpha_{d}}{\\alpha_{0}}$, with $\\alpha_{0} = \\sum^{D}_{d=1}\\alpha_{d}$." 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": 10, 427 | "id": "9f21270c-a600-4104-8c99-60bcfa0e802f", 428 | "metadata": {}, 429 | "outputs": [ 430 | { 431 | "data": { 432 | "text/html": [ 433 | "
\n", 434 | "\n", 447 | "\n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | "
sandsiltclay
00.5557400.3679760.076284
10.5557400.3679760.076284
20.0778200.6341890.287991
30.0775710.4943070.428122
40.5557400.3679760.076284
\n", 489 | "
" 490 | ], 491 | "text/plain": [ 492 | " sand silt clay\n", 493 | "0 0.555740 0.367976 0.076284\n", 494 | "1 0.555740 0.367976 0.076284\n", 495 | "2 0.077820 0.634189 0.287991\n", 496 | "3 0.077571 0.494307 0.428122\n", 497 | "4 0.555740 0.367976 0.076284" 498 | ] 499 | }, 500 | "execution_count": 10, 501 | "metadata": {}, 502 | "output_type": "execute_result" 503 | } 504 | ], 505 | "source": [ 506 | "# Transform to response scale\n", 507 | "predt_params_transf = predt_params / predt_params.sum(axis=1)\n", 508 | "predt_params_transf = predt_params.div(predt_params.sum(axis=1), axis=0)\n", 509 | "predt_params_transf.columns = data_df.iloc[:,:3].columns\n", 510 | "\n", 511 | "predt_params_transf.head()" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": 8, 517 | "id": "c7e1a565-b8a6-4864-861e-75a81d558528", 518 | "metadata": {}, 519 | "outputs": [ 520 | { 521 | "data": { 522 | "text/plain": [ 523 | "(10000, 4, 3)" 524 | ] 525 | }, 526 | "execution_count": 8, 527 | "metadata": {}, 528 | "output_type": "execute_result" 529 | } 530 | ], 531 | "source": [ 532 | "# Draws random samples from the predicted distribution\n", 533 | "torch.manual_seed(123)\n", 534 | "n_samples = 10000\n", 535 | "predt_samples = distribution.predict(model=pyboostlss_model,\n", 536 | " X_test=x_test, \n", 537 | " pred_type=\"samples\", \n", 538 | " n_samples=n_samples)\n", 539 | "\n", 540 | "predt_samples.shape # Output-shape is (n_samples, n_obs, n_target)" 541 | ] 542 | }, 543 | { 544 | "cell_type": "markdown", 545 | "id": "6d32e5fd-ada9-41a8-896e-b446e606a3d2", 546 | "metadata": {}, 547 | "source": [ 548 | "# Partial Dependence Plot" 549 | ] 550 | }, 551 | { 552 | "cell_type": "markdown", 553 | "id": "178932c6-32b8-4b45-a264-db6844e62d7d", 554 | "metadata": {}, 555 | "source": [ 556 | "Since there is only one covariate in the dataset, we can infer the effect of depth (in meters) on the sediment composition using a scatter-smooth estimate. 
The figure shows that with increasing depth, the relative frequency of sand decreases while the proportion of silt and clay increases." 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": 9, 562 | "id": "2e79d446-6617-46ce-ae8b-c5209f26abb8", 563 | "metadata": {}, 564 | "outputs": [ 565 | { 566 | "data": { 567 | "image/png": "\n", 568 | "text/plain": [ 569 | "
" 570 | ] 571 | }, 572 | "metadata": {}, 573 | "output_type": "display_data" 574 | }, 575 | { 576 | "name": "stdout", 577 | "output_type": "stream", 578 | "text": [ 579 | "\n" 580 | ] 581 | } 582 | ], 583 | "source": [ 584 | "plot_df = predt_params_transf\n", 585 | "plot_df[\"depth\"] = x_train\n", 586 | "\n", 587 | "plot_df = pd.melt(plot_df,\n", 588 | " id_vars=\"depth\")\n", 589 | "\n", 590 | "param_plot = (ggplot(plot_df,\n", 591 | " aes(x=\"depth\",\n", 592 | " y=\"value\")) + \n", 593 | " geom_point() + \n", 594 | " geom_smooth(span=0.7, se=False) + \n", 595 | " facet_wrap(\"variable\",\n", 596 | " scales=\"free\") + \n", 597 | " theme_bw() + \n", 598 | " theme(subplots_adjust={\"wspace\": 0.25}) + \n", 599 | " labs(title = \"Partial-Dependence-Plot of Dirichlet-Parameters estimated via Py-BoostLSS\\n\",\n", 600 | " y=\"Parameter Estimate\",\n", 601 | " x=\"Depth (in meters)\") \n", 602 | " )\n", 603 | "\n", 604 | "print(param_plot)" 605 | ] 606 | } 607 | ], 608 | "metadata": { 609 | "kernelspec": { 610 | "display_name": "Python 3 (ipykernel)", 611 | "language": "python", 612 | "name": "python3" 613 | }, 614 | "language_info": { 615 | "codemirror_mode": { 616 | "name": "ipython", 617 | "version": 3 618 | }, 619 | "file_extension": ".py", 620 | "mimetype": "text/x-python", 621 | "name": "python", 622 | "nbconvert_exporter": "python", 623 | "pygments_lexer": "ipython3", 624 | "version": "3.9.15" 625 | } 626 | }, 627 | "nbformat": 4, 628 | "nbformat_minor": 5 629 | } 630 | -------------------------------------------------------------------------------- /pyboostlss/__init__.py: -------------------------------------------------------------------------------- 1 | """Py-BoostLSS - An extension of Py-Boost to probabilistic modelling""" -------------------------------------------------------------------------------- /pyboostlss/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | """Py-BoostLSS - An extension of 
Py-Boost to probabilistic modelling""" -------------------------------------------------------------------------------- /pyboostlss/datasets/arcticlake.csv: -------------------------------------------------------------------------------- 1 | sand,silt,clay,depth 2 | 0.775,0.195,0.03,10.4 3 | 0.719,0.249,0.032,11.7 4 | 0.507,0.361,0.132,12.8 5 | 0.52357,0.41023,0.0662,13 6 | 0.7,0.265,0.035,15.7 7 | 0.665,0.322,0.013,16.3 8 | 0.431,0.553,0.016,18 9 | 0.534,0.368,0.098,18.7 10 | 0.155,0.544,0.301,20.7 11 | 0.317,0.415,0.268,22.1 12 | 0.657,0.278,0.065,22.4 13 | 0.704,0.29,0.006,24.4 14 | 0.174,0.536,0.29,25.8 15 | 0.106,0.698,0.196,32.5 16 | 0.382,0.431,0.187,33.6 17 | 0.108,0.527,0.365,36.8 18 | 0.184,0.507,0.309,37.8 19 | 0.046,0.474,0.48,36.9 20 | 0.156,0.504,0.34,42.2 21 | 0.319,0.451,0.23,47 22 | 0.095,0.535,0.37,47.1 23 | 0.171,0.48,0.349,48.4 24 | 0.105,0.554,0.341,49.4 25 | 0.04776,0.54428,0.40796,49.5 26 | 0.026,0.452,0.522,59.2 27 | 0.114,0.527,0.359,60.1 28 | 0.067,0.469,0.464,61.7 29 | 0.069,0.497,0.434,62.4 30 | 0.04,0.449,0.511,69.3 31 | 0.07407,0.51652,0.40941,73.6 32 | 0.048,0.495,0.457,74.4 33 | 0.045,0.485,0.47,78.5 34 | 0.066,0.521,0.413,82.9 35 | 0.06707,0.47347,0.45946,87.7 36 | 0.07407,0.45646,0.46947,88.1 37 | 0.06,0.489,0.451,90.4 38 | 0.063,0.538,0.399,90.6 39 | 0.025,0.48,0.495,97.7 40 | 0.02,0.478,0.502,103.7 41 | -------------------------------------------------------------------------------- /pyboostlss/datasets/data_loader.py: -------------------------------------------------------------------------------- 1 | import pkg_resources 2 | import pandas as pd 3 | 4 | 5 | def load_example_data(dta_name: str) -> pd.DataFrame: 6 | """Returns dataframe of a sepecified simulated dataset example. 
class DIRICHLET:
    """Dirichlet distribution class with one ``alpha`` parameter per target dimension.

    Raw model outputs are mapped to the distributional parameters through the
    response functions listed in ``create_param_dict`` (``exp_fn`` for every
    ``alpha_i``); the start values are accordingly computed on the log scale.
    """

    def __init__(self, D:int):
        self.D = D      # specifies target dimension

    def initialize(self, y_true: cp.ndarray, n_target: int) -> cp.ndarray:
        """Calculate the starting values for each distributional parameter.

        y_true: cp.ndarray
            Data from which starting values are calculated.
        n_target: int
            Number of target variables (not used by this distribution).

        Returns the log of the unconditional Dirichlet maximum-likelihood
        estimate as a cupy array.
        """
        # dirichlet_mle works on host memory, so the targets are copied to numpy first.
        start_values = np.log(dirichlet_mle(cp.asnumpy(y_true)))

        return cp.array(start_values)

    def n_dist_param(self, n_targets: int) -> int:
        """Infer the number of distributional parameters from the target dimension.

        For the Dirichlet there is exactly one parameter per target.
        """
        return n_targets

    def target_append(self, y_true: np.ndarray, n_param: int) -> np.ndarray:
        """Return the target as a cupy array.

        Nothing is appended here because the number of parameters equals the
        number of targets; ``n_param`` is unused.
        """
        return cp.array(y_true)

    def create_param_dict(self, n_target):
        """Map each parameter name (``alpha_1`` ... ``alpha_<n_target>``) to its response function."""
        return {"alpha_" + str(i + 1): exp_fn for i in range(n_target)}

    def get_params_nll(self, y_true: cp.ndarray, y_pred: cp.ndarray, requires_grad=False) -> torch.tensor:
        """Return the raw per-parameter prediction tensors and the summed NLL.

        Args:
            y_true: cp.ndarray, Input target variables
            y_pred: cp.ndarray, predictions (one column per parameter)
            requires_grad: bool, Whether the per-parameter tensors require gradients
                for automatic differentiation

        Returns:
            predt, nll -- ``predt`` is a list with one (n_obs, 1) CUDA tensor per
            parameter (before the response function is applied), ``nll`` is the
            negative sum of the log-probabilities, ignoring NaN entries.
        """
        n_target = n_param = self.D
        param_dict = self.create_param_dict(n_target)

        # Target
        target = torch.as_tensor(y_true, device="cuda").reshape(-1, n_target)

        # Parameters: one tensor per column so that gradients can be taken
        # with respect to each distributional parameter separately.
        predt = [
            torch.tensor(
                y_pred[:, i].reshape(-1, 1), device="cuda", requires_grad=requires_grad
            ) for i in range(n_param)
        ]

        # Alpha
        predt_alpha = torch.concat(
            [response_fun(raw) for response_fun, raw in zip(param_dict.values(), predt)],
            axis=1
        )

        # NLL
        dist_fit = Dirichlet(predt_alpha)
        nll = -torch.nansum(dist_fit.log_prob(target))

        return predt, nll

    def predict(self,
                model,
                X_test: np.array,
                pred_type: str = "parameters",
                n_samples: int = 100
                ):
        """
        Predict function.

        model:
            Instance of pyboostlss
        X_test: np.array
            Test data features
        pred_type: str
            Specifies what is to be predicted:
                "samples": draws n_samples from the predicted response distribution. Output shape is (n_samples, n_obs, n_target)
                "parameters": returns the predicted distributional parameters.
        n_samples: int
            If pred_type="samples" specifies how many samples are drawn from the predicted response distribution.

        Returns
        -------
        pd.DataFrame of predicted parameters if pred_type="parameters",
        np.ndarray of samples if pred_type="samples".

        Raises
        ------
        ValueError if pred_type is neither "parameters" nor "samples".
        """
        # Fail early instead of running the model and silently returning None.
        if pred_type not in ("parameters", "samples"):
            raise ValueError(
                f"pred_type must be 'parameters' or 'samples', got {pred_type!r}"
            )

        n_target = self.D
        param_dict = self.create_param_dict(n_target)

        # Predicted parameters
        params_predt = torch.tensor(model.predict(X_test), device="cuda")
        params_predt = torch.cat(
            [response_fun(params_predt[:, i]).reshape(-1, 1)
             for i, response_fun in enumerate(param_dict.values())],
            axis=1)

        if pred_type == "samples":
            # Re-seeds torch's global RNG so repeated calls give the same draws.
            torch.manual_seed(123)
            dirichlet_pred = Dirichlet(params_predt)
            return dirichlet_pred.sample((n_samples,)).cpu().detach().numpy()

        return pd.DataFrame(params_predt.cpu().detach().numpy(), columns=list(param_dict.keys()))
class MVN:
    r"""Multivariate Normal Distribution Class, where covariance matrix \Sigma is estimated via Cholesky-decomposition
    \Sigma = LL`.

    The distributional parameters are ordered as the D location parameters
    followed by the D*(D+1)/2 entries of the lower-triangular factor L, in the
    order produced by ``tril_indices``.
    """

    def __init__(self,D:int):
        self.D = D      # specifies target dimension

    def initialize(self, y_true: cp.ndarray, n_target: int) -> cp.ndarray:
        """Calculate the starting values for each distributional parameter.

        y_true: cp.ndarray
            Data from which starting values are calculated.
        n_target: int
            Number of target variables

        Returns the column means of the targets followed by the
        lower-triangular entries of the Cholesky factor of the sample
        covariance, with the diagonal stored on the log scale.
        """
        # Indices
        tril_indices = cp.asarray(np.tril_indices(n_target))

        # Target
        target = y_true[:, :n_target]

        # Location
        loc_init = cp.mean(target, axis=0)

        # Tril
        tril_init = cp.cov(target, rowvar=False)
        tril_init = cp.linalg.cholesky(tril_init)
        # Diagonal entries are logged because the "scale_" parameters are
        # passed through exp_fn when mapped back (see create_tril_dict).
        cp.fill_diagonal(tril_init, cp.log(cp.diagonal(tril_init)))
        tril_init = tril_init[tril_indices[0], tril_indices[1]]
        start_values = cp.concatenate([loc_init, tril_init])

        return start_values

    def n_dist_param(self, n_targets: int) -> int:
        """Infer the number of distributional parameters from the target dimension.

        D locations plus D*(D+1)/2 lower-triangular entries, i.e. D*(D+3)/2.
        """
        n_param = int((n_targets*(n_targets + 3))/2)

        return n_param

    def target_append(self, y_true: np.ndarray, n_param: int) -> np.ndarray:
        """Pad the target with columns of ones so it has ``n_param`` columns."""
        n_obs = y_true.shape[0]
        n_target = y_true.shape[1]
        n_fill = n_param - n_target
        np_fill = np.ones((n_obs, n_fill))
        y_append = np.concatenate([y_true, np_fill], axis=1)

        return y_append

    def tril_dim(self, n_target: int) -> int:
        """Infer the number of lower-triangular elements (diagonal included) from the number of targets."""
        n_tril = int((n_target * (n_target + 1)) / 2)

        return n_tril

    def rho_dim(self, n_target: int) -> int:
        """Infer the number of correlations from the number of targets."""
        n_rho = int((n_target * (n_target - 1)) / 2)
        return n_rho

    def create_param_dict(self, n_target, tril_indices):
        """Map every distributional parameter name to its response function.

        Location entries come first, followed by the entries of
        ``create_tril_dict`` in the same order.
        """
        # Location
        param_dict = {"location_" + str(i + 1): identity_fn for i in range(n_target)}

        # Tril (shared with create_tril_dict so both stay in sync)
        param_dict.update(self.create_tril_dict(n_target, tril_indices))

        return param_dict

    def create_tril_dict(self, n_target, tril_indices):
        """Map the names of the lower-triangular parameters to their response functions.

        Diagonal entries are named ``scale_<col>`` and use ``exp_fn``;
        off-diagonal entries are named ``rho_<row><col>`` and use
        ``identity_fn``. Indices in the names are 1-based.
        """
        n_tril = self.tril_dim(n_target)

        # Shift to 1-based indices for naming.
        tril_idx = (tril_indices.detach().numpy()) + 1

        tril_dict = {}
        for i in range(n_tril):
            row, col = tril_idx[0, i], tril_idx[1, i]
            if row == col:
                tril_dict["scale_" + str(col)] = exp_fn
            else:
                tril_dict["rho_" + str(row) + str(col)] = identity_fn

        return tril_dict

    def get_params_nll(self, y_true: cp.ndarray, y_pred: cp.ndarray, requires_grad=False) -> torch.tensor:
        """Return the raw per-parameter prediction tensors and the summed NLL.

        Args:
            y_true: cp.ndarray, Input target variables. Its number of columns is
                used as the number of parameters; only the first D columns are
                read as targets.
            y_pred: cp.ndarray, predictions (one column per parameter)
            requires_grad: bool, Whether the per-parameter tensors require gradients
                for automatic differentiation

        Returns:
            predt, nll -- ``predt`` is a list with one (n_obs, 1) CUDA tensor per
            parameter (before the response function is applied), ``nll`` is the
            negative sum of the log-probabilities, ignoring NaN entries.
        """
        # Initialize
        n_obs = y_true.shape[0]
        n_param = y_true.shape[1]
        n_target = self.D
        tril_indices = torch.tril_indices(row=n_target, col=n_target, offset=0)
        tril_param_dict = self.create_tril_dict(n_target, tril_indices)

        # Target
        target = torch.as_tensor(y_true[:, :n_target], device="cuda").reshape(-1, n_target)

        # Parameters
        predt = [
            torch.tensor(y_pred[:, i].reshape(-1, 1), device="cuda", requires_grad=requires_grad)
            for i in range(n_param)
        ]

        # Location
        predt_location = torch.concat(predt[:n_target], axis=1)

        # Tril: apply the response functions, then scatter the flat entries
        # into a batch of lower-triangular matrices.
        tril_predt = [
            response_fun(raw)
            for response_fun, raw in zip(tril_param_dict.values(), predt[n_target:])
        ]
        tril_predt = torch.concat(tril_predt, axis=1)
        predt_tril = torch.zeros(n_obs, n_target, n_target, dtype=tril_predt.dtype, device="cuda")
        predt_tril[:, tril_indices[0], tril_indices[1]] = tril_predt

        # NLL
        dist_fit = MultivariateNormal(loc=predt_location, scale_tril=predt_tril)
        nll = -torch.nansum(dist_fit.log_prob(target))

        return predt, nll

    def predict(self,
                model,
                X_test: np.array,
                pred_type: str = "parameters",
                n_samples: int = 100
                ):
        """
        Predict function.

        model:
            Instance of pyboostlss
        X_test: np.array
            Test data features
        pred_type: str
            Specifies what is to be predicted:
                "samples": draws n_samples from the predicted response distribution. Output shape is (n_samples, n_obs, n_target)
                "parameters": returns the predicted distributional parameters.
        n_samples: int
            If pred_type="samples" specifies how many samples are drawn from the predicted response distribution.

        Returns
        -------
        pd.DataFrame with location, scale (standard deviation) and rho columns
        if pred_type="parameters", np.ndarray of samples if pred_type="samples".

        Raises
        ------
        ValueError if pred_type is neither "parameters" nor "samples".
        """
        # Fail early instead of running the model and silently returning None.
        if pred_type not in ("parameters", "samples"):
            raise ValueError(
                f"pred_type must be 'parameters' or 'samples', got {pred_type!r}"
            )

        n_target = self.D
        n_tril = self.tril_dim(n_target)
        n_rho = self.rho_dim(n_target)
        tril_indices = torch.tril_indices(row=n_target, col=n_target, offset=0)
        param_dict = self.create_param_dict(n_target, tril_indices)
        dist_params = list(param_dict.keys())

        # Predicted parameters
        params_predt = torch.tensor(model.predict(X_test), device="cuda")
        params_predt = [
            response_fun(params_predt[:, i]).reshape(-1, 1)
            for i, response_fun in enumerate(param_dict.values())
        ]

        # Location
        predt_location = torch.cat(params_predt[:n_target], axis=1)

        # Tril
        n_obs = X_test.shape[0]
        tril_predt = torch.cat(params_predt[n_target:], axis=1).reshape(-1, n_tril)
        predt_tril = torch.zeros(n_obs, n_target, n_target, dtype=tril_predt.dtype, device="cuda")
        predt_tril[:, tril_indices[0], tril_indices[1]] = tril_predt

        # Predicted Distribution
        mvn_pred = MultivariateNormal(loc=predt_location, scale_tril=predt_tril)

        if pred_type == "samples":
            # Re-seeds torch's global RNG so repeated calls give the same draws.
            torch.manual_seed(123)
            return mvn_pred.sample((n_samples,)).cpu().detach().numpy()

        # The parameter table is only built when it is actually requested.
        predt_location_df = pd.DataFrame(predt_location.cpu().detach().numpy())
        predt_location_df.columns = [param for param in dist_params if "location_" in param]

        # Sigma
        predt_sigma_df = pd.DataFrame(mvn_pred.stddev.cpu().detach().numpy())
        predt_sigma_df.columns = [param for param in dist_params if "scale_" in param]

        # Rho
        cov_mat = mvn_pred.covariance_matrix
        predt_rho = torch.cat([calc_corr(cov_mat[i]).reshape(-1, n_rho) for i in range(n_obs)], axis=0)
        predt_rho_df = pd.DataFrame(predt_rho.cpu().detach().numpy())
        predt_rho_df.columns = [param for param in dist_params if "rho_" in param]

        # Output DataFrame
        return pd.concat([predt_location_df, predt_sigma_df, predt_rho_df], axis=1)
35 | """ 36 | 37 | torch.manual_seed(123) 38 | 39 | n_param = self.n_dist_param(n_target) 40 | n_target = self.D 41 | param_init = torch.ones(1, n_param, device="cuda", dtype=self.dtype) 42 | param_init = torch.nn.init.xavier_uniform_(param_init) 43 | param_init.requires_grad=True 44 | y_true_tens = torch.tensor(y_true[:,:n_target], device="cuda", dtype=self.dtype) 45 | 46 | 47 | def nll_init(y_true: cp.ndarray, y_pred: cp.ndarray, requires_grad=True) -> torch.tensor: 48 | 49 | n_target = self.D 50 | n_param = self.n_dist_param([self.D, self.r]) 51 | rank = self.r 52 | 53 | ### 54 | # Target 55 | ### 56 | target = y_true[:,:n_target] 57 | 58 | ### 59 | # Parameters 60 | ### 61 | predt = [y_pred[:, i].reshape(-1,1) for i in range(n_param)] 62 | 63 | # Location 64 | predt_location = torch.concat(predt[:n_target],axis=1) 65 | 66 | # Low Rank Factor 67 | predt_covfactor = torch.concat(predt[n_target:(n_param-n_target)], axis=1).reshape(-1, n_target, rank) # (n_obs, n_target, rank) 68 | 69 | # Low Rank Diagonal (must be positive) 70 | predt_covdiag = predt[-n_target:] 71 | predt_covdiag = [exp_fn(predt_covdiag[i]) for i in range(len(predt_covdiag))] 72 | predt_covdiag = torch.concat(predt_covdiag, axis=1) 73 | 74 | ### 75 | # NLL 76 | ### 77 | dist_fit = LowRankMultivariateNormal(loc=predt_location, cov_factor=predt_covfactor, cov_diag=predt_covdiag, validate_args=False) 78 | nll = -torch.nansum(dist_fit.log_prob(target)) 79 | 80 | return nll 81 | 82 | 83 | def closure(): 84 | 85 | lbfgs.zero_grad() 86 | objective = nll_init(y_true=y_true_tens, y_pred=param_init) 87 | objective.backward() 88 | 89 | return objective 90 | 91 | 92 | 93 | lbfgs = optim.LBFGS(params=[param_init], 94 | lr=1e-03, 95 | history_size=10, 96 | max_iter=4, 97 | line_search_fn="strong_wolfe") 98 | 99 | 100 | for i in range(20): 101 | lbfgs.step(closure) 102 | 103 | start_values = cp.array(lbfgs.param_groups[0]["params"][0].cpu().detach()).reshape(-1,) 104 | 105 | return start_values 106 | 107 | 108 
| 109 | 110 | # def initialize(self, y_true: cp.ndarray, n_target: list) -> cp.ndarray: 111 | # """ Function that initializes each distributional parameter with ones. Compared to the LBFGS, this is more runtime efficient. 112 | # y_true: cp.ndarray 113 | # Data from which starting values are calculated. 114 | # n_target: list 115 | # List that holds number of targets and rank-parameter. 116 | # """ 117 | # n_param = self.n_dist_param(n_target) 118 | # start_values = cp.ones((n_param,)) 119 | 120 | # return start_values 121 | 122 | 123 | 124 | 125 | def n_dist_param(self, n_targets: list) -> int: 126 | """Number of distributional parameters. 127 | """ 128 | n_param = int(n_targets[0]*(2+n_targets[1])) 129 | 130 | return n_param 131 | 132 | 133 | def target_append(self, y_true: np.ndarray, n_param: int) -> np.ndarray: 134 | """Function that appends target to the number of specified parameters 135 | """ 136 | n_obs = y_true.shape[0] 137 | n_target = y_true.shape[1] 138 | n_fill = n_param - n_target 139 | np_fill = np.ones((n_obs, n_fill)) 140 | y_append = np.concatenate([y_true, np_fill],axis=1) 141 | 142 | return y_append 143 | 144 | 145 | 146 | def tril_dim(self, n_target: int) -> int: 147 | """Infers the number of lower diagonal elements from number of targets. 148 | """ 149 | n_tril = int((n_target * (n_target + 1)) / 2) 150 | 151 | return n_tril 152 | 153 | 154 | def rho_dim(self, n_target: int) -> int: 155 | """Infers the number of correlations from number of targets. 156 | """ 157 | n_rho = int((n_target * (n_target - 1)) / 2) 158 | return n_rho 159 | 160 | 161 | def create_param_dict(self, n_target): 162 | """ Dictionary that holds the name of distributional parameter and their corresponding response functions. 
163 | """ 164 | n_target = self.D 165 | rank = self.r 166 | 167 | # Location 168 | param_dict = {"location_" + str(i+1): identity_fn for i in range(n_target)} 169 | 170 | # Low Rank Factor 171 | lrf_dict = {"lrf_" + str(i+1): identity_fn for i in range(n_target*rank)} 172 | param_dict.update(lrf_dict) 173 | 174 | # Low Rank Diagonal 175 | lrd_dict = {"lrd_" + str(i+1): exp_fn for i in range(n_target)} 176 | param_dict.update(lrd_dict) 177 | 178 | return param_dict 179 | 180 | 181 | def param_names(self, n_target): 182 | """ List that holds the name of distributional parameter. 183 | """ 184 | 185 | n_tril = self.tril_dim(n_target) 186 | 187 | # Location 188 | param_names = ["location_" + str(i+1) for i in range(n_target)] 189 | 190 | # Tril 191 | tril_indices = torch.tril_indices(row=n_target, col=n_target, offset=0) 192 | tril_idx = (tril_indices.detach().numpy()) + 1 193 | tril_indices_row = tril_idx[0] 194 | tril_indices_col = tril_idx[1] 195 | tril_diag = tril_idx[0] == tril_idx[1] 196 | 197 | for i in range(n_tril): 198 | if tril_diag[i] == True: 199 | param_names.append("scale_" + str(tril_idx[:,i][1])) 200 | else: 201 | param_names.append("rho_" + str(tril_idx[:,i][0]) + str(tril_idx[:,i][1])) 202 | 203 | return param_names 204 | 205 | 206 | 207 | 208 | def get_params_nll(self, y_true: cp.ndarray, y_pred: cp.ndarray, requires_grad=False) -> torch.tensor: 209 | """ Returns estimated parameters and nll. 
210 | 211 | Args: 212 | y_true: cp.ndarray, Input target variables 213 | y_pred: cp.ndarray, predictions 214 | requires_grad: bool(), Whether or not tensor requires gradient for automatic differentiation 215 | 216 | Returns: 217 | predt, nll 218 | """ 219 | 220 | ### 221 | # Initialize 222 | ### 223 | n_obs = y_true.shape[0] 224 | n_param = y_true.shape[1] 225 | n_target = self.D 226 | rank = self.r 227 | param_dict = self.create_param_dict(n_target) 228 | 229 | 230 | ### 231 | # Target 232 | ### 233 | target = torch.as_tensor(y_true[:,:n_target], device="cuda", dtype=self.dtype).reshape(-1, n_target) 234 | 235 | 236 | ### 237 | # Parameters 238 | ### 239 | predt = [torch.tensor(np.nan_to_num(y_pred[:, i], nan=float(np.nanmean(y_pred[:, i]))), device="cuda", requires_grad=requires_grad, dtype=self.dtype).reshape(-1,1) for i in range(n_param)] 240 | 241 | # Location 242 | predt_location = torch.concat(predt[:n_target],axis=1) 243 | 244 | # Low Rank Factor 245 | predt_covfactor = torch.concat(predt[n_target:(n_param-n_target)], axis=1).reshape(-1, n_target, rank) # (n_obs, n_target, rank) 246 | 247 | # Low Rank Diagonal (must be positive) 248 | predt_covdiag = predt[-n_target:] 249 | predt_covdiag = [exp_fn(predt_covdiag[i]) for i in range(len(predt_covdiag))] 250 | predt_covdiag = torch.concat(predt_covdiag, axis=1) 251 | 252 | 253 | ### 254 | # NLL 255 | ### 256 | dist_fit = LowRankMultivariateNormal(loc=predt_location, cov_factor=predt_covfactor, cov_diag=predt_covdiag, validate_args=False) 257 | nll = -torch.nansum(dist_fit.log_prob(target)) 258 | 259 | return predt, nll 260 | 261 | 262 | 263 | 264 | def predict(self, 265 | model, 266 | X_test: np.array, 267 | pred_type: str = "parameters", 268 | n_samples: int = 100 269 | ): 270 | """ 271 | Predict function. 
272 | 273 | model: 274 | Instance of pyboostlss 275 | X_test: np.array 276 | Test data features 277 | pred_type: str 278 | Specifies what is to be predicted: 279 | "samples": draws n_samples from the predicted response distribution. Output shape is (n_samples, n_obs, n_target) 280 | "parameters": returns the predicted distributional parameters. 281 | n_samples: int 282 | If pred_type="response" specifies how many samples are drawn from the predicted response distribution. 283 | Returns 284 | ------- 285 | pd.DataFrame with n_samples drawn from predicted response distribution. 286 | 287 | """ 288 | 289 | n_obs = X_test.shape[0] 290 | n_target = self.D 291 | rank = self.r 292 | n_param = self.n_dist_param([n_target, rank]) 293 | n_rho = self.rho_dim(n_target) 294 | param_dict = self.create_param_dict(n_target) 295 | dist_params = self.param_names(n_target) 296 | 297 | # Predicted parameters 298 | params_predt = torch.tensor(model.predict(X_test), device="cuda") 299 | params_predt = [response_fun(params_predt[:, i]).reshape(-1,1) for i, (dist_param, response_fun) in enumerate(param_dict.items())] 300 | 301 | 302 | # Location 303 | predt_location = torch.cat(params_predt[:n_target],axis=1) 304 | predt_location_df = pd.DataFrame(predt_location.cpu().detach().numpy()) 305 | predt_location_df.columns = [param for param in dist_params if "location_" in param] 306 | 307 | # Low Rank Factor 308 | predt_covfactor = torch.cat(params_predt[n_target:(n_param-n_target)], axis=1).reshape(-1, n_target, rank) # (n_obs, n_target, rank) 309 | 310 | # Low Rank Diagonal 311 | predt_covdiag = torch.cat(params_predt[-n_target:], axis=1) 312 | 313 | # Predicted Distribution 314 | mvn_lra_pred = LowRankMultivariateNormal(loc=predt_location, cov_factor=predt_covfactor, cov_diag=predt_covdiag, validate_args=False) 315 | 316 | # Sigma 317 | predt_sigma = mvn_lra_pred.stddev.cpu().detach().numpy() 318 | predt_sigma_df = pd.DataFrame(predt_sigma) 319 | predt_sigma_df.columns = [param for param 
in dist_params if "scale_" in param] 320 | 321 | # Rho 322 | cov_mat = mvn_lra_pred.covariance_matrix 323 | predt_rho = torch.cat([calc_corr(cov_mat[i]).reshape(-1, n_rho) for i in range(n_obs)],axis=0) 324 | predt_rho_df = pd.DataFrame(predt_rho.cpu().detach().numpy()) 325 | predt_rho_df.columns = [param for param in dist_params if "rho_" in param] 326 | 327 | # Output DataFrame 328 | params_df = pd.concat([predt_location_df, predt_sigma_df, predt_rho_df], axis=1) 329 | 330 | if pred_type == "parameters": 331 | return params_df 332 | 333 | elif pred_type == "samples": 334 | torch.manual_seed(123) 335 | y_samples = mvn_lra_pred.sample((n_samples,)).cpu().detach().numpy() 336 | return y_samples 337 | -------------------------------------------------------------------------------- /pyboostlss/distributions/MVT.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from pyro.distributions import MultivariateStudentT 3 | from scipy.stats import t 4 | import cupy as cp 5 | import numpy as np 6 | import pandas as pd 7 | from pyboostlss.utils import * 8 | 9 | 10 | 11 | ######################################################################################################################## 12 | ############################################### Multivariate Student-T ########################################## 13 | ######################################################################################################################## 14 | 15 | class MVT: 16 | """Multivariate Student-T Distribution Class, where covariance matrix \Sigma is estimated via Cholesky-decomposition 17 | \Sigma = LL`. 18 | """ 19 | 20 | def __init__(self,D:int): 21 | self.D = D # specifies target dimension 22 | 23 | 24 | def initialize(self, y_true: cp.ndarray, n_target: int) -> cp.ndarray: 25 | """ Function that calculates the starting values, for each distributional parameter individually. 
26 | y_true: cp.ndarray 27 | Data from which starting values are calculated. 28 | n_target: ndarray 29 | Number of target variables 30 | """ 31 | # Target 32 | target = y_true[:,:n_target] 33 | 34 | # Fitted Student-T Parameters 35 | student_param = cp.array([t.fit(cp.asnumpy(target[:,i])) for i in range(n_target)]) 36 | 37 | # Df 38 | df_init = (cp.log(2) + cp.log(cp.median(student_param[:,0]))).reshape(-1,) 39 | 40 | # Location 41 | loc_init = cp.mean(target,axis=0) 42 | 43 | # Tril 44 | tril_indices = cp.asarray(np.tril_indices(n_target)) 45 | tril_init = cp.cov(target,rowvar=False) 46 | tril_init = cp.linalg.cholesky(tril_init) 47 | cp.fill_diagonal(tril_init, cp.log(cp.diagonal(tril_init))) 48 | tril_init = tril_init[tril_indices[0], tril_indices[1]] 49 | 50 | start_values = cp.concatenate([df_init, loc_init, tril_init]) 51 | 52 | return start_values 53 | 54 | 55 | 56 | def n_dist_param(self, n_targets: int) -> int: 57 | """Infers the number of distributional parameters from target dimension. 58 | """ 59 | n_param = int(1 + ((n_targets*(n_targets + 3))/2)) 60 | 61 | return n_param 62 | 63 | 64 | def target_append(self, y_true: np.ndarray, n_param: int) -> np.ndarray: 65 | """Function that appends target to the number of specified parameters 66 | """ 67 | n_obs = y_true.shape[0] 68 | n_target = y_true.shape[1] 69 | n_fill = n_param - n_target 70 | np_fill = np.ones((n_obs, n_fill)) 71 | y_append = np.concatenate([y_true, np_fill],axis=1) 72 | 73 | return y_append 74 | 75 | 76 | def tril_dim(self, n_target: int) -> int: 77 | """Infers the number of lower diagonal elements from number of targets. 78 | """ 79 | n_tril = int((n_target * (n_target + 1)) / 2) 80 | 81 | return n_tril 82 | 83 | 84 | def rho_dim(self, n_target: int) -> int: 85 | """Infers the number of correlations from number of targets. 
86 | """ 87 | n_rho = int((n_target * (n_target - 1)) / 2) 88 | return n_rho 89 | 90 | 91 | 92 | def create_param_dict(self, n_target, tril_indices): 93 | """ Dictionary that holds the name of distributional parameter and their corresponding response functions. 94 | """ 95 | 96 | n_theta = self.n_dist_param(n_target) 97 | n_tril = self.tril_dim(n_target) 98 | 99 | # Df 100 | param_dict = {"df": exp_fn_df} 101 | 102 | # Location 103 | loc_dict = {"location_" + str(i+1): identity_fn for i in range(n_target)} 104 | 105 | param_dict.update(loc_dict) 106 | 107 | # Tril 108 | tril_idx = (tril_indices.detach().numpy()) + 1 109 | tril_indices_row = tril_idx[0] 110 | tril_indices_col = tril_idx[1] 111 | tril_diag = tril_idx[0] == tril_idx[1] 112 | 113 | tril_dict = {} 114 | 115 | for i in range(n_tril): 116 | if tril_diag[i] == True: 117 | tril_dict.update({"scale_" + str(tril_idx[:,i][1]): exp_fn}) 118 | else: 119 | tril_dict.update({"rho_" + str(tril_idx[:,i][0]) + str(tril_idx[:,i][1]): identity_fn}) 120 | 121 | param_dict.update(tril_dict) 122 | 123 | return param_dict 124 | 125 | 126 | def create_tril_dict(self, n_target, tril_indices): 127 | """ Dictionary that holds the name of distributional parameter and their corresponding response functions. 
128 | """ 129 | 130 | n_theta = self.n_dist_param(n_target) 131 | n_tril = self.tril_dim(n_target) 132 | 133 | # Tril 134 | tril_idx = (tril_indices.detach().numpy()) + 1 135 | tril_indices_row = tril_idx[0] 136 | tril_indices_col = tril_idx[1] 137 | tril_diag = tril_idx[0] == tril_idx[1] 138 | 139 | tril_dict = {} 140 | 141 | for i in range(n_tril): 142 | if tril_diag[i] == True: 143 | tril_dict.update({"scale_" + str(tril_idx[:,i][1]): exp_fn}) 144 | else: 145 | tril_dict.update({"rho_" + str(tril_idx[:,i][0]) + str(tril_idx[:,i][1]): identity_fn}) 146 | 147 | return tril_dict 148 | 149 | 150 | def get_params_nll(self, y_true: cp.ndarray, y_pred: cp.ndarray, requires_grad=False) -> torch.tensor: 151 | """ Returns estimated parameters and nll. 152 | 153 | Args: 154 | y_true: cp.ndarray, Input target variables 155 | y_pred: cp.ndarray, predictions 156 | requires_grad: bool(), Whether or not tensor requires gradient for automatic differentiation 157 | 158 | Returns: 159 | predt, nll 160 | """ 161 | 162 | ### 163 | # Initialize 164 | ### 165 | n_obs = y_true.shape[0] 166 | n_param = y_true.shape[1] 167 | n_target = self.D 168 | n_tril = self.tril_dim(n_target) 169 | tril_indices = torch.tril_indices(row=n_target, col=n_target, offset=0) 170 | param_dict = self.create_param_dict(n_target,tril_indices) 171 | tril_param_dict = self.create_tril_dict(n_target,tril_indices) 172 | 173 | 174 | ### 175 | # Target 176 | ### 177 | target = torch.as_tensor(y_true[:,:n_target], device="cuda").reshape(-1, n_target) 178 | 179 | 180 | 181 | ### 182 | # Parameters 183 | ### 184 | predt = [torch.tensor(y_pred[:,i].reshape(-1,1), device="cuda", requires_grad=requires_grad) for i in range(n_param)] 185 | 186 | # Df 187 | predt_df = exp_fn_df(predt[0]).reshape(-1,) 188 | 189 | # Location 190 | predt_location = torch.concat(predt[1:(n_target+1)],axis=1) 191 | 192 | # Tril 193 | tril_predt = predt[(n_target+1):] 194 | tril_predt = [response_fun(tril_predt[i]) for i, (dist_param, 
response_fun) in enumerate(tril_param_dict.items())] 195 | tril_predt = torch.concat(tril_predt,axis=1) 196 | predt_tril = torch.zeros(n_obs, n_target, n_target, dtype=tril_predt.dtype, device="cuda") 197 | predt_tril[:, tril_indices[0], tril_indices[1]] = tril_predt 198 | 199 | 200 | ### 201 | # NLL 202 | ### 203 | dist_fit = MultivariateStudentT(predt_df, predt_location, predt_tril) 204 | nll = -torch.nansum(dist_fit.log_prob(target)) 205 | 206 | return predt, nll 207 | 208 | 209 | def predict(self, 210 | model, 211 | X_test: np.array, 212 | pred_type: str = "parameters", 213 | n_samples: int = 100 214 | ): 215 | """ 216 | Predict function. 217 | 218 | model: 219 | Instance of pyboostlss 220 | X_test: np.array 221 | Test data features 222 | pred_type: str 223 | Specifies what is to be predicted: 224 | "samples": draws n_samples from the predicted response distribution. Output shape is (n_samples, n_obs, n_target) 225 | "parameters": returns the predicted distributional parameters. 226 | n_samples: int 227 | If pred_type="response" specifies how many samples are drawn from the predicted response distribution. 228 | Returns 229 | ------- 230 | pd.DataFrame with n_samples drawn from predicted response distribution. 
def predict(self,
            model,
            X_test: np.ndarray,
            pred_type: str = "parameters",
            n_samples: int = 100
            ):
    """
    Predict function.

    model:
        Instance of pyboostlss
    X_test: np.array
        Test data features
    pred_type: str
        Specifies what is to be predicted:
            "samples": draws n_samples from the predicted response distribution. Output shape is (n_samples, n_obs, n_target)
            "parameters": returns the predicted distributional parameters.
    n_samples: int
        If pred_type="samples" specifies how many samples are drawn from the predicted response distribution.

    Returns
    -------
    pd.DataFrame with the predicted parameters (df, locations, scales, correlations) if
    pred_type="parameters", or a np.ndarray with n_samples drawn from the predicted response
    distribution if pred_type="samples".

    Raises
    ------
    ValueError if pred_type is neither "parameters" nor "samples".
    """

    # Fail fast instead of silently returning None after doing all the work.
    if pred_type not in ("parameters", "samples"):
        raise ValueError(
            "pred_type must be 'parameters' or 'samples', got {!r}".format(pred_type)
        )

    n_target = self.D
    n_tril = self.tril_dim(n_target)
    n_rho = self.rho_dim(n_target)
    tril_indices = torch.tril_indices(row=n_target, col=n_target, offset=0)
    param_dict = self.create_param_dict(n_target, tril_indices)
    dist_params = list(param_dict.keys())

    # Predicted parameters: apply each parameter's response function to its column
    raw_predt = torch.tensor(model.predict(X_test), device="cuda")
    params_predt = [
        response_fun(raw_predt[:, i]).reshape(-1, 1)
        for i, response_fun in enumerate(param_dict.values())
    ]

    # Df
    predt_df = params_predt[0].reshape(-1,)

    # Location
    predt_location = torch.cat(params_predt[1:(n_target + 1)], dim=1)

    # Tril
    n_obs = X_test.shape[0]
    tril_predt = torch.cat(params_predt[(n_target + 1):], dim=1).reshape(-1, n_tril)
    predt_tril = torch.zeros(n_obs, n_target, n_target, dtype=tril_predt.dtype, device="cuda")
    predt_tril[:, tril_indices[0], tril_indices[1]] = tril_predt

    # Predicted Distribution
    mvt_pred = MultivariateStudentT(predt_df, predt_location, predt_tril)

    if pred_type == "samples":
        # Fixed global seed so that repeated calls return the same draws.
        torch.manual_seed(123)
        return mvt_pred.sample((n_samples,)).cpu().detach().numpy()

    # Df
    predt_df_pd = pd.DataFrame(predt_df.cpu().detach().numpy())
    predt_df_pd.columns = ["df"]

    # Location
    predt_location_df = pd.DataFrame(predt_location.cpu().detach().numpy())
    predt_location_df.columns = [param for param in dist_params if "location_" in param]

    # Sigma
    predt_sigma_df = pd.DataFrame(mvt_pred.stddev.cpu().detach().numpy())
    predt_sigma_df.columns = [param for param in dist_params if "scale_" in param]

    # Rho: strictly-lower-triangular correlations, one row per observation
    cov_mat = mvt_pred.covariance_matrix
    predt_rho = torch.cat([calc_corr(cov_mat[i]).reshape(-1, n_rho) for i in range(n_obs)], dim=0)
    predt_rho_df = pd.DataFrame(predt_rho.cpu().detach().numpy())
    predt_rho_df.columns = [param for param in dist_params if "rho_" in param]

    # Output DataFrame
    return pd.concat([predt_df_pd, predt_location_df, predt_sigma_df, predt_rho_df], axis=1)
class Distribution_Metric(Metric):
    """Evaluation metric that scores predictions by the distribution's NLL."""

    alias = "NLL-score"

    def __init__(self, dist):
        self.dist = dist

    def error(self, y_true, y_pred):
        """Error metric definition.
        Args:
            y_true: cp.array, targets
            y_pred: cp.array, predictions
        Returns:
            cp.ndarray holding the negative log-likelihood
        """
        nll = self.dist.get_params_nll(y_true, y_pred)[1]
        return cp.asarray(nll)

    def compare(self, v0, v1):
        """
        Return True if metric value v0 is better (smaller) than v1, False otherwise.
        """
        return v0 < v1

    def __call__(self, y_true, y_pred, sample_weight=None):
        """Full metric definition.
        Args:
            y_true: cp.array, targets
            y_pred: cp.array, predictions
            sample_weight: None or cp.ndarray, weights (accepted but not used)
        Returns:
            metric value as returned by ``error``
        """
        return self.error(y_true, y_pred)


class Distribution_Loss(Loss):
    """Loss whose gradients and hessians are derived from the distribution's NLL."""

    def __init__(self, dist):
        self.dist = dist

    def get_grad_hess(self, y_true, y_pred):
        """
        Defines how to calculate gradients and hessians for given loss.
        Args:
            y_true: cp.array, targets
            y_pred: cp.array, predictions
        Returns:
            grad, hess
        """
        # Parameters and NLL, with gradient tracking switched on
        predt, nll = self.dist.get_params_nll(y_true, y_pred, requires_grad=True)

        # Derivatives
        return get_derivs(nll, predt)

    def base_score(self, y_true):
        """
        Defines how parameter estimates are initialized.
        Args:
            y_true: cp.array, targets
        Returns:
            base_margins as returned by the distribution's ``initialize``
        """
        dist = self.dist
        # Distributions exposing an attribute ``r`` are initialized with [D, r]
        n_target = [dist.D, dist.r] if hasattr(dist, "r") else dist.D
        return dist.initialize(y_true, n_target)
class PyBoostLSS:
    """
    Py-BoostLSS model class. Currently only supports SketchBoost algorithm.

    """

    def __init__(self, dist):
        self.dist = dist  # pyboostlss.distributions class. Specifies distribution

    def _n_target(self):
        """Return the dimension argument passed to the distribution: [D, r] if it has ``r``, else D."""
        if hasattr(self.dist, "r"):
            return [self.dist.D, self.dist.r]
        return self.dist.D

    def _append_target(self, y):
        """Pass ``y`` through the distribution's ``target_append`` for its number of parameters."""
        return self.dist.target_append(y, self.dist.n_dist_param(self._n_target()))

    def train(self,
              dtrain=None,
              eval_sets=None,
              ntrees=100,
              lr=0.05,
              min_gain_to_split=0,
              lambda_l2=1,
              gd_steps=1,
              max_depth=6,
              min_data_in_leaf=10,
              colsample=1.,
              subsample=1.,

              quantization='Quantile',
              quant_sample=2000000,
              max_bin=256,
              min_data_in_bin=3,

              es=100,
              seed=123,
              verbose=10,

              sketch_outputs=1,
              sketch_method="proj",
              use_hess=True,

              callbacks=None,
              sketch_params=None):

        """Train a pyboostlss model with given parameters.

        Parameters
        ----------
        dtrain: dict, Dataset used for training of the form {'X': X_train, 'y': y_train}
        eval_sets: list used to evaluate model during training, e.g., [{'X': X_train, 'y': y_train}].
            The passed dicts are not modified.
        ntrees: int, maximum number of trees
        lr: float, learning rate
        min_gain_to_split: float >=0, minimal gain to split
        lambda_l2: float > 0, l2 leaf regularization
        gd_steps: int > 0, number of gradient steps
        max_depth: int > 0, maximum tree depth. Setting it to large values (>12) may cause OOM for wide datasets
        min_data_in_leaf: int, minimal leaf size. Note - for some loss fn leaf size is approximated
            with hessian values to speed up training
        colsample: float or Callable, subsample of columns to construct trees or callable - custom sampling
        subsample: float or Callable, subsample of rows to construct trees or callable - custom sampling
        quantization: str or Quantizer, method for quantization. One of 'Quantile', 'Uniform',
            'Uniquant' or custom implementation
        quant_sample: int, subsample to quantize features
        max_bin: int in [2, 256] maximum number of bins to quantize features
        min_data_in_bin: int in [2, 256] minimal bin size. NOTE: currently ignored
        es: int, early stopping rounds. If 0, no early stopping
        seed: int, random state
        verbose: int, verbosity freq
        sketch_outputs: int, number of outputs to keep
        sketch_method: str, name of the sketching strategy. Currently the following options are available: "topk", "rand", "proj".
        use_hess: bool, use hessians in multioutput training
        callbacks: list of Callback, callbacks to customize training are passed here
        sketch_params: dict, optional kwargs for sketching strategy

        Returns
        -------
        The fitted SketchBoost model.

        Raises
        ------
        ValueError if dtrain is None.
        """

        if dtrain is None:
            raise ValueError("dtrain must be a dict of the form {'X': X_train, 'y': y_train}.")

        bstLSS_init = SketchBoost(loss=Distribution_Loss(self.dist),
                                  metric=Distribution_Metric(self.dist),
                                  ntrees=ntrees,
                                  lr=lr,
                                  min_gain_to_split=min_gain_to_split,
                                  lambda_l2=lambda_l2,
                                  gd_steps=gd_steps,
                                  max_depth=max_depth,
                                  min_data_in_leaf=min_data_in_leaf,
                                  colsample=colsample,
                                  subsample=subsample,

                                  quantization=quantization,
                                  quant_sample=quant_sample,
                                  max_bin=max_bin,
                                  min_data_in_bin=min_data_in_bin,

                                  es=es,
                                  seed=seed,
                                  verbose=verbose,

                                  sketch_outputs=sketch_outputs,
                                  sketch_method=sketch_method,
                                  use_hess=use_hess,

                                  callbacks=callbacks,
                                  sketch_params=sketch_params
                                  )

        # Append Target
        y_train_append = self._append_target(dtrain["y"])

        if eval_sets is not None:
            # Build new dicts so the caller's eval sets are left untouched, and
            # append the target of every eval set, not only the first one.
            eval_sets_append = [
                {**eval_set, "y": self._append_target(eval_set["y"])}
                for eval_set in eval_sets
            ]
        else:
            eval_sets_append = None

        bstLSS_train = bstLSS_init.fit(dtrain["X"], y_train_append, eval_sets=eval_sets_append)

        return bstLSS_train

    def hyper_opt(self,
                  params=None,
                  dtrain=None,
                  eval_sets=None,
                  ntrees=100,
                  lr=0.05,
                  min_gain_to_split=0,
                  lambda_l2=1,
                  gd_steps=1,
                  max_depth=6,
                  min_data_in_leaf=10,
                  colsample=1.,
                  subsample=1.,

                  quantization='Quantile',
                  quant_sample=2000000,
                  max_bin=256,
                  min_data_in_bin=3,

                  es=100,
                  seed=123,
                  hp_seed=None,
                  verbose=int(1e04),

                  sketch_outputs=1,
                  sketch_method="proj",
                  use_hess=True,

                  callbacks=None,
                  sketch_params=None,

                  max_minutes=120,
                  n_trials=None,
                  study_name=None,
                  silence=False
                  ):

        """Function to tune hyper-parameters using Optuna.

        The tuned hyper-parameters are lr, max_depth, sketch_outputs, lambda_l2, colsample,
        subsample and min_gain_to_split; the values passed for these as keyword arguments are
        ignored in favour of the ranges given in ``params``.

        Parameters
        ----------
        params: dict, tunable hyper-parameters and their ranges, each given as (low, high)
        dtrain: dict, Dataset used for training of the form {'X': X_train, 'y': y_train}
        eval_sets: list used to evaluate model during training, e.g., [{'X': X_train, 'y': y_train}].
            The first entry is used to score each trial.
        ntrees: int, maximum number of trees
        lr: float, learning rate
        min_gain_to_split: float >=0, minimal gain to split
        lambda_l2: float > 0, l2 leaf regularization
        gd_steps: int > 0, number of gradient steps
        max_depth: int > 0, maximum tree depth. Setting it to large values (>12) may cause OOM for wide datasets
        min_data_in_leaf: int, minimal leaf size. Note - for some loss fn leaf size is approximated
            with hessian values to speed up training
        colsample: float or Callable, subsample of columns to construct trees or callable - custom sampling
        subsample: float or Callable, subsample of rows to construct trees or callable - custom sampling
        quantization: str or Quantizer, method for quantization. One of 'Quantile', 'Uniform',
            'Uniquant' or custom implementation
        quant_sample: int, subsample to quantize features
        max_bin: int in [2, 256] maximum number of bins to quantize features
        min_data_in_bin: int in [2, 256] minimal bin size. NOTE: currently ignored
        es: int, early stopping rounds. If 0, no early stopping
        seed: int, random state
        hp_seed: int, Random state for random number generator used in the Bayesian hyper-parameter search
        verbose: int, verbosity freq
        sketch_outputs: int, number of outputs to keep
        sketch_method: str, name of the sketching strategy. Currently the following options are available: "topk", "rand", "proj".
        use_hess: bool, use hessians in multioutput training
        callbacks: list of Callback, callbacks to customize training are passed here
        sketch_params: dict, optional kwargs for sketching strategy
        max_minutes: int, Time budget in minutes, i.e., stop study after the given number of minutes.
        n_trials: int, The number of trials. If this argument is set to None, there is no limitation on the number of trials.
        study_name : str, Name of the hyperparameter study.
        silence: bool, Controls the verbosity of the trial, i.e., user can silence the outputs of the trial.

        Returns
        -------
        opt_params : Dict() with optimal parameters, including the key "opt_rounds"
            (best boosting round of the best trial).

        Raises
        ------
        ValueError if params, dtrain or eval_sets is missing.
        """

        if params is None:
            raise ValueError("params must be a dict with the tunable hyper-parameters and their ranges.")
        if dtrain is None:
            raise ValueError("dtrain must be a dict of the form {'X': X_train, 'y': y_train}.")
        if not eval_sets:
            raise ValueError("eval_sets must contain at least one dict of the form {'X': X, 'y': y}.")

        def objective(trial):

            hyper_params = {
                "lr": trial.suggest_float("lr", params["lr"][0], params["lr"][1]),
                "max_depth": trial.suggest_int("max_depth", params["max_depth"][0], params["max_depth"][1]),
                "sketch_outputs": trial.suggest_int("sketch_outputs", params["sketch_outputs"][0], params["sketch_outputs"][1]),
                "lambda_l2": trial.suggest_float("lambda_l2", params["lambda_l2"][0], params["lambda_l2"][1]),
                "colsample": trial.suggest_float("colsample", params["colsample"][0], params["colsample"][1]),
                "subsample": trial.suggest_float("subsample", params["subsample"][0], params["subsample"][1]),
                "min_gain_to_split": trial.suggest_float("min_gain_to_split", params["min_gain_to_split"][0], params["min_gain_to_split"][1])
            }

            bstLSS_cv = self.train(dtrain=dtrain,
                                   eval_sets=eval_sets,
                                   ntrees=ntrees,
                                   lr=hyper_params["lr"],
                                   min_gain_to_split=hyper_params["min_gain_to_split"],
                                   lambda_l2=hyper_params["lambda_l2"],
                                   gd_steps=gd_steps,
                                   max_depth=hyper_params["max_depth"],
                                   min_data_in_leaf=min_data_in_leaf,
                                   colsample=hyper_params["colsample"],
                                   subsample=hyper_params["subsample"],

                                   quantization=quantization,
                                   quant_sample=quant_sample,
                                   max_bin=max_bin,
                                   min_data_in_bin=min_data_in_bin,

                                   es=es,
                                   seed=seed,
                                   verbose=verbose,

                                   sketch_outputs=hyper_params["sketch_outputs"],
                                   sketch_method=sketch_method,
                                   use_hess=use_hess,

                                   callbacks=callbacks,
                                   sketch_params=sketch_params
                                   )

            # Add optimal rounds
            trial.set_user_attr("opt_round", int(bstLSS_cv.best_round))

            # Extract the best score. train() no longer touches the caller's
            # eval sets, so the target is appended here explicitly.
            y_true = self._append_target(eval_sets[0]["y"])
            y_pred = bstLSS_cv.predict(eval_sets[0]["X"])
            _, nll = self.dist.get_params_nll(y_true, y_pred)
            best_score = float(nll)

            # Replace 0 value: an NLL of exactly 0 is treated as a failed fit
            # and penalized so that it is never selected as the best trial.
            if best_score == 0.0:
                best_score = 1e08

            return best_score

        if silence:
            optuna.logging.set_verbosity(optuna.logging.WARNING)

        if study_name is None:
            study_name = "Py-BoostLSS Hyper-Parameter Optimization"

        if hp_seed is not None:
            sampler = TPESampler(seed=hp_seed)
        else:
            sampler = TPESampler()

        pruner = optuna.pruners.MedianPruner(n_startup_trials=10, n_warmup_steps=20)
        study = optuna.create_study(sampler=sampler, pruner=pruner, direction="minimize", study_name=study_name)
        study.optimize(objective, n_trials=n_trials, timeout=60 * max_minutes, show_progress_bar=True)

        print("\nHyper-Parameter Optimization successfully finished.")
        print(" Number of finished trials: ", len(study.trials))
        print(" Best trial:")
        best_trial = study.best_trial

        # Add optimal stopping round, stored on the trial by the objective
        opt_params = dict(best_trial.params)
        opt_params["opt_rounds"] = int(best_trial.user_attrs["opt_round"])

        print(" Value: {}".format(best_trial.value))
        print(" Params: ")
        for key, value in opt_params.items():
            print(" {}: {}".format(key, value))

        return opt_params
###
# Response Functions
###

def identity_fn(predt: torch.Tensor) -> torch.Tensor:
    """Identity mapping of predt.
    """
    return predt


def exp_fn(predt: torch.Tensor) -> torch.Tensor:
    """Exp() function used to ensure predt is strictly positive.

    NaN entries of exp(predt) are replaced by the mean of the non-NaN entries
    and a small constant (1e-6) is added. The result stays on the device and
    keeps the dtype of ``predt``.
    """
    predt_adj = torch.exp(predt)
    nan_fill = float(torch.nanmean(predt_adj))
    # A Python scalar is added so the offset follows the input's device/dtype
    # instead of being pinned to "cuda".
    return torch.nan_to_num(predt_adj, nan=nan_fill) + 1e-6


def exp_fn_df(predt: torch.Tensor) -> torch.Tensor:
    """Exp() function for StudentT df-parameter used to ensure predt is strictly positive.

    Computes exp(predt) + 2, replaces NaN entries by the mean of the non-NaN
    entries and adds a small constant (1e-6), so the result is always > 2.
    The result stays on the device and keeps the dtype of ``predt``.
    """
    predt_adj = torch.exp(predt) + 2.0
    nan_fill = float(torch.nanmean(predt_adj))
    return torch.nan_to_num(predt_adj, nan=nan_fill) + 1e-6


###
# Autograd Function
###
def get_derivs(nll: torch.Tensor, predt: list) -> "tuple[cp.ndarray, cp.ndarray]":
    """ Calculates gradients and hessians.

    Output gradients and hessians have shape (n_samples, n_outputs).

    Args:
        nll: torch.Tensor, calculated NLL
        predt: list of torch.Tensor, predicted parameters (one tensor per parameter)

    Returns:
        grad, hess
    """

    # Gradient of the NLL w.r.t. every parameter tensor; the graph is kept so
    # that the gradients can be differentiated once more.
    grad_list = autograd(nll, inputs=predt, create_graph=True)

    # Second derivative of each parameter w.r.t. itself only.
    hess_list = [
        autograd(grad_i.nansum(), inputs=predt_i, retain_graph=True)[0]
        for grad_i, predt_i in zip(grad_list, predt)
    ]

    # Reshape
    grad = cp.asarray(torch.cat(grad_list, dim=1).detach())
    hess = cp.asarray(torch.cat(hess_list, dim=1).detach())

    return grad, hess


###
# Misc
###

def response_dim(y_true: np.ndarray) -> int:
    """Infers the number of targets from input dataset.

    A column is counted as a target unless its column sum equals the number
    of rows (as is the case for a column consisting only of ones).
    """
    n_obs = y_true.shape[0]
    col_sums = y_true.sum(axis=0)

    return int((col_sums != n_obs).sum())


def calc_corr(cov_mat: torch.Tensor) -> torch.Tensor:
    """Calculates the lower correlation matrix from covariance matrix.

    Args:
        cov_mat: torch.Tensor, square covariance matrix

    Returns:
        1-d tensor with the strictly-lower-triangular correlations in
        row-major order, i.e. (1,0), (2,0), (2,1), ...
    """
    std = torch.sqrt(torch.diag(cov_mat))
    # Dividing by the outer product of the standard deviations is the same as
    # D^-1 @ cov @ D^-1 with D = diag(std), without inverting a matrix.
    cor_mat = cov_mat / (std.unsqueeze(1) * std.unsqueeze(0))
    n_dim = cor_mat.shape[-1]
    rows, cols = torch.tril_indices(row=n_dim, col=n_dim, offset=-1, device=cor_mat.device)

    return cor_mat[rows, cols]
"""Packaging script for Py-BoostLSS."""
from pathlib import Path

from setuptools import setup, find_packages

# Resolve the README relative to this file so the build does not depend on the
# current working directory; read_text() also closes the file handle and pins
# the encoding instead of relying on the platform default.
README_PATH = Path(__file__).resolve().parent / "README.md"


setup(
    name="pyboostlss",
    version="0.1.0",
    description="Py-BoostLSS: An extension of Py-Boost to probabilistic modelling",
    long_description=README_PATH.read_text(encoding="utf-8"),
    long_description_content_type="text/markdown",
    author="Alexander März",
    author_email="alex.maerz@gmx.net",
    url="https://github.com/StatMixedML/Py-BoostLSS",
    license="Apache License 2.0",
    packages=find_packages(exclude=["tests"]),
    include_package_data=True,
    package_data={'': ['datasets/*.csv']},
    zip_safe=True,
    python_requires=">=3.8, <3.10",
    install_requires=[
        "py-boost~=0.3.0",
        "optuna~=3.0.3",
        "pyro-ppl~=1.8.3",
        "dirichlet~=0.9",
        "scikit-learn~=1.1.3",
        "numpy~=1.23.5",
        "pandas~=1.5.2",
        "plotnine~=0.10.1",
        "scipy~=1.8.1",
        "tqdm~=4.64.1",
        "matplotlib~=3.6.2",
        "ipywidgets~=8.0.2",
    ],
    test_suite="tests",
    tests_require=["flake8", "pytest"],
)