├── .github └── pull_request_template.md ├── .gitignore ├── .travis.yml ├── AUTHORS.txt ├── CHANGELOG.md ├── CODE_OF_CONDUCT.rst ├── CONTRIBUTING.rst ├── LICENSE ├── MANIFEST.in ├── README.md ├── docs ├── Makefile ├── _static │ ├── LR_ELBO.png │ ├── LR_data.png │ ├── LR_summary.png │ └── LR_traceplot.png ├── api.rst ├── api │ ├── modules.rst │ ├── pymc3_models.models.rst │ └── pymc3_models.rst ├── conf.py ├── examples.rst ├── getting_started.rst ├── index.rst └── intro.rst ├── notebooks ├── HierarchicalLogisticRegression.ipynb ├── LinearRegression.ipynb ├── LogisticRegression.ipynb ├── NaiveBayes.ipynb └── figures │ └── naive_bayes │ ├── naive_bayes.pdf │ └── naive_bayes.png ├── pymc3_models ├── __init__.py ├── _version.py ├── exc.py ├── models │ ├── HierarchicalLogisticRegression.py │ ├── LinearRegression.py │ ├── LogisticRegression.py │ ├── NaiveBayes.py │ └── __init__.py └── utils.py ├── requirements-dev.txt ├── requirements.txt ├── setup.cfg ├── setup.py └── tests ├── __init__.py └── models ├── __init__.py ├── test_BayesianModel.py ├── test_HierarchicalLogisticRegression.py ├── test_LinearRegression.py ├── test_LogisticRegression.py └── test_NaiveBayes.py /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | Pull Request Checklist 2 | - [ ] Linter passes locally 3 | - [ ] All tests in the `tests` folder pass with a local build 4 | - [ ] CHANGELOG has been updated 5 | - [ ] Version in `_version.py` has been updated 6 | - [ ] README has been updated (if applicable) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | #Apple 104 | .DS_Store 105 | 106 | # Pickles 107 | notebooks/pickle_jar/ 108 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "3.4" 5 | install: 6 | - pip install --upgrade pip 7 | - pip install -r requirements.txt 8 | - pip install -r requirements-dev.txt 9 | script: 10 | - flake8 pymc3_models tests 11 | - pytest -v 12 | branches: 13 | only: 14 | - master 15 | -------------------------------------------------------------------------------- /AUTHORS.txt: -------------------------------------------------------------------------------- 1 | Nicole Carlson , 2 | Rémi Louf , 3 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
5 | 6 | ## [2.1.0] - 2019-01-11 7 | ### Added 8 | - Logistic Regression model 9 | 10 | ## [2.0.0] - 2019-01-11 11 | ### Changed 12 | - num_advi_sample_draws can be input to fit methods instead of being hardcoded into the class 13 | - num_ppc_samples can be input to predict methods (where applicable) 14 | ### Fixed 15 | - Made formatting in docstrings consistent 16 | 17 | ## [1.4.0] - 2019-01-10 18 | ### Added 19 | - Gaussian Naive Bayes model 20 | 21 | ## [1.3.0] - 2019-01-08 22 | ### Added 23 | - Travis CI integration 24 | ### Changed 25 | - Switched to using pytest for unittests 26 | 27 | ## [1.2.2] - 2019-01-07 28 | ### Fixed 29 | - License switched to classifier instead of text 30 | 31 | ## [1.2.1] - 2019-01-07 32 | ### Fixed 33 | - Missing comma in setup.py 34 | 35 | ## [1.2.0] - 2019-01-07 36 | ### Added 37 | - flake8 linting 38 | - version file 39 | - less strict version requirements in requirements.txt 40 | - PR template 41 | 42 | ## [1.1.3] - 2018-05-25 43 | ### Fixed 44 | - HLR fit method sets shared vars if no minibatch_size given 45 | 46 | ## [1.1.2] - 2018-05-20 47 | ### Fixed 48 | - df_summary deprecated in pymc3 release 3.3, changed to summary 49 | 50 | ## [1.1.1] - 2018-05-20 51 | ### Fixed 52 | - Minibatches for ADVI in HLR require model_output to be cast as int 53 | 54 | ## [1.1.0] - 2018-01-30 55 | ### Added 56 | - New class property for default number of draws for ADVI sampling 57 | 58 | ## [1.0.3] - 2018-01-05 59 | ### Fixed 60 | - LICENSE file name changed to correct version 61 | - Had to skip 1.0.2 due to a PyPI uploading fiasco 62 | 63 | ## [1.0.1] - 2018-01-05 64 | ### Fixed 65 | - Messed up uploading to PyPI 66 | 67 | ## [1.0.0] - 2018-01-05 68 | ### Added 69 | - First version of the library 70 | - Hierarchical Logistic Regression and Linear Regression models 71 | - Documentation 72 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.rst: -------------------------------------------------------------------------------- 1 | ============================ 2 | CONTRIBUTOR CODE OF CONDUCT 3 | ============================ 4 | 5 | As contributors and maintainers of this project, and in the interest of fostering an open and welcoming community, we pledge to respect all people who contribute through reporting issues, posting feature requests, updating documentation, submitting pull requests or patches, and other activities. 6 | 7 | We are committed to making participation in this project a harassment-free experience for everyone, regardless of level of experience, gender, gender identity and expression, sexual orientation, disability, personal appearance, body size, race, ethnicity, age, religion, or nationality. 8 | 9 | Examples of unacceptable behavior by participants include: 10 | 11 | - The use of sexualized language or imagery 12 | - Personal attacks 13 | - Trolling or insulting/derogatory comments 14 | - Public or private harassment 15 | - Publishing others’ private information, such as physical or electronic addresses, without explicit permission 16 | - Other unethical or unprofessional conduct 17 | 18 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct. By adopting this Code of Conduct, project maintainers commit themselves to fairly and consistently applying these principles to every aspect of managing this project.
Project maintainers who do not follow or enforce the Code of Conduct may be permanently removed from the project team. 19 | 20 | This code of conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. 21 | 22 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by opening an issue or contacting one or more of the project maintainers (nicole@parsingscience.com). 23 | 24 | This Code of Conduct is adapted from the Contributor Covenant, version 1.2.0, available at https://www.contributor-covenant.org/version/1/2/0/code-of-conduct.html. 25 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | Introduction 2 | ================ 3 | 4 | Thank you for considering contributing to PyMC3 Models! This project is intended to be a space where anyone can share models they've built. 5 | 6 | Please read these guidelines before submitting anything to the project. As of the first release, I'm the only person working on this project, so respecting these guidelines will help me get back to you more quickly. 7 | 8 | Some ways to contribute: 9 | 10 | - Open an issue on the `GitHub Issue Tracker <https://github.com/parsing-science/pymc3_models/issues>`__. (Please check that it has not already been reported or addressed in a PR.) 11 | - Improve the docs! 12 | - Add a new model. Please follow the guidelines below. 13 | - Add/change existing functionality in the base model class 14 | - Something I haven't thought of? 15 | 16 | Pull Requests 17 | ------------------ 18 | To create a PR against this library, please fork the project and work from there. 19 | 20 | Steps 21 | ++++++ 22 | 23 | 1. Fork the project via the Fork button on GitHub 24 | 25 | 2. Clone the repo to your local disk. 26 | 27 | 3. Create a new branch for your PR. 28 | 29 | :: 30 | 31 | git checkout -b my-awesome-new-feature 32 | 33 | 4. Install requirements (probably in a virtual environment) 34 | 35 | :: 36 | 37 | virtualenv venv 38 | source venv/bin/activate 39 | pip install -r requirements-dev.txt 40 | pip install -r requirements.txt 41 | 42 | 5. Develop your feature 43 | 44 | 6. Submit a PR! 45 | 46 | PR Checklist 47 | +++++++++++++ 48 | 49 | - Ensure your code has followed the Style Guidelines below 50 | - Run the linter on your code 51 | 52 | :: 53 | 54 | source venv/bin/activate 55 | flake8 pymc3_models tests 56 | 57 | - Make sure you have written unittests where appropriate 58 | - Make sure the unittests pass 59 | 60 | :: 61 | 62 | source venv/bin/activate 63 | pytest -v 64 | 65 | - Update the docs where appropriate. You can rebuild them with the commands below. 66 | 67 | :: 68 | 69 | cd pymc3_models/docs 70 | sphinx-apidoc -f -o api/ ../pymc3_models/ 71 | make html 72 | 73 | - Update the CHANGELOG 74 | 75 | Notes for new models 76 | ++++++++++++++++++++++++++ 77 | 78 | - New models should be put into the models directory. 79 | - Make the file name the same as the class name; be explicit, e.g. HierarchicalLogisticRegression, not HLR. 80 | - Try to write some simple unittests for your model. I do not recommend using NUTS in your unittests if you have a complex model because the tests will take hours to run. 81 | - [Optional] Please create a Jupyter notebook in the notebooks folder with the same name as your model class. In it, show a simple example of how to use your model. Synthetic data is fine to use.
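To make these notes concrete, here is a minimal sketch of what a new model class can look like. It is illustrative only: ``MyModel`` and its intercept-only Gaussian are placeholders rather than a real model, but the overall layout, and the ``BayesianModel`` helpers it calls (``_set_default_inference_args``, ``_set_shared_vars``, ``_inference``), mirror the existing ``LinearRegression`` class in this repo.

::

    import numpy as np
    import pymc3 as pm
    import theano

    from pymc3_models.models import BayesianModel


    class MyModel(BayesianModel):
        """
        Skeleton model; replace the placeholder prior and likelihood with your own.
        """

        def __init__(self):
            super(MyModel, self).__init__()

        def create_model(self):
            # Shared variables let fit and predict swap data in later;
            # the placeholder shapes must match the training data.
            model_input = theano.shared(np.zeros([self.num_training_samples, self.num_pred]))
            model_output = theano.shared(np.zeros(self.num_training_samples))

            self.shared_vars = {
                'model_input': model_input,
                'model_output': model_output,
            }

            model = pm.Model()

            with model:
                # Placeholder parameterization: an intercept-only Gaussian
                mu = pm.Normal('mu', mu=0, sd=100)
                s = pm.HalfNormal('s', tau=1)

                y = pm.Normal('y', mu=mu, sd=s, observed=model_output)

            return model

        def fit(self, X, y, inference_type='advi', num_advi_sample_draws=10000, inference_args=None):
            self.num_training_samples, self.num_pred = X.shape
            self.inference_type = inference_type

            if not inference_args:
                inference_args = self._set_default_inference_args()

            if self.cached_model is None:
                self.cached_model = self.create_model()

            self._set_shared_vars({'model_input': X, 'model_output': y})
            self._inference(inference_type, inference_args, num_advi_sample_draws=num_advi_sample_draws)

            return self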
82 | 83 | Style Guidelines 84 | ++++++++++++++++++++++++++ 85 | 86 | For the most part, this library follows PEP8 with a couple of exceptions. 87 | 88 | Notes: 89 | 90 | - Indent with 4 spaces 91 | - Lines can be 110 characters long 92 | - Docstrings should be written as numpy docstrings 93 | - Your code should be Python 3 compatible 94 | - When in doubt, follow the style of the existing code 95 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS.txt LICENSE README.md 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyMC3 Models 2 | 3 | Custom PyMC3 models built on top of the scikit-learn API. Check out the [docs](http://pymc3-models.readthedocs.io/).
4 | 5 | ## Features 6 | 7 | - Reusable PyMC3 models including LinearRegression and HierarchicalLogisticRegression 8 | - A base class, BayesianModel, for building your own PyMC3 models 9 | 10 | ## Installation 11 | The latest release of PyMC3 Models can be installed from PyPI using `pip`: 12 | 13 | ``` bash 14 | pip install pymc3_models 15 | ``` 16 | 17 | The current development branch of PyMC3 Models can be installed from GitHub, also using `pip`: 18 | 19 | ``` bash 20 | pip install git+https://github.com/parsing-science/pymc3_models.git 21 | ``` 22 | 23 | To run the package locally (in a virtual environment): 24 | 25 | ``` 26 | git clone https://github.com/parsing-science/pymc3_models.git 27 | cd pymc3_models 28 | virtualenv venv 29 | source venv/bin/activate 30 | pip install -r requirements.txt 31 | ``` 32 | 33 | ## Usage 34 | Since PyMC3 Models is built on top of scikit-learn, you can use the same methods as with a scikit-learn model. 35 | 36 | ``` python 37 | from pymc3_models import LinearRegression 38 | 39 | LR = LinearRegression() 40 | LR.fit(X, Y) 41 | LR.predict(X) 42 | LR.score(X, Y) 43 | ``` 44 | 45 | ## Contribute 46 | For more info, see [CONTRIBUTING](https://github.com/parsing-science/pymc3_models/blob/master/CONTRIBUTING.rst). 47 | 48 | ### Contributor Code of Conduct 49 | 50 | Please note that this project is released with a [Contributor Code of Conduct](http://contributor-covenant.org/). By participating in this project you agree to abide by its terms. See [CODE_OF_CONDUCT](https://github.com/parsing-science/pymc3_models/blob/master/CODE_OF_CONDUCT.rst). 51 | 52 | ## Acknowledgments 53 | This library is built on top of [PyMC3](http://docs.pymc.io/) and [scikit-learn](http://scikit-learn.org). 54 | 55 | ## License 56 | [Apache License, Version 2.0](https://github.com/parsing-science/pymc3_models/blob/master/LICENSE) 57 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = PyMC3Models 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/_static/LR_ELBO.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parsing-science/pymc3_models/1b8cca86a2ce05dfec3df81ea57d17c7defb51ed/docs/_static/LR_ELBO.png -------------------------------------------------------------------------------- /docs/_static/LR_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parsing-science/pymc3_models/1b8cca86a2ce05dfec3df81ea57d17c7defb51ed/docs/_static/LR_data.png -------------------------------------------------------------------------------- /docs/_static/LR_summary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parsing-science/pymc3_models/1b8cca86a2ce05dfec3df81ea57d17c7defb51ed/docs/_static/LR_summary.png -------------------------------------------------------------------------------- /docs/_static/LR_traceplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parsing-science/pymc3_models/1b8cca86a2ce05dfec3df81ea57d17c7defb51ed/docs/_static/LR_traceplot.png -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | API 2 | ======================================== 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | 7 | api/modules 8 | -------------------------------------------------------------------------------- /docs/api/modules.rst: -------------------------------------------------------------------------------- 1 | pymc3_models 2 | ============ 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | pymc3_models 8 | -------------------------------------------------------------------------------- /docs/api/pymc3_models.models.rst: -------------------------------------------------------------------------------- 1 | pymc3\_models.models package 2 | ============================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | pymc3\_models.models.HierarchicalLogisticRegression module 8 | ---------------------------------------------------------- 9 | 10 | .. automodule:: pymc3_models.models.HierarchicalLogisticRegression 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | pymc3\_models.models.LinearRegression module 16 | -------------------------------------------- 17 | 18 | .. automodule:: pymc3_models.models.LinearRegression 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | pymc3\_models.models.LogisticRegression module 24 | ---------------------------------------------- 25 | 26 | .. automodule:: pymc3_models.models.LogisticRegression 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | pymc3\_models.models.NaiveBayes module 32 | -------------------------------------- 33 | 34 | .. automodule:: pymc3_models.models.NaiveBayes 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | 40 | Module contents 41 | --------------- 42 | 43 | .. automodule:: pymc3_models.models 44 | :members: 45 | :undoc-members: 46 | :show-inheritance: 47 | -------------------------------------------------------------------------------- /docs/api/pymc3_models.rst: -------------------------------------------------------------------------------- 1 | pymc3\_models package 2 | ===================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | ..
toctree:: 8 | 9 | pymc3_models.models 10 | 11 | Submodules 12 | ---------- 13 | 14 | pymc3\_models.exc module 15 | ------------------------ 16 | 17 | .. automodule:: pymc3_models.exc 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: pymc3_models 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # PyMC3 Models documentation build configuration file, created by 5 | # sphinx-quickstart on Thu Dec 28 11:42:02 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | import os 21 | import sys 22 | sys.path.insert(0, os.path.abspath('pymc3_models/pymc3_models')) 23 | sys.path.insert(0, os.path.abspath('../')) 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = [ 35 | 'sphinx.ext.autosummary', 36 | 'sphinx.ext.autodoc', 37 | 'sphinx.ext.coverage', 38 | 'sphinx.ext.doctest', 39 | 'sphinx.ext.githubpages', 40 | 'sphinx.ext.intersphinx', 41 | 'sphinx.ext.mathjax', 42 | 'sphinx.ext.napoleon', 43 | 'sphinx.ext.todo', 44 | 'sphinx.ext.viewcode', 45 | 'matplotlib.sphinxext.only_directives', 46 | 'matplotlib.sphinxext.plot_directive', 47 | 'numpydoc', 48 | ] 49 | 50 | #Napoleon settings 51 | napoleon_use_param = False 52 | 53 | # Add any paths that contain templates here, relative to this directory. 54 | templates_path = ['_templates'] 55 | 56 | # The suffix(es) of source filenames. 57 | # You can specify multiple suffix as a list of string: 58 | # 59 | # source_suffix = ['.rst', '.md'] 60 | source_suffix = '.rst' 61 | 62 | # The master toctree document. 63 | master_doc = 'index' 64 | 65 | # General information about the project. 66 | project = 'PyMC3 Models' 67 | copyright = '2017, Nicole Carlson' 68 | author = 'Nicole Carlson' 69 | 70 | # The version info for the project you're documenting, acts as replacement for 71 | # |version| and |release|, also used in various other places throughout the 72 | # built documents. 73 | # 74 | # The short X.Y version. 75 | version = '1.0' 76 | # The full version, including alpha/beta/rc tags. 77 | release = '1.0' 78 | 79 | # The language for content autogenerated by Sphinx. Refer to documentation 80 | # for a list of supported languages. 81 | # 82 | # This is also used if you do content translation via gettext catalogs. 83 | # Usually you set "language" from the command line for these cases. 
84 | language = None 85 | 86 | # List of patterns, relative to source directory, that match files and 87 | # directories to ignore when looking for source files. 88 | # This patterns also effect to html_static_path and html_extra_path 89 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 90 | 91 | # The name of the Pygments (syntax highlighting) style to use. 92 | pygments_style = 'sphinx' 93 | 94 | # If true, `todo` and `todoList` produce output, else they produce nothing. 95 | todo_include_todos = True 96 | 97 | 98 | # -- Options for HTML output ---------------------------------------------- 99 | 100 | # The theme to use for HTML and HTML Help pages. See the documentation for 101 | # a list of builtin themes. 102 | # 103 | html_theme = "sphinx_rtd_theme" 104 | 105 | # Theme options are theme-specific and customize the look and feel of a theme 106 | # further. For a list of options available for each theme, see the 107 | # documentation. 108 | # 109 | # html_theme_options = {} 110 | 111 | # Add any paths that contain custom static files (such as style sheets) here, 112 | # relative to this directory. They are copied after the builtin static files, 113 | # so a file named "default.css" will overwrite the builtin "default.css". 114 | html_static_path = ['_static'] 115 | 116 | # Custom sidebar templates, must be a dictionary that maps document names 117 | # to template names. 118 | # 119 | # This is required for the alabaster theme 120 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars 121 | html_sidebars = { 122 | '**': [ 123 | 'globaltoc.html', 124 | 'relations.html', # needs 'show_related': True theme option to display 125 | 'searchbox.html', 126 | ] 127 | } 128 | 129 | 130 | # -- Options for HTMLHelp output ------------------------------------------ 131 | 132 | # Output file base name for HTML help builder. 133 | htmlhelp_basename = 'PyMC3Modelsdoc' 134 | 135 | 136 | # -- Options for LaTeX output --------------------------------------------- 137 | 138 | latex_elements = { 139 | # The paper size ('letterpaper' or 'a4paper'). 140 | # 141 | # 'papersize': 'letterpaper', 142 | 143 | # The font size ('10pt', '11pt' or '12pt'). 144 | # 145 | # 'pointsize': '10pt', 146 | 147 | # Additional stuff for the LaTeX preamble. 148 | # 149 | # 'preamble': '', 150 | 151 | # Latex figure (float) alignment 152 | # 153 | # 'figure_align': 'htbp', 154 | } 155 | 156 | # Grouping the document tree into LaTeX files. List of tuples 157 | # (source start file, target name, title, 158 | # author, documentclass [howto, manual, or own class]). 159 | latex_documents = [ 160 | (master_doc, 'PyMC3Models.tex', 'PyMC3 Models Documentation', 161 | 'Nicole Carlson', 'manual'), 162 | ] 163 | 164 | 165 | # -- Options for manual page output --------------------------------------- 166 | 167 | # One entry per manual page. List of tuples 168 | # (source start file, name, description, authors, manual section). 169 | man_pages = [ 170 | (master_doc, 'pymc3models', 'PyMC3 Models Documentation', 171 | [author], 1) 172 | ] 173 | 174 | 175 | # -- Options for Texinfo output ------------------------------------------- 176 | 177 | # Grouping the document tree into Texinfo files. 
List of tuples 178 | # (source start file, target name, title, author, 179 | # dir menu entry, description, category) 180 | texinfo_documents = [ 181 | (master_doc, 'PyMC3Models', 'PyMC3 Models Documentation', 182 | author, 'PyMC3Models', 'One line description of project.', 183 | 'Miscellaneous'), 184 | ] 185 | 186 | 187 | 188 | # -- Options for Epub output ---------------------------------------------- 189 | 190 | # Bibliographic Dublin Core info. 191 | epub_title = project 192 | epub_author = author 193 | epub_publisher = author 194 | epub_copyright = copyright 195 | 196 | # The unique identifier of the text. This can be an ISBN number 197 | # or the project homepage. 198 | # 199 | # epub_identifier = '' 200 | 201 | # A unique identification for the text. 202 | # 203 | # epub_uid = '' 204 | 205 | # A list of files that should not be packed into the epub file. 206 | epub_exclude_files = ['search.html'] 207 | 208 | 209 | 210 | # Example configuration for intersphinx: refer to the Python standard library. 211 | intersphinx_mapping = {'https://docs.python.org/': None} 212 | -------------------------------------------------------------------------------- /docs/examples.rst: -------------------------------------------------------------------------------- 1 | Examples 2 | ======================================== 3 | 4 | Check out the `notebooks folder <https://github.com/parsing-science/pymc3_models/tree/master/notebooks>`__. 5 | 6 | Currently, the following models have been implemented: 7 | 8 | - Linear Regression 9 | - Hierarchical Logistic Regression 10 | - Logistic Regression 11 | - Gaussian Naive Bayes 12 | -------------------------------------------------------------------------------- /docs/getting_started.rst: -------------------------------------------------------------------------------- 1 | Getting Started 2 | ======================================== 3 | 4 | This section is adapted from my 2017 PyData NYC talk. 5 | 6 | To demonstrate how to get started with PyMC3 Models, I'll walk through a simple Linear Regression example. First, I'll go through the example using just PyMC3. Then I'll show you the same example using PyMC3 Models. 7 | 8 | Generate Synthetic Data 9 | ------------------------ 10 | :: 11 | 12 | X = np.random.randn(1000, 1) 13 | noise = 2 * np.random.randn(1000, 1) 14 | Y = 4 * X + 3 + noise 15 | 16 | .. image:: _static/LR_data.png 17 | 18 | Fit a model with PyMC3 19 | ----------------------- 20 | 21 | Step 1: Set up the PyMC3 model 22 | +++++++++++++++++++++++++++++++++++++++ 23 | :: 24 | 25 | lin_reg_model = pm.Model() 26 | 27 | model_input = theano.shared(X) 28 | 29 | model_output = theano.shared(Y) 30 | 31 | with lin_reg_model: 32 | 33 | alpha = pm.Normal('alpha', mu=0, sd=100, shape=(1)) 34 | beta = pm.Normal('beta', mu=0, sd=100, shape=(1)) 35 | 36 | s = pm.HalfNormal('s', tau=1) 37 | 38 | mean = alpha + beta * model_input 39 | 40 | y = pm.Normal('y', mu=mean, sd=s, observed=model_output) 41 | 42 | Step 2: Infer your parameters 43 | +++++++++++++++++++++++++++++++++++++++ 44 | :: 45 | 46 | 47 | with lin_reg_model: 48 | inference = pm.ADVI() 49 | approx = pm.fit( 50 | n=20000, 51 | method=inference, 52 | more_replacements={ 53 | model_input: pm.Minibatch(X), 54 | model_output: pm.Minibatch(Y) 55 | } 56 | ) 57 | 58 | Check if minibatch ADVI converged by plotting the ELBO: 59 | 60 | :: 61 | 62 | plt.plot(-inference.hist) 63 | plt.ylabel('ELBO') 64 | plt.xlabel('iteration') 65 | 66 | ..
image:: _static/LR_ELBO.png 67 | 68 | Step 3: Interpret your parameters 69 | +++++++++++++++++++++++++++++++++++++++ 70 | To make things a bit easier, I draw samples from the approximation to generate a trace. 71 | 72 | :: 73 | 74 | trace = approx.sample(draws=5000) 75 | summary(trace) 76 | 77 | .. image:: _static/LR_summary.png 78 | 79 | :: 80 | 81 | traceplot(trace) 82 | 83 | .. image:: _static/LR_traceplot.png 84 | 85 | Step 4: Predict data by creating posterior predictive samples 86 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 87 | :: 88 | 89 | ppc = pm.sample_ppc( 90 | trace[1000:], 91 | model=lin_reg_model, 92 | samples=2000 93 | ) 94 | pred = ppc['y'].mean(axis=0) 95 | r2_score(Y, pred) 96 | 0.79444136879972738 97 | 98 | Fit a model with PyMC3 Models 99 | ---------------------------------------------- 100 | Now, we can build a Linear Regression model using PyMC3 Models. 101 | 102 | The following is equivalent to Steps 1 and 2 above. 103 | 104 | :: 105 | 106 | LR = LinearRegression() 107 | LR.fit(X, Y, minibatch_size=100) 108 | LR.plot_elbo() 109 | 110 | .. image:: _static/LR_ELBO.png 111 | 112 | The following is equivalent to Step 3 above. 113 | Since the trace is saved directly, you can use the same PyMC3 functions (summary and traceplot). 114 | 115 | :: 116 | 117 | traceplot(LR.trace) 118 | 119 | .. image:: _static/LR_traceplot.png 120 | 121 | The following is equivalent to Step 4. 122 | 123 | :: 124 | 125 | Y_predict = LR.predict(X) 126 | LR.score(X, Y) 127 | 128 | The same type of model can be fit in fewer lines, and the model class follows the scikit-learn API. 129 | 130 | If you want a point estimate, you can use the saved summary dataframe: 131 | 132 | :: 133 | 134 | beta = LR.summary['mean']['betas__0_0'] 135 | alpha = LR.summary['mean']['alpha__0'] 136 | 137 | Advanced 138 | ---------- 139 | Saving and Loading your model 140 | ++++++++++++++++++++++++++++++++++++++++++++++++++ 141 | 142 | :: 143 | 144 | LR.save('pickle_jar/LR_jar/') 145 | LR2 = LinearRegression() 146 | LR2.load('pickle_jar/LR_jar/') 147 | 148 | NUTS Inference 149 | +++++++++++++++++++++++++ 150 | The default method of inference for PyMC3 Models is minibatch ADVI. This is typically much faster than other methods. However, in some cases, you may want to use the NUTS sampler. 151 | 152 | :: 153 | 154 | LR3 = LinearRegression() 155 | LR3.fit(X, Y, inference_type='nuts', inference_args={'draws': 2000}) 156 | 157 | Now you can use the predict and score methods, etc., as above. 158 | 159 | Inference Args 160 | +++++++++++++++++++++++++ 161 | If you don't want to use the default arguments for inference, you can pass in inference_args. Check out the `PyMC3 documentation <http://docs.pymc.io/>`__ for permissible values for the inference_type you are using. 162 | 163 | Building your own models 164 | +++++++++++++++++++++++++ 165 | Lastly, if you want to build your own models, you can build them on top of the BayesianModel base class. 166 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to PyMC3 Models' documentation! 2 | ======================================== 3 | 4 | .. include:: ../README.rst 5 | 6 | Contents 7 | --------- 8 | 9 | ..
toctree:: 10 | :maxdepth: 2 11 | 12 | intro 13 | getting_started 14 | examples 15 | api 16 | 17 | Indices and tables 18 | ------------------ 19 | 20 | * :ref:`genindex` 21 | * :ref:`modindex` 22 | * :ref:`search` 23 | -------------------------------------------------------------------------------- /docs/intro.rst: -------------------------------------------------------------------------------- 1 | Introduction to PyMC3 models 2 | ======================================== 3 | 4 | This library was inspired by my own work creating a re-usable Hierarchical Logistic Regression model. 5 | 6 | To learn more, you can read this section, watch a 7 | video from PyData NYC 2017, or check out the 8 | slides. 9 | 10 | Quick intro to PyMC3 11 | -------------------- 12 | When building a model with PyMC3, you will usually follow the same four steps: 13 | 14 | - **Step 1: Set up** Parameterize your model, choose priors, and insert training data 15 | - **Step 2: Inference** Infer your parameters using MCMC sampling (e.g. NUTS) or variational inference (e.g. ADVI) 16 | - **Step 3: Interpret** Check your parameter distributions and model fit 17 | - **Step 4: Predict data** Create posterior samples with your inferred parameters 18 | 19 | For a longer discussion of these steps, see :doc:`getting_started`. 20 | 21 | Mapping between scikit-learn and PyMC3 22 | -------------------------------------- 23 | This library builds a mapping between the steps above and the methods used by scikit-learn models. 24 | 25 | +----------------+--------------------------------------+ 26 | | scikit-learn | PyMC3 | 27 | +================+======================================+ 28 | | Fit | Step 1: Set up, Step 2: Inference | 29 | +----------------+--------------------------------------+ 30 | | Predict | Step 4: Predict data | 31 | +----------------+--------------------------------------+ 32 | | Score | Step 4: Predict data | 33 | +----------------+--------------------------------------+ 34 | | Save/Load | ?? | 35 | +----------------+--------------------------------------+ 36 | | ?? | Step 3: Interpret | 37 | +----------------+--------------------------------------+ 38 | 39 | The question marks represent things that don't exist in the two libraries on their own. 40 | 41 | 42 | Comparing scikit-learn, PyMC3, and PyMC3 Models 43 | ------------------------------------------------ 44 | Using the mapping above, this library creates easy-to-use PyMC3 models. 45 | 46 | +----------------------------+-------------+-------------+--------------+ 47 | | |scikit-learn | PyMC3 | PyMC3 Models | 48 | +============================+=============+=============+==============+ 49 | | Find model parameters | Easy | Medium | Easy | 50 | +----------------------------+-------------+-------------+--------------+ 51 | | Predict new data | Easy | Difficult | Easy | 52 | +----------------------------+-------------+-------------+--------------+ 53 | | Score a model | Easy | Difficult | Easy | 54 | +----------------------------+-------------+-------------+--------------+ 55 | | Save a trained model | Easy | Impossible? | Easy | 56 | +----------------------------+-------------+-------------+--------------+ 57 | | Load a trained model | Easy | Impossible?
| Easy | 58 | +----------------------------+-------------+-------------+--------------+ 59 | | Interpret Parameterization | N/A | Easy | Easy | 60 | +----------------------------+-------------+-------------+--------------+ 61 | 62 | -------------------------------------------------------------------------------- /notebooks/figures/naive_bayes/naive_bayes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parsing-science/pymc3_models/1b8cca86a2ce05dfec3df81ea57d17c7defb51ed/notebooks/figures/naive_bayes/naive_bayes.pdf -------------------------------------------------------------------------------- /notebooks/figures/naive_bayes/naive_bayes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parsing-science/pymc3_models/1b8cca86a2ce05dfec3df81ea57d17c7defb51ed/notebooks/figures/naive_bayes/naive_bayes.png -------------------------------------------------------------------------------- /pymc3_models/__init__.py: -------------------------------------------------------------------------------- 1 | from pymc3_models.models.HierarchicalLogisticRegression import HierarchicalLogisticRegression 2 | from pymc3_models.models.LinearRegression import LinearRegression 3 | from pymc3_models.models.LogisticRegression import LogisticRegression 4 | from pymc3_models.models.NaiveBayes import GaussianNaiveBayes 5 | -------------------------------------------------------------------------------- /pymc3_models/_version.py: -------------------------------------------------------------------------------- 1 | __version__ = '2.1.0' 2 | -------------------------------------------------------------------------------- /pymc3_models/exc.py: -------------------------------------------------------------------------------- 1 | class PyMC3ModelsError(Exception): 2 | pass 3 | -------------------------------------------------------------------------------- /pymc3_models/models/HierarchicalLogisticRegression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pymc3 as pm 3 | from sklearn.metrics import accuracy_score 4 | import theano 5 | import theano.tensor as T 6 | 7 | from pymc3_models.exc import PyMC3ModelsError 8 | from pymc3_models.models import BayesianModel 9 | 10 | 11 | class HierarchicalLogisticRegression(BayesianModel): 12 | """ 13 | Custom Hierarchical Logistic Regression built using PyMC3. 14 | """ 15 | 16 | def __init__(self): 17 | super(HierarchicalLogisticRegression, self).__init__() 18 | self.num_cats = None 19 | 20 | def create_model(self): 21 | """ 22 | Creates and returns the PyMC3 model. 23 | 24 | Note: The size of the shared variables must match the size of the training data. 25 | Otherwise, setting the shared variables later will raise an error.
26 | See http://docs.pymc.io/advanced_theano.html 27 | 28 | Returns 29 | ------- 30 | the PyMC3 model 31 | """ 32 | model_input = theano.shared(np.zeros([self.num_training_samples, self.num_pred])) 33 | 34 | model_output = theano.shared(np.zeros(self.num_training_samples, dtype='int')) 35 | 36 | model_cats = theano.shared(np.zeros(self.num_training_samples, dtype='int')) 37 | 38 | self.shared_vars = { 39 | 'model_input': model_input, 40 | 'model_output': model_output, 41 | 'model_cats': model_cats 42 | } 43 | 44 | model = pm.Model() 45 | 46 | with model: 47 | mu_alpha = pm.Normal('mu_alpha', mu=0, sd=100) 48 | sigma_alpha = pm.HalfNormal('sigma_alpha', sd=100) 49 | 50 | mu_beta = pm.Normal('mu_beta', mu=0, sd=100) 51 | sigma_beta = pm.HalfNormal('sigma_beta', sd=100) 52 | 53 | alpha = pm.Normal('alpha', mu=mu_alpha, sd=sigma_alpha, shape=(self.num_cats,)) 54 | betas = pm.Normal('beta', mu=mu_beta, sd=sigma_beta, shape=(self.num_cats, self.num_pred)) 55 | 56 | c = model_cats 57 | 58 | temp = alpha[c] + T.sum(betas[c] * model_input, 1) 59 | 60 | p = pm.invlogit(temp) 61 | 62 | o = pm.Bernoulli('o', p, observed=model_output) 63 | 64 | return model 65 | 66 | def fit( 67 | self, 68 | X, 69 | y, 70 | cats, 71 | inference_type='advi', 72 | num_advi_sample_draws=10000, 73 | minibatch_size=None, 74 | inference_args=None 75 | ): 76 | """ 77 | Train the Hierarchical Logistic Regression model 78 | 79 | Parameters 80 | ---------- 81 | X : numpy array 82 | shape [num_training_samples, num_pred] 83 | 84 | y : numpy array 85 | shape [num_training_samples, ] 86 | 87 | cats : numpy array 88 | shape [num_training_samples, ] 89 | 90 | inference_type : str (defaults to 'advi') 91 | specifies which inference method to call 92 | Currently, only 'advi' and 'nuts' are supported. 93 | 94 | num_advi_sample_draws : int (defaults to 10000) 95 | Number of samples to draw from ADVI approximation after it has been fit; 96 | not used if inference_type != 'advi' 97 | 98 | minibatch_size : int (defaults to None) 99 | number of samples to include in each minibatch for ADVI 100 | If None, minibatch is not run. 101 | 102 | inference_args : dict (defaults to None) 103 | arguments to be passed to the inference methods 104 | Check the PyMC3 docs for permissible values. 105 | If None, default values will be set.
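        Examples
        --------
        A minimal, illustrative call on synthetic data (the shapes, the number
        of categories, and the minibatch size below are placeholders, not
        recommendations)::

            import numpy as np

            X = np.random.randn(1000, 5)
            y = np.random.randint(2, size=1000)
            cats = np.random.randint(3, size=1000)

            HLR = HierarchicalLogisticRegression()
            HLR.fit(X, y, cats, minibatch_size=100)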
106 | """ 107 | self.num_cats = len(np.unique(cats)) 108 | self.num_training_samples, self.num_pred = X.shape 109 | 110 | self.inference_type = inference_type 111 | 112 | if y.ndim != 1: 113 | y = np.squeeze(y) 114 | 115 | if not inference_args: 116 | inference_args = self._set_default_inference_args() 117 | 118 | if self.cached_model is None: 119 | self.cached_model = self.create_model() 120 | 121 | if minibatch_size: 122 | with self.cached_model: 123 | minibatches = { 124 | self.shared_vars['model_input']: pm.Minibatch(X, batch_size=minibatch_size), 125 | self.shared_vars['model_output']: pm.Minibatch(y, batch_size=minibatch_size), 126 | self.shared_vars['model_cats']: pm.Minibatch(cats, batch_size=minibatch_size) 127 | } 128 | 129 | inference_args['more_replacements'] = minibatches 130 | else: 131 | self._set_shared_vars({ 132 | 'model_input': X, 133 | 'model_output': y, 134 | 'model_cats': cats 135 | }) 136 | 137 | self._inference(inference_type, inference_args, num_advi_sample_draws=num_advi_sample_draws) 138 | 139 | return self 140 | 141 | def predict_proba(self, X, cats, return_std=False, num_ppc_samples=2000): 142 | """ 143 | Predicts probabilities of new data with a trained Hierarchical Logistic Regression 144 | 145 | Parameters 146 | ---------- 147 | X : numpy array 148 | shape [num_training_samples, num_pred] 149 | 150 | cats : numpy array 151 | shape [num_training_samples, ] 152 | 153 | return_std : bool (defaults to False) 154 | Flag of whether to return standard deviations with mean probabilities 155 | 156 | num_ppc_samples : int (defaults to 2000) 157 | 'samples' parameter passed to pm.sample_ppc 158 | """ 159 | 160 | if self.trace is None: 161 | raise PyMC3ModelsError('Run fit on the model before predict.') 162 | 163 | num_samples = X.shape[0] 164 | 165 | if self.cached_model is None: 166 | self.cached_model = self.create_model() 167 | 168 | self._set_shared_vars({ 169 | 'model_input': X, 170 | 'model_output': np.zeros(num_samples, dtype='int'), 171 | 'model_cats': cats 172 | }) 173 | 174 | ppc = pm.sample_ppc(self.trace, model=self.cached_model, samples=num_ppc_samples) 175 | 176 | if return_std: 177 | return ppc['o'].mean(axis=0), ppc['o'].std(axis=0) 178 | else: 179 | return ppc['o'].mean(axis=0) 180 | 181 | def predict(self, X, cats, num_ppc_samples=2000): 182 | """ 183 | Predicts labels of new data with a trained model 184 | 185 | Parameters 186 | ---------- 187 | X : numpy array 188 | shape [num_training_samples, num_pred] 189 | 190 | cats : numpy array 191 | shape [num_training_samples, ] 192 | 193 | num_ppc_samples : int (defaults to 2000) 194 | 'samples' parameter passed to pm.sample_ppc 195 | """ 196 | ppc_mean = self.predict_proba(X, cats, num_ppc_samples=num_ppc_samples) 197 | 198 | pred = ppc_mean > 0.5 199 | 200 | return pred 201 | 202 | def score(self, X, y, cats, num_ppc_samples=2000): 203 | """ 204 | Scores new data with a trained model with sklearn's accuracy_score.
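        Labels are generated internally via predict, i.e. posterior predictive
        mean probabilities thresholded at 0.5, and compared against y.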
205 | 206 | Parameters 207 | ---------- 208 | X : numpy array 209 | shape [num_training_samples, num_pred] 210 | 211 | y : numpy array 212 | shape [num_training_samples, ] 213 | 214 | cats : numpy array 215 | shape [num_training_samples, ] 216 | 217 | num_ppc_samples : int (defaults to 2000) 218 | 'samples' parameter passed to pm.sample_ppc 219 | """ 220 | 221 | return accuracy_score(y, self.predict(X, cats, num_ppc_samples=num_ppc_samples)) 222 | 223 | def save(self, file_prefix): 224 | params = { 225 | 'inference_type': self.inference_type, 226 | 'num_cats': self.num_cats, 227 | 'num_pred': self.num_pred, 228 | 'num_training_samples': self.num_training_samples 229 | } 230 | 231 | super(HierarchicalLogisticRegression, self).save(file_prefix, params) 232 | 233 | def load(self, file_prefix): 234 | params = super(HierarchicalLogisticRegression, self).load(file_prefix, load_custom_params=True) 235 | 236 | self.inference_type = params['inference_type'] 237 | self.num_cats = params['num_cats'] 238 | self.num_pred = params['num_pred'] 239 | self.num_training_samples = params['num_training_samples'] 240 | -------------------------------------------------------------------------------- /pymc3_models/models/LinearRegression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pymc3 as pm 3 | from sklearn.metrics import r2_score 4 | import theano 5 | import theano.tensor as T 6 | 7 | from pymc3_models.exc import PyMC3ModelsError 8 | from pymc3_models.models import BayesianModel 9 | 10 | 11 | class LinearRegression(BayesianModel): 12 | """ 13 | Linear Regression built using PyMC3. 14 | """ 15 | 16 | def __init__(self): 17 | super(LinearRegression, self).__init__() 18 | 19 | def create_model(self): 20 | """ 21 | Creates and returns the PyMC3 model. 22 | 23 | Note: The size of the shared variables must match the size of the training data. 24 | Otherwise, setting the shared variables later will raise an error. 25 | See http://docs.pymc.io/advanced_theano.html 26 | 27 | Returns 28 | ------- 29 | the PyMC3 model 30 | """ 31 | model_input = theano.shared(np.zeros([self.num_training_samples, self.num_pred])) 32 | 33 | model_output = theano.shared(np.zeros(self.num_training_samples)) 34 | 35 | self.shared_vars = { 36 | 'model_input': model_input, 37 | 'model_output': model_output, 38 | } 39 | 40 | model = pm.Model() 41 | 42 | with model: 43 | alpha = pm.Normal('alpha', mu=0, sd=100, shape=(1)) 44 | betas = pm.Normal('betas', mu=0, sd=100, shape=(1, self.num_pred)) 45 | 46 | s = pm.HalfNormal('s', tau=1) 47 | 48 | mean = alpha + T.sum(betas * model_input, 1) 49 | 50 | y = pm.Normal('y', mu=mean, sd=s, observed=model_output) 51 | 52 | return model 53 | 54 | def fit( 55 | self, 56 | X, 57 | y, 58 | inference_type='advi', 59 | num_advi_sample_draws=10000, 60 | minibatch_size=None, 61 | inference_args=None, 62 | ): 63 | """ 64 | Train the Linear Regression model 65 | 66 | Parameters 67 | ---------- 68 | X : numpy array 69 | shape [num_training_samples, num_pred] 70 | 71 | y : numpy array 72 | shape [num_training_samples, ] 73 | 74 | inference_type : str (defaults to 'advi') 75 | specifies which inference method to call 76 | Currently, only 'advi' and 'nuts' are supported. 
77 | 78 | num_advi_sample_draws : int (defaults to 10000) 79 | Number of samples to draw from ADVI approximation after it has been fit; 80 | not used if inference_type != 'advi' 81 | 82 | minibatch_size : int (defaults to None) 83 | number of samples to include in each minibatch for ADVI 84 | If None, minibatch is not run. 85 | 86 | inference_args : dict (defaults to None) 87 | arguments to be passed to the inference methods. 88 | Check the PyMC3 docs for permissible values. 89 | If None, default values will be set. 90 | """ 91 | self.num_training_samples, self.num_pred = X.shape 92 | 93 | self.inference_type = inference_type 94 | 95 | if y.ndim != 1: 96 | y = np.squeeze(y) 97 | 98 | if not inference_args: 99 | inference_args = self._set_default_inference_args() 100 | 101 | if self.cached_model is None: 102 | self.cached_model = self.create_model() 103 | 104 | if minibatch_size: 105 | with self.cached_model: 106 | minibatches = { 107 | self.shared_vars['model_input']: pm.Minibatch(X, batch_size=minibatch_size), 108 | self.shared_vars['model_output']: pm.Minibatch(y, batch_size=minibatch_size), 109 | } 110 | 111 | inference_args['more_replacements'] = minibatches 112 | else: 113 | self._set_shared_vars({'model_input': X, 'model_output': y}) 114 | 115 | self._inference(inference_type, inference_args, num_advi_sample_draws=num_advi_sample_draws) 116 | 117 | return self 118 | 119 | def predict(self, X, return_std=False, num_ppc_samples=2000): 120 | """ 121 | Predicts values of new data with a trained Linear Regression model 122 | 123 | Parameters 124 | ---------- 125 | X : numpy array 126 | shape [num_training_samples, num_pred] 127 | 128 | return_std : bool (defaults to False) 129 | flag of whether to return standard deviations with mean values 130 | 131 | num_ppc_samples : int (defaults to 2000) 132 | 'samples' parameter passed to pm.sample_ppc 133 | """ 134 | 135 | if self.trace is None: 136 | raise PyMC3ModelsError('Run fit on the model before predict.') 137 | 138 | num_samples = X.shape[0] 139 | 140 | if self.cached_model is None: 141 | self.cached_model = self.create_model() 142 | 143 | self._set_shared_vars({'model_input': X, 'model_output': np.zeros(num_samples)}) 144 | 145 | ppc = pm.sample_ppc(self.trace, model=self.cached_model, samples=num_ppc_samples) 146 | 147 | if return_std: 148 | return ppc['y'].mean(axis=0), ppc['y'].std(axis=0) 149 | else: 150 | return ppc['y'].mean(axis=0) 151 | 152 | def score(self, X, y, num_ppc_samples=2000): 153 | """ 154 | Scores new data with a trained model using sklearn's r2_score.
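For example (a sketch; the arrays are assumed to come from a train/test split):

>>> LR = LinearRegression()
>>> LR.fit(X_train, y_train)
>>> y_mean, y_std = LR.predict(X_test, return_std=True)
>>> LR.score(X_test, y_test)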
155 | 156 | Parameters 157 | ---------- 158 | X : numpy array 159 | shape [num_training_samples, num_pred] 160 | 161 | y : numpy array 162 | shape [num_training_samples, ] 163 | 164 | num_ppc_samples : int (defaults to 2000) 165 | 'samples' parameter passed to pm.sample_ppc 166 | """ 167 | 168 | return r2_score(y, self.predict(X, num_ppc_samples=num_ppc_samples)) 169 | 170 | def save(self, file_prefix): 171 | params = { 172 | 'inference_type': self.inference_type, 173 | 'num_pred': self.num_pred, 174 | 'num_training_samples': self.num_training_samples 175 | } 176 | 177 | super(LinearRegression, self).save(file_prefix, params) 178 | 179 | def load(self, file_prefix): 180 | params = super(LinearRegression, self).load(file_prefix, load_custom_params=True) 181 | 182 | self.inference_type = params['inference_type'] 183 | self.num_pred = params['num_pred'] 184 | self.num_training_samples = params['num_training_samples'] 185 | -------------------------------------------------------------------------------- /pymc3_models/models/LogisticRegression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pymc3 as pm 3 | from sklearn.metrics import accuracy_score 4 | import theano 5 | import theano.tensor as T 6 | 7 | from pymc3_models.exc import PyMC3ModelsError 8 | from pymc3_models.models import BayesianModel 9 | 10 | 11 | class LogisticRegression(BayesianModel): 12 | """ 13 | Logistic Regression built using PyMC3. 14 | """ 15 | 16 | def __init__(self): 17 | super(LogisticRegression, self).__init__() 18 | 19 | def create_model(self): 20 | """ 21 | Creates and returns the PyMC3 model. 22 | 23 | Note: The size of the shared variables must match the size of the training data. 24 | Otherwise, setting the shared variables later will raise an error. 25 | See http://docs.pymc.io/advanced_theano.html 26 | 27 | Returns 28 | ------- 29 | the PyMC3 model 30 | """ 31 | model_input = theano.shared(np.zeros([self.num_training_samples, self.num_pred])) 32 | 33 | model_output = theano.shared(np.zeros(self.num_training_samples, dtype='int')) 34 | 35 | self.shared_vars = { 36 | 'model_input': model_input, 37 | 'model_output': model_output, 38 | } 39 | 40 | model = pm.Model() 41 | 42 | with model: 43 | alpha = pm.Normal('alpha', mu=0, sd=100, shape=(1)) 44 | betas = pm.Normal('betas', mu=0, sd=100, shape=(1, self.num_pred)) 45 | 46 | temp = alpha + T.sum(betas * model_input, 1) 47 | 48 | p = pm.invlogit(temp) 49 | 50 | o = pm.Bernoulli('o', p, observed=model_output) 51 | 52 | return model 53 | 54 | def fit( 55 | self, 56 | X, 57 | y, 58 | inference_type='advi', 59 | num_advi_sample_draws=10000, 60 | minibatch_size=None, 61 | inference_args=None 62 | ): 63 | """ 64 | Train the Logistic Regression model 65 | 66 | Parameters 67 | ---------- 68 | X : numpy array 69 | shape [num_training_samples, num_pred] 70 | 71 | y : numpy array 72 | shape [num_training_samples, ] 73 | 74 | inference_type : str (defaults to 'advi') 75 | specifies which inference method to call 76 | Currently, only 'advi' and 'nuts' are supported. 77 | 78 | num_advi_sample_draws : int (defaults to 10000) 79 | Number of samples to draw from ADVI approximation after it has been fit; 80 | not used if inference_type != 'advi' 81 | 82 | minibatch_size : int (defaults to None) 83 | number of samples to include in each minibatch for ADVI 84 | If None, minibatch is not run. 85 | 86 | inference_args : dict (defaults to None) 87 | arguments to be passed to the inference methods. 
88 | Check the PyMC3 docs for permissible values. 89 | If None, default values will be set. 90 | """ 91 | self.num_training_samples, self.num_pred = X.shape 92 | 93 | self.inference_type = inference_type 94 | 95 | if y.ndim != 1: 96 | y = np.squeeze(y) 97 | 98 | if not inference_args: 99 | inference_args = self._set_default_inference_args() 100 | 101 | if self.cached_model is None: 102 | self.cached_model = self.create_model() 103 | 104 | if minibatch_size: 105 | with self.cached_model: 106 | minibatches = { 107 | self.shared_vars['model_input']: pm.Minibatch(X, batch_size=minibatch_size), 108 | self.shared_vars['model_output']: pm.Minibatch(y, batch_size=minibatch_size), 109 | } 110 | 111 | inference_args['more_replacements'] = minibatches 112 | else: 113 | self._set_shared_vars({'model_input': X, 'model_output': y}) 114 | 115 | self._inference(inference_type, inference_args, num_advi_sample_draws=num_advi_sample_draws) 116 | 117 | return self 118 | 119 | def predict_proba(self, X, return_std=False, num_ppc_samples=2000): 120 | """ 121 | Predicts probabilities of new data with a trained Logistic Regression 122 | 123 | Parameters 124 | ---------- 125 | X : numpy array 126 | shape [num_training_samples, num_pred] 127 | 128 | return_std : bool (defaults to False) 129 | Flag of whether to return standard deviations with mean probabilities 130 | 131 | num_ppc_samples : int (defaults to 2000) 132 | 'samples' parameter passed to pm.sample_ppc 133 | """ 134 | 135 | if self.trace is None: 136 | raise PyMC3ModelsError('Run fit on the model before predict.') 137 | 138 | num_samples = X.shape[0] 139 | 140 | if self.cached_model is None: 141 | self.cached_model = self.create_model() 142 | 143 | self._set_shared_vars({ 144 | 'model_input': X, 145 | 'model_output': np.zeros(num_samples, dtype='int') 146 | }) 147 | 148 | ppc = pm.sample_ppc(self.trace, model=self.cached_model, samples=num_ppc_samples) 149 | 150 | if return_std: 151 | return ppc['o'].mean(axis=0), ppc['o'].std(axis=0) 152 | else: 153 | return ppc['o'].mean(axis=0) 154 | 155 | def predict(self, X, num_ppc_samples=2000): 156 | """ 157 | Predicts labels of new data with a trained model 158 | 159 | Parameters 160 | ---------- 161 | X : numpy array 162 | shape [num_training_samples, num_pred] 163 | 164 | num_ppc_samples : int (defaults to 2000) 165 | 'samples' parameter passed to pm.sample_ppc 166 | """ 167 | ppc_mean = self.predict_proba(X, num_ppc_samples=num_ppc_samples) 168 | 169 | pred = ppc_mean > 0.5 170 | 171 | return pred 172 | 173 | def score(self, X, y, num_ppc_samples=2000): 174 | """ 175 | Scores new data with a trained model using sklearn's accuracy_score.
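For example (a sketch; arrays are assumed pre-split and the NUTS arguments shown are illustrative):

>>> model = LogisticRegression()
>>> model.fit(X_train, y_train, inference_type='nuts', inference_args={'draws': 2000})
>>> model.score(X_test, y_test)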
176 | 177 | Parameters 178 | ---------- 179 | X : numpy array 180 | shape [num_training_samples, num_pred] 181 | 182 | y : numpy array 183 | shape [num_training_samples, ] 184 | 185 | num_ppc_samples : int (defaults to 2000) 186 | 'samples' parameter passed to pm.sample_ppc 187 | """ 188 | 189 | return accuracy_score(y, self.predict(X, num_ppc_samples=num_ppc_samples)) 190 | 191 | def save(self, file_prefix): 192 | params = { 193 | 'inference_type': self.inference_type, 194 | 'num_pred': self.num_pred, 195 | 'num_training_samples': self.num_training_samples 196 | } 197 | 198 | super(LogisticRegression, self).save(file_prefix, params) 199 | 200 | def load(self, file_prefix): 201 | params = super(LogisticRegression, self).load(file_prefix, load_custom_params=True) 202 | 203 | self.inference_type = params['inference_type'] 204 | self.num_pred = params['num_pred'] 205 | self.num_training_samples = params['num_training_samples'] 206 | -------------------------------------------------------------------------------- /pymc3_models/models/NaiveBayes.py: -------------------------------------------------------------------------------- 1 | import functools as ft 2 | 3 | import numpy as np 4 | import pymc3 as pm 5 | import scipy.stats 6 | from sklearn.metrics import accuracy_score 7 | import theano 8 | 9 | from pymc3_models.exc import PyMC3ModelsError 10 | from pymc3_models.models import BayesianModel 11 | from pymc3_models.utils import normalize 12 | 13 | 14 | class GaussianNaiveBayes(BayesianModel): 15 | """ 16 | Naive Bayes classification built using PyMC3. 17 | 18 | The Gaussian Naive Bayes algorithm assumes that the random variables 19 | that describe each class and each feature are independent and distributed 20 | according to Normal distributions. 21 | 22 | Example 23 | ------- 24 | >>> import pymc3_models as pmo 25 | >>> 26 | >>> model = pmo.GaussianNaiveBayes() 27 | >>> model.fit(X,y) 28 | >>> model.predict_proba(X) 29 | >>> model.predict(X) 30 | 31 | See the documentation of the `create_model` method for details on the model 32 | itself. 33 | """ 34 | 35 | def __init__(self): 36 | super(GaussianNaiveBayes, self).__init__() 37 | 38 | def create_model(self): 39 | """ 40 | Creates and returns the PyMC3 model. 41 | 42 | We denote by :math:`x_{jc}` the value of the j-th element of the data vector :math:`x` 43 | conditioned on x belonging to the class :math:`c`. The Gaussian Naive Bayes 44 | algorithm models :math:`x_{jc}` as: 45 | 46 | .. math:: 47 | 48 | x_{jc} \\sim Normal(\\mu_{jc}, \\sigma_{jc}) 49 | 50 | The probability that :math:`x` belongs to the class :math:`c` is given by the 51 | categorical distribution: 52 | 53 | .. math:: 54 | 55 | P(y=c|x_i) = Cat(\\pi_1, \\dots, \\pi_C) 56 | 57 | where :math:`\\pi_i` is the probability that a vector belongs to category :math:`i`. 58 | 59 | We assume that the :math:`\\pi_i` follow a Dirichlet distribution: 60 | 61 | .. math:: 62 | 63 | \\pi \\sim Dirichlet(\\alpha) 64 | 65 | with hyperparameter :math:`\\alpha = [1, .., 1]`. The :math:`\\mu_{jc}` 66 | are sampled from a Normal distribution centred on :math:`0` with 67 | standard deviation :math:`100`, and the :math:`\\sigma_{jc}` are sampled from a 68 | HalfNormal distribution with standard deviation :math:`100`: 69 | 70 | .. math:: 71 | 72 | \\mu_{jc} \\sim Normal(0, 100) 73 | 74 | \\sigma_{jc} \\sim HalfNormal(100) 75 | 76 | Note that the Gaussian Naive Bayes model is equivalent to a Gaussian 77 | mixture with a diagonal covariance [1].
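As a plain-NumPy sketch of the generative process above (illustrative only; num_cats, num_pred and num_samples are assumed sizes):

>>> import numpy as np
>>> pi = np.random.dirichlet(np.ones(num_cats))
>>> mu = np.random.normal(0, 100, size=(num_cats, num_pred))
>>> sigma = np.abs(np.random.normal(0, 100, size=(num_cats, num_pred)))  # HalfNormal via |Normal|
>>> z = np.random.choice(num_cats, size=num_samples, p=pi)  # class of each point
>>> X = np.random.normal(mu[z], sigma[z])  # class-conditional features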
78 | 79 | Returns 80 | ------- 81 | A PyMC3 model 82 | 83 | References 84 | ---------- 85 | .. [1] Murphy, K. P. (2012). Machine learning: a probabilistic perspective. 86 | """ 87 | 88 | # The data 89 | X = theano.shared(np.zeros((self.num_training_samples, self.num_pred))) 90 | y = theano.shared(np.zeros(self.num_training_samples, dtype=int)) 91 | 92 | self.shared_vars = { 93 | 'model_input': X, 94 | 'model_output': y 95 | } 96 | 97 | model = pm.Model() 98 | with model: 99 | # Priors 100 | alpha = np.ones(self.num_cats) 101 | pi = pm.Dirichlet('pi', alpha, shape=self.num_cats) 102 | mu = pm.Normal('mu', mu=0, sd=100, shape=(self.num_cats, self.num_pred)) 103 | sigma = pm.HalfNormal('sigma', 100, shape=(self.num_cats, self.num_pred)) 104 | 105 | # Assign classes to data points 106 | z = pm.Categorical('z', pi, shape=self.num_training_samples, observed=y) 107 | 108 | # The components are independent and normally distributed 109 | xi = pm.Normal('xi', mu=mu[z], sd=sigma[z], observed=X) 110 | 111 | return model 112 | 113 | def fit( 114 | self, 115 | X, 116 | y, 117 | inference_type='advi', 118 | num_advi_sample_draws=10000, 119 | minibatch_size=None, 120 | inference_args=None 121 | ): 122 | """ 123 | Train the Naive Bayes model. 124 | 125 | Parameters 126 | ---------- 127 | X : numpy array 128 | shape [num_training_samples, num_pred]. Contains the data points 129 | 130 | y : numpy array 131 | shape [num_training_samples,]. Contains the category of the data points 132 | 133 | inference_type : str (defaults to 'advi') 134 | specifies which inference method to call 135 | Currently, only 'advi' and 'nuts' are supported. 136 | 137 | num_advi_sample_draws : int (defaults to 10000) 138 | Number of samples to draw from ADVI approximation after it has been fit; 139 | not used if inference_type != 'advi' 140 | 141 | minibatch_size : int (defaults to None) 142 | number of samples to include in each minibatch for ADVI 143 | If None, minibatch is not run. 144 | 145 | inference_args : dict (defaults to None) 146 | arguments to be passed to the inference methods 147 | Check the PyMC3 docs for permissible values. 148 | If None, default values will be set. 149 | 150 | Returns 151 | ------- 152 | The current instance of the GaussianNaiveBayes class. 153 | """ 154 | self.num_training_samples, self.num_pred = X.shape 155 | self.num_cats = len(np.unique(y)) 156 | self.inference_type = inference_type 157 | 158 | if not inference_args: 159 | inference_args = self._set_default_inference_args() 160 | 161 | if not self.cached_model: 162 | self.cached_model = self.create_model() 163 | 164 | if minibatch_size: 165 | with self.cached_model: 166 | minibatches = { 167 | self.shared_vars['model_input']: pm.Minibatch(X, batch_size=minibatch_size), 168 | self.shared_vars['model_output']: pm.Minibatch(y, batch_size=minibatch_size), 169 | } 170 | 171 | inference_args['more_replacements'] = minibatches 172 | else: 173 | self._set_shared_vars({'model_input': X, 'model_output': y}) 174 | 175 | self._inference(inference_type, inference_args, num_advi_sample_draws=num_advi_sample_draws) 176 | 177 | return self 178 | 179 | def predict_proba(self, X): 180 | """ 181 | Predicts the probabilities that the data points belong to each category. 182 | 183 | Given a new data point :math:`\\vec{x}`, we want to estimate the probability that 184 | it belongs to a category :math:`c`. Following the notation in [1], the probability 185 | reads: 186 | 187 | ..
math:: 188 | 189 | P(y=c|\\vec{x}, \\mathcal{D}) = P(y=c|\\mathcal{D}) \\prod_{j=1}^{n_{dims}} \\ 190 | P(x_j|y=c, \\mathcal{D}) 191 | 192 | We previously used the data :math:`\\mathcal{D}` to estimate the 193 | distribution of the parameters :math:`\\vec{\\mu}`, :math:`\\vec{\\pi}` 194 | and :math:`\\vec{\\sigma}`. To compute the above probability, we need 195 | to integrate over the values of these parameters: 196 | 197 | .. math:: 198 | 199 | P(y=c|\\vec{x}, \\mathcal{D}) = \\left[\\int Cat(y=c|\\vec{\\pi})P(\\vec{\\pi}|\\ 200 | \\mathcal{D})\\mathrm{d}\\vec{\\pi}\\right] 201 | \\int P(\\vec{x}|\\vec{\\mu}, \\vec{\\sigma})\\ 202 | P(\\vec{\\mu}|\\mathcal{D})\\ 203 | P(\\vec{\\sigma}|\\mathcal{D})\\ 204 | \\mathrm{d}\\vec{\\mu}\\mathrm{d}\\vec{\\sigma} 205 | 206 | Parameters 207 | ---------- 208 | X : numpy array 209 | shape [num_training_samples, num_pred]. Contains the points 210 | for which we want to predict the class 211 | 212 | Returns 213 | ------- 214 | A numpy array of shape [num_training_samples, num_cats] that contains the probabilities 215 | that each sample belongs to each category. 216 | 217 | References 218 | ---------- 219 | .. [1] Murphy, K. P. (2012). Machine learning: a probabilistic perspective. 220 | """ 221 | 222 | if self.trace is None: 223 | raise PyMC3ModelsError('Run fit on the model before predict') 224 | 225 | posterior_prediction = np.array([]) 226 | for x in X: 227 | prob_per_sample = scipy.stats.norm(self.trace['mu'], self.trace['sigma']).pdf(x) 228 | prob_per_feature = [ 229 | np.sum(prob_per_sample[:, :, i], axis=0)/len(self.trace['mu']) 230 | for i in range(self.num_pred) 231 | ] 232 | prob_per_class = normalize(ft.reduce(lambda x, y: x*y, prob_per_feature)) 233 | if len(posterior_prediction) == 0: 234 | posterior_prediction = prob_per_class 235 | else: 236 | posterior_prediction = np.vstack((posterior_prediction, prob_per_class)) 237 | 238 | return posterior_prediction 239 | 240 | def predict(self, X): 241 | """ 242 | Classify new data with a trained Naive Bayes model. The output is the point 243 | estimate of the posterior predictive distribution that corresponds to the 244 | zero-one loss function. 245 | 246 | Parameters 247 | ---------- 248 | X : numpy array 249 | shape [num_training_samples, num_pred]. Contains the data 250 | to classify 251 | 252 | Returns 253 | ------- 254 | A numpy array of shape [num_training_samples,] that contains the predicted class to 255 | which the data points belong. 256 | """ 257 | proba = self.predict_proba(X) 258 | predictions = np.argmax(proba, axis=1) 259 | return predictions 260 | 261 | def score(self, X, y): 262 | """ 263 | Scores new data with a trained model using sklearn's accuracy_score. 264 | 265 | Parameters 266 | ---------- 267 | X : numpy array 268 | shape [num_training_samples, num_pred]. Contains the data points 269 | 270 | y : numpy array 271 | shape [num_training_samples,]. Contains the category of the data points 272 | 273 | Returns 274 | ------- 275 | A float representing the accuracy score of the predictions.
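Example (a sketch; X_train, y_train, X_test, y_test are assumed pre-split arrays):

>>> model = GaussianNaiveBayes()
>>> model.fit(X_train, y_train)
>>> model.score(X_test, y_test)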
276 | """ 277 | 278 | return accuracy_score(y, self.predict(X)) 279 | 280 | def save(self, file_prefix): 281 | params = { 282 | 'inference_type': self.inference_type, 283 | 'num_cats': self.num_cats, 284 | 'num_pred': self.num_pred, 285 | 'num_training_samples': self.num_training_samples 286 | } 287 | super(GaussianNaiveBayes, self).save(file_prefix, params) 288 | 289 | def load(self, file_prefix): 290 | params = super(GaussianNaiveBayes, self).load(file_prefix, load_custom_params=True) 291 | 292 | self.inference_type = params['inference_type'] 293 | self.num_cats = params['num_cats'] 294 | self.num_pred = params['num_pred'] 295 | self.num_training_samples = params['num_training_samples'] 296 | -------------------------------------------------------------------------------- /pymc3_models/models/__init__.py: -------------------------------------------------------------------------------- 1 | import joblib 2 | import matplotlib.pyplot as plt 3 | import pymc3 as pm 4 | import seaborn as sns 5 | from sklearn.base import BaseEstimator 6 | 7 | from pymc3_models.exc import PyMC3ModelsError 8 | 9 | 10 | class BayesianModel(BaseEstimator): 11 | """ 12 | Bayesian model base class 13 | """ 14 | def __init__(self): 15 | self.cached_model = None 16 | self.inference_type = None 17 | self.num_pred = None 18 | self.shared_vars = None 19 | self.summary = None 20 | self.trace = None 21 | 22 | def create_model(self): 23 | raise NotImplementedError 24 | 25 | def _set_shared_vars(self, shared_vars): 26 | """ 27 | Sets theano shared variables for the PyMC3 model. 28 | """ 29 | for key in shared_vars.keys(): 30 | self.shared_vars[key].set_value(shared_vars[key]) 31 | 32 | def _inference(self, inference_type='advi', inference_args=None, num_advi_sample_draws=10000): 33 | """ 34 | Calls internal methods for two types of inferences. 35 | Raises an error if the inference_type is not supported. 36 | 37 | Parameters 38 | ---------- 39 | inference_type : str (defaults to 'advi') 40 | specifies which inference method to call 41 | Currently, only 'advi' and 'nuts' are supported. 42 | 43 | inference_args : dict (defaults to None) 44 | arguments to be passed to the inference methods 45 | Check the PyMC3 docs to see what is permitted. 46 | 47 | num_advi_sample_draws : int (defaults to 10000) 48 | Number of samples to draw from ADVI approximation after it has been fit; 49 | not used if inference_type != 'advi' 50 | """ 51 | if inference_type == 'advi': 52 | self._advi_inference(inference_args, num_advi_sample_draws=num_advi_sample_draws) 53 | elif inference_type == 'nuts': 54 | self._nuts_inference(inference_args) 55 | else: 56 | raise PyMC3ModelsError('{} is not a supported type of inference'.format(inference_type)) 57 | 58 | def _advi_inference(self, inference_args, num_advi_sample_draws): 59 | """ 60 | Runs variational ADVI and then samples from those results. 61 | 62 | Parameters 63 | ---------- 64 | inference_args : dict 65 | arguments to be passed to the PyMC3 fit method 66 | See PyMC3 doc for permissible values. 67 | 68 | num_advi_sample_draws : int 69 | Number of samples to draw from ADVI approximation after it has been fit 70 | """ 71 | with self.cached_model: 72 | inference = pm.ADVI() 73 | approx = pm.fit(method=inference, **inference_args) 74 | 75 | self.approx = approx 76 | self.trace = approx.sample(draws=num_advi_sample_draws) 77 | self.summary = pm.summary(self.trace) 78 | self.advi_hist = inference.hist 79 | 80 | def _nuts_inference(self, inference_args): 81 | """ 82 | Runs NUTS inference.
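For example, inference_args={'draws': 2000, 'tune': 1000} (illustrative values) is passed straight through to pm.sample below.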
83 | 84 | Parameters 85 | ---------- 86 | inference_args : dict 87 | arguments to be passed to the PyMC3 sample method 88 | See PyMC3 doc for permissible values. 89 | """ 90 | with self.cached_model: 91 | step = pm.NUTS() 92 | nuts_trace = pm.sample(step=step, **inference_args) 93 | 94 | self.trace = nuts_trace 95 | self.summary = pm.summary(self.trace) 96 | 97 | def _set_default_inference_args(self): 98 | """ 99 | Set default values for inference arguments if none are provided, dependent on inference type. 100 | 101 | ADVI Default Parameters 102 | ----------------------- 103 | callbacks : list 104 | contains a parameter stopping check. 105 | 106 | n : int (defaults to 200000) 107 | number of iterations for ADVI fit 108 | 109 | NUTS Default Parameters 110 | ----------------------- 111 | draws : int (defaults to 2000) 112 | number of samples to draw 113 | """ 114 | if self.inference_type == 'advi': 115 | inference_args = { 116 | 'n': 200000, 117 | 'callbacks': [pm.callbacks.CheckParametersConvergence()] 118 | } 119 | elif self.inference_type == 'nuts': 120 | inference_args = { 121 | 'draws': 2000 122 | } 123 | else: 124 | inference_args = None 125 | 126 | return inference_args 127 | 128 | def fit(self): 129 | raise NotImplementedError 130 | 131 | def predict(self): 132 | raise NotImplementedError 133 | 134 | def score(self): 135 | raise NotImplementedError 136 | 137 | def save(self, file_prefix, custom_params=None): 138 | """ 139 | Saves the trace and custom params to files with the given file_prefix. 140 | 141 | Parameters 142 | ---------- 143 | file_prefix : str 144 | path and prefix used to identify where to save the trace for this model, 145 | e.g. given file_prefix = 'path/to/file/' 146 | This will attempt to save to 'path/to/file/trace.pickle'. 147 | 148 | custom_params : dict (defaults to None) 149 | Custom parameters to save 150 | """ 151 | fileObject = open(file_prefix + 'trace.pickle', 'wb') 152 | joblib.dump(self.trace, fileObject) 153 | fileObject.close() 154 | 155 | if custom_params: 156 | fileObject = open(file_prefix + 'params.pickle', 'wb') 157 | joblib.dump(custom_params, fileObject) 158 | fileObject.close() 159 | 160 | def load(self, file_prefix, load_custom_params=False): 161 | """ 162 | Loads a saved version of the trace, and custom param files with the given file_prefix. 163 | 164 | Parameters 165 | ---------- 166 | file_prefix : str 167 | path and prefix used to identify where to load the saved trace for this model, 168 | e.g. given file_prefix = 'path/to/file/' 169 | This will attempt to load 'path/to/file/trace.pickle'. 170 | 171 | load_custom_params : bool (defaults to False) 172 | flag to indicate whether custom parameters should be loaded 173 | 174 | Returns 175 | ---------- 176 | custom_params : Dictionary of custom parameters 177 | """ 178 | self.trace = joblib.load(file_prefix + 'trace.pickle') 179 | 180 | custom_params = None 181 | if load_custom_params: 182 | custom_params = joblib.load(file_prefix + 'params.pickle') 183 | 184 | return custom_params 185 | 186 | def plot_elbo(self): 187 | """ 188 | Plot the ELBO values after running ADVI minibatch. 189 | """ 190 | if self.inference_type != 'advi': 191 | raise PyMC3ModelsError( 192 | 'This method should only be called after calling fit with ADVI minibatch.' 
193 | ) 194 | 195 | sns.set_style('white') 196 | plt.plot(-self.advi_hist) 197 | plt.ylabel('ELBO') 198 | plt.xlabel('iteration') 199 | sns.despine() 200 | -------------------------------------------------------------------------------- /pymc3_models/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def normalize(array): 5 | """ 6 | Normalize values in the array to get probabilities. 7 | 8 | Parameters 9 | ---------- 10 | array : 1-dimensional numpy array 11 | 12 | Returns 13 | ------- 14 | A normalized array 15 | """ 16 | return array/np.sum(array) 17 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | flake8 2 | joblib>=0.11 3 | matplotlib>=2.1.1 4 | numpy>=1.13.1 5 | pandas>=0.21.1 6 | pymc3>=3.4.1 7 | pytest 8 | scikit-learn>=0.19.1 9 | scipy>=1.0.0 10 | seaborn>=0.8.1 11 | sphinx-autobuild>=0.7.1 12 | sphinx-rtd-theme>=0.2.4 13 | sphinx>=1.5 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | joblib>=0.11 2 | matplotlib>=2.1.1 3 | numpy>=1.13.1 4 | numpydoc>=0.7.0 5 | pandas>=0.21.1 6 | pymc3>=3.4.1 7 | scikit-learn>=0.19.1 8 | scipy>=1.0.0 9 | seaborn>=0.8.1 10 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 110 3 | application-import-names = pymc3_models 4 | import-order-style = appnexus 5 | exclude = 6 | pymc3_models/__init__.py, 7 | setup.py, 8 | ignore = 9 | # https://pep257.readthedocs.io/en/latest/error_codes.html 10 | 11 | # Missing docstrings in certain cases 12 | D100 13 | D103 14 | D104 15 | D107 16 | 17 | # 1 blank line required between summary line and description 18 | D204 19 | D205 20 | 21 | # First line should not end with a period.
22 | D400 23 | 24 | # No blank lines allowed after function docstring 25 | D202 26 | 27 | # Missing blank line after last section 28 | D413 29 | 30 | # Line break before and after binary operator 31 | W503 32 | W504 33 | 34 | # Missing whitespace around arithmetic operator 35 | E226 36 | 37 | # First line should be in imperative mood 38 | D401 39 | 40 | # Local variable is assigned but never used 41 | F841 42 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open('AUTHORS.txt') as a: 4 | # reST-ify the authors list 5 | authors = '' 6 | for author in a.read().split('\n'): 7 | authors += '| '+author+'\n' 8 | 9 | with open('pymc3_models/_version.py') as version_file: 10 | exec(version_file.read()) 11 | 12 | with open('README.md') as r: 13 | readme = r.read() 14 | 15 | 16 | setup( 17 | name='pymc3_models', 18 | version=__version__, 19 | description='Custom PyMC3 models built on top of the scikit-learn API', 20 | long_description=readme, 21 | long_description_content_type='text/markdown', 22 | author='Nicole Carlson', 23 | author_email='nicole@parsingscience.com', 24 | url='https://github.com/parsing-science/pymc3_models', 25 | packages=find_packages(), 26 | package_data={'docs': ['*']}, 27 | include_package_data=True, 28 | zip_safe=False, 29 | install_requires=[ 30 | 'joblib', 31 | 'matplotlib', 32 | 'numpy', 33 | 'pandas>=0.19', 34 | 'pymc3>=3.3', 35 | 'scipy', 36 | 'seaborn', 37 | 'scikit-learn' 38 | ], 39 | classifiers=[ 40 | 'License :: OSI Approved :: Apache Software License', 41 | 'Programming Language :: Python :: 2.7', 42 | 'Programming Language :: Python :: 3.4' 43 | ] 44 | ) 45 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parsing-science/pymc3_models/1b8cca86a2ce05dfec3df81ea57d17c7defb51ed/tests/__init__.py -------------------------------------------------------------------------------- /tests/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parsing-science/pymc3_models/1b8cca86a2ce05dfec3df81ea57d17c7defb51ed/tests/models/__init__.py -------------------------------------------------------------------------------- /tests/models/test_BayesianModel.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from pymc3_models.models import BayesianModel 4 | 5 | 6 | class BayesianModelTestCase(unittest.TestCase): 7 | def test_create_model_raises_not_implemented_error(self): 8 | with self.assertRaises(NotImplementedError): 9 | BM = BayesianModel() 10 | BM.create_model() 11 | 12 | def test_fit_raises_not_implemented_error(self): 13 | with self.assertRaises(NotImplementedError): 14 | BM = BayesianModel() 15 | BM.fit() 16 | 17 | def test_predict_raises_not_implemented_error(self): 18 | with self.assertRaises(NotImplementedError): 19 | BM = BayesianModel() 20 | BM.predict() 21 | 22 | def test_score_raises_not_implemented_error(self): 23 | with self.assertRaises(NotImplementedError): 24 | BM = BayesianModel() 25 | BM.score() 26 | -------------------------------------------------------------------------------- /tests/models/test_HierarchicalLogisticRegression.py:
-------------------------------------------------------------------------------- 1 | import shutil 2 | import tempfile 3 | import unittest 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import pymc3 as pm 8 | from pymc3 import summary 9 | from sklearn.model_selection import train_test_split 10 | 11 | from pymc3_models.exc import PyMC3ModelsError 12 | from pymc3_models import HierarchicalLogisticRegression 13 | 14 | 15 | class HierarchicalLogisticRegressionTestCase(unittest.TestCase): 16 | def setUp(self): 17 | def numpy_invlogit(x): 18 | return 1 / (1 + np.exp(-x)) 19 | 20 | self.num_cats = 3 21 | self.num_pred = 1 22 | self.num_samples_per_cat = 100000 23 | 24 | # Set random seed for repeatability 25 | np.random.seed(27) 26 | 27 | self.alphas = np.random.randn(self.num_cats) 28 | self.betas = np.random.randn(self.num_cats, self.num_pred) 29 | # TODO: make this more efficient; right now, it's very explicit. 30 | x_a = np.random.randn(self.num_samples_per_cat, self.num_pred) 31 | y_a = np.random.binomial(1, numpy_invlogit(self.alphas[0] + np.sum(self.betas[0] * x_a, 1))) 32 | x_b = np.random.randn(self.num_samples_per_cat, self.num_pred) 33 | y_b = np.random.binomial(1, numpy_invlogit(self.alphas[1] + np.sum(self.betas[1] * x_b, 1))) 34 | x_c = np.random.randn(self.num_samples_per_cat, self.num_pred) 35 | y_c = np.random.binomial(1, numpy_invlogit(self.alphas[2] + np.sum(self.betas[2] * x_c, 1))) 36 | 37 | X = np.concatenate([x_a, x_b, x_c]) 38 | Y = np.concatenate([y_a, y_b, y_c]) 39 | cats = np.concatenate([ 40 | np.zeros(self.num_samples_per_cat, dtype=np.int), 41 | np.ones(self.num_samples_per_cat, dtype=np.int), 42 | 2*np.ones(self.num_samples_per_cat, dtype=np.int) 43 | ]) 44 | 45 | output = train_test_split(X, cats, Y, test_size=0.4) 46 | 47 | self.X_train, self.X_test, self.cat_train, self.cat_test, self.Y_train, self.Y_test = output 48 | 49 | self.test_HLR = HierarchicalLogisticRegression() 50 | # Fit the model once 51 | inference_args = { 52 | 'n': 60000, 53 | 'callbacks': [pm.callbacks.CheckParametersConvergence()] 54 | } 55 | # Note: print is here so PyMC3 output won't overwrite the test name 56 | print('') 57 | self.test_HLR.fit( 58 | self.X_train, 59 | self.Y_train, 60 | self.cat_train, 61 | num_advi_sample_draws=5000, 62 | minibatch_size=2000, 63 | inference_args=inference_args 64 | ) 65 | 66 | self.test_dir = tempfile.mkdtemp() 67 | 68 | def tearDown(self): 69 | shutil.rmtree(self.test_dir) 70 | 71 | 72 | class HierarchicalLogisticRegressionFitTestCase(HierarchicalLogisticRegressionTestCase): 73 | def test_fit_returns_correct_model(self): 74 | self.assertEqual(self.num_cats, self.test_HLR.num_cats) 75 | self.assertEqual(self.num_pred, self.test_HLR.num_pred) 76 | 77 | # TODO: Figure out best way to test 78 | # np.testing.assert_almost_equal(self.alphas, self.test_HLR.trace['alphas'].mean(), decimal=1) 79 | # np.testing.assert_almost_equal(self.betas, self.test_HLR.trace['betas'].mean(), decimal=1) 80 | 81 | # For now, just check that the estimated parameters have the correct signs 82 | np.testing.assert_equal( 83 | np.sign(self.alphas), 84 | np.sign(self.test_HLR.trace['alpha'].mean(axis=0)) 85 | ) 86 | np.testing.assert_equal( 87 | np.sign(self.betas), 88 | np.sign(self.test_HLR.trace['beta'].mean(axis=0)) 89 | ) 90 | 91 | 92 | class HierarchicalLogisticRegressionPredictProbaTestCase(HierarchicalLogisticRegressionTestCase): 93 | def test_predict_proba_returns_probabilities(self): 94 | probs = self.test_HLR.predict_proba(self.X_test, self.cat_test) 95 | 
self.assertEqual(probs.shape, self.Y_test.shape) 96 | 97 | def test_predict_proba_returns_probabilities_and_std(self): 98 | probs, stds = self.test_HLR.predict_proba(self.X_test, self.cat_test, return_std=True) 99 | self.assertEqual(probs.shape, self.Y_test.shape) 100 | self.assertEqual(stds.shape, self.Y_test.shape) 101 | 102 | def test_predict_proba_raises_error_if_not_fit(self): 103 | with self.assertRaises(PyMC3ModelsError) as no_fit_error: 104 | test_HLR = HierarchicalLogisticRegression() 105 | test_HLR.predict_proba(self.X_train, self.cat_train) 106 | 107 | expected = 'Run fit on the model before predict.' 108 | self.assertEqual(str(no_fit_error.exception), expected) 109 | 110 | 111 | class HierarchicalLogisticRegressionPredictTestCase(HierarchicalLogisticRegressionTestCase): 112 | def test_predict_returns_predictions(self): 113 | preds = self.test_HLR.predict(self.X_test, self.cat_test) 114 | self.assertEqual(preds.shape, self.Y_test.shape) 115 | 116 | 117 | class HierarchicalLogisticRegressionScoreTestCase(HierarchicalLogisticRegressionTestCase): 118 | def test_score_scores(self): 119 | score = self.test_HLR.score(self.X_test, self.Y_test, self.cat_test) 120 | naive_score = np.mean(self.Y_test) 121 | self.assertGreaterEqual(score, naive_score) 122 | 123 | 124 | class HierarchicalLogisticRegressionSaveandLoadTestCase(HierarchicalLogisticRegressionTestCase): 125 | def test_save_and_load_work_correctly(self): 126 | probs1 = self.test_HLR.predict_proba(self.X_test, self.cat_test) 127 | self.test_HLR.save(self.test_dir) 128 | 129 | HLR2 = HierarchicalLogisticRegression() 130 | 131 | HLR2.load(self.test_dir) 132 | 133 | self.assertEqual(self.test_HLR.num_cats, HLR2.num_cats) 134 | self.assertEqual(self.test_HLR.num_pred, HLR2.num_pred) 135 | self.assertEqual(self.test_HLR.num_training_samples, HLR2.num_training_samples) 136 | pd.testing.assert_frame_equal(summary(self.test_HLR.trace), summary(HLR2.trace)) 137 | 138 | probs2 = HLR2.predict_proba(self.X_test, self.cat_test) 139 | 140 | np.testing.assert_almost_equal(probs2, probs1, decimal=1) 141 | -------------------------------------------------------------------------------- /tests/models/test_LinearRegression.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import tempfile 3 | import unittest 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from pymc3 import summary 8 | from sklearn.linear_model import LinearRegression as skLinearRegression 9 | from sklearn.model_selection import train_test_split 10 | 11 | from pymc3_models.exc import PyMC3ModelsError 12 | from pymc3_models import LinearRegression 13 | 14 | 15 | class LinearRegressionTestCase(unittest.TestCase): 16 | def setUp(self): 17 | self.num_pred = 1 18 | self.alpha = 3 19 | self.betas = 4 20 | self.s = 2 21 | 22 | # Set random seed for repeatability 23 | np.random.seed(27) 24 | 25 | X = np.random.randn(1000, 1) 26 | noise = self.s * np.random.randn(1000, 1) 27 | Y = self.betas * X + self.alpha + noise 28 | Y = np.squeeze(Y) 29 | 30 | self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split( 31 | X, Y, test_size=0.4 32 | ) 33 | 34 | self.test_LR = LinearRegression() 35 | # Fit the model with ADVI once 36 | self.test_LR.fit(self.X_train, self.Y_train, num_advi_sample_draws=5000, minibatch_size=2000) 37 | 38 | self.nuts_LR = LinearRegression() 39 | 40 | self.test_dir = tempfile.mkdtemp() 41 | 42 | def tearDown(self): 43 | shutil.rmtree(self.test_dir) 44 | 45 | 46 | class 
LinearRegressionFitTestCase(LinearRegressionTestCase): 47 | def test_advi_fit_returns_correct_model(self): 48 | self.assertEqual(self.num_pred, self.test_LR.num_pred) 49 | 50 | np.testing.assert_almost_equal(self.alpha, self.test_LR.summary['mean']['alpha__0'], decimal=1) 51 | np.testing.assert_almost_equal(self.betas, self.test_LR.summary['mean']['betas__0_0'], decimal=1) 52 | np.testing.assert_almost_equal(self.s, self.test_LR.summary['mean']['s'], decimal=1) 53 | 54 | def test_nuts_fit_returns_correct_model(self): 55 | # Note: print is here so PyMC3 output won't overwrite the test name 56 | print('') 57 | self.nuts_LR.fit(self.X_train, self.Y_train, inference_type='nuts', inference_args={'draws': 2000}) 58 | 59 | self.assertEqual(self.num_pred, self.nuts_LR.num_pred) 60 | 61 | np.testing.assert_almost_equal(self.alpha, self.nuts_LR.summary['mean']['alpha__0'], decimal=1) 62 | np.testing.assert_almost_equal(self.betas, self.nuts_LR.summary['mean']['betas__0_0'], decimal=1) 63 | np.testing.assert_almost_equal(self.s, self.nuts_LR.summary['mean']['s'], decimal=1) 64 | 65 | 66 | class LinearRegressionPredictTestCase(LinearRegressionTestCase): 67 | def test_predict_returns_predictions(self): 68 | preds = self.test_LR.predict(self.X_test) 69 | self.assertEqual(preds.shape, self.Y_test.shape) 70 | 71 | def test_predict_returns_mean_predictions_and_std(self): 72 | preds, stds = self.test_LR.predict(self.X_test, return_std=True) 73 | self.assertEqual(preds.shape, self.Y_test.shape) 74 | self.assertEqual(stds.shape, self.Y_test.shape) 75 | 76 | def test_predict_raises_error_if_not_fit(self): 77 | with self.assertRaises(PyMC3ModelsError) as no_fit_error: 78 | test_LR = LinearRegression() 79 | test_LR.predict(self.X_train) 80 | 81 | expected = 'Run fit on the model before predict.' 
82 | self.assertEqual(str(no_fit_error.exception), expected) 83 | 84 | 85 | class LinearRegressionScoreTestCase(LinearRegressionTestCase): 86 | def test_score_matches_sklearn_performance(self): 87 | skLR = skLinearRegression() 88 | skLR.fit(self.X_train, self.Y_train) 89 | skLR_score = skLR.score(self.X_test, self.Y_test) 90 | 91 | score = self.test_LR.score(self.X_test, self.Y_test) 92 | np.testing.assert_almost_equal(skLR_score, score, decimal=1) 93 | 94 | 95 | class LinearRegressionSaveandLoadTestCase(LinearRegressionTestCase): 96 | def test_save_and_load_work_correctly(self): 97 | score1 = self.test_LR.score(self.X_test, self.Y_test) 98 | self.test_LR.save(self.test_dir) 99 | 100 | LR2 = LinearRegression() 101 | 102 | LR2.load(self.test_dir) 103 | 104 | self.assertEqual(self.test_LR.inference_type, LR2.inference_type) 105 | self.assertEqual(self.test_LR.num_pred, LR2.num_pred) 106 | self.assertEqual(self.test_LR.num_training_samples, LR2.num_training_samples) 107 | pd.testing.assert_frame_equal(summary(self.test_LR.trace), summary(LR2.trace)) 108 | 109 | score2 = LR2.score(self.X_test, self.Y_test) 110 | 111 | np.testing.assert_almost_equal(score1, score2, decimal=1) 112 | -------------------------------------------------------------------------------- /tests/models/test_LogisticRegression.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import tempfile 3 | import unittest 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import pymc3 as pm 8 | from pymc3 import summary 9 | from sklearn.linear_model import LogisticRegression as sklearn_LR 10 | from sklearn.model_selection import train_test_split 11 | 12 | from pymc3_models.exc import PyMC3ModelsError 13 | from pymc3_models import LogisticRegression 14 | 15 | 16 | class LogisticRegressionTestCase(unittest.TestCase): 17 | def setUp(self): 18 | def numpy_invlogit(x): 19 | return 1 / (1 + np.exp(-x)) 20 | 21 | self.num_pred = 1 22 | self.num_samples = 10000 23 | 24 | # Set random seed for repeatability 25 | np.random.seed(27) 26 | 27 | self.alphas = np.random.randn(1) 28 | self.betas = np.random.randn(1, self.num_pred) 29 | X = np.random.randn(self.num_samples, self.num_pred) 30 | Y = np.random.binomial(1, numpy_invlogit(self.alphas[0] + np.sum(self.betas * X, 1))) 31 | 32 | self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(X, Y, test_size=0.4) 33 | 34 | self.test_LR = LogisticRegression() 35 | # Fit the model once 36 | inference_args = { 37 | 'n': 60000, 38 | 'callbacks': [pm.callbacks.CheckParametersConvergence()] 39 | } 40 | # Note: print is here so PyMC3 output won't overwrite the test name 41 | print('') 42 | self.test_LR.fit( 43 | self.X_train, 44 | self.Y_train, 45 | num_advi_sample_draws=5000, 46 | minibatch_size=2000, 47 | inference_args=inference_args 48 | ) 49 | 50 | self.test_dir = tempfile.mkdtemp() 51 | 52 | def tearDown(self): 53 | shutil.rmtree(self.test_dir) 54 | 55 | 56 | class LogisticRegressionFitTestCase(LogisticRegressionTestCase): 57 | def test_fit_returns_correct_model(self): 58 | self.assertEqual(self.num_pred, self.test_LR.num_pred) 59 | 60 | np.testing.assert_almost_equal(self.alphas, self.test_LR.trace['alpha'].mean(), decimal=1) 61 | np.testing.assert_almost_equal(self.betas, self.test_LR.trace['betas'].mean(), decimal=1) 62 | 63 | 64 | class LogisticRegressionPredictProbaTestCase(LogisticRegressionTestCase): 65 | def test_predict_proba_returns_probabilities(self): 66 | probs = self.test_LR.predict_proba(self.X_test) 67 | 
self.assertEqual(probs.shape, self.Y_test.shape) 68 | 69 | def test_predict_proba_returns_probabilities_and_std(self): 70 | probs, stds = self.test_LR.predict_proba(self.X_test, return_std=True) 71 | self.assertEqual(probs.shape, self.Y_test.shape) 72 | self.assertEqual(stds.shape, self.Y_test.shape) 73 | 74 | def test_predict_proba_raises_error_if_not_fit(self): 75 | with self.assertRaises(PyMC3ModelsError) as no_fit_error: 76 | test_LR = LogisticRegression() 77 | test_LR.predict_proba(self.X_train) 78 | 79 | expected = 'Run fit on the model before predict.' 80 | self.assertEqual(str(no_fit_error.exception), expected) 81 | 82 | 83 | class LogisticRegressionPredictTestCase(LogisticRegressionTestCase): 84 | def test_predict_returns_predictions(self): 85 | preds = self.test_LR.predict(self.X_test) 86 | self.assertEqual(preds.shape, self.Y_test.shape) 87 | 88 | 89 | class LogisticRegressionScoreTestCase(LogisticRegressionTestCase): 90 | def test_score_scores(self): 91 | score = self.test_LR.score(self.X_test, self.Y_test) 92 | naive_score = np.mean(self.Y_test) 93 | self.assertGreaterEqual(score, naive_score) 94 | 95 | def test_score_matches_sklearn_performance(self): 96 | SLR = sklearn_LR() 97 | SLR.fit(self.X_train, self.Y_train) 98 | SLR_score = SLR.score(self.X_test, self.Y_test) 99 | 100 | self.test_LR.fit(self.X_train, self.Y_train) 101 | test_LR_score = self.test_LR.score(self.X_test, self.Y_test) 102 | 103 | self.assertAlmostEqual(SLR_score, test_LR_score, 1) 104 | 105 | 106 | class LogisticRegressionSaveandLoadTestCase(LogisticRegressionTestCase): 107 | def test_save_and_load_work_correctly(self): 108 | probs1 = self.test_LR.predict_proba(self.X_test) 109 | self.test_LR.save(self.test_dir) 110 | 111 | LR2 = LogisticRegression() 112 | 113 | LR2.load(self.test_dir) 114 | 115 | self.assertEqual(self.test_LR.num_pred, LR2.num_pred) 116 | self.assertEqual(self.test_LR.num_training_samples, LR2.num_training_samples) 117 | pd.testing.assert_frame_equal(summary(self.test_LR.trace), summary(LR2.trace)) 118 | 119 | probs2 = LR2.predict_proba(self.X_test) 120 | 121 | np.testing.assert_almost_equal(probs2, probs1, decimal=1) 122 | -------------------------------------------------------------------------------- /tests/models/test_NaiveBayes.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import tempfile 3 | import unittest 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import pymc3 as pm 8 | from pymc3 import summary 9 | import scipy.stats 10 | from sklearn.model_selection import train_test_split 11 | 12 | from pymc3_models import GaussianNaiveBayes 13 | from pymc3_models.exc import PyMC3ModelsError 14 | 15 | 16 | class GaussianNaiveBayesTestCase(unittest.TestCase): 17 | def setUp(self): 18 | """ 19 | Set up a test case with synthetic data. 
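The data follow the model's own generative story: class proportions are drawn from a Dirichlet prior, per-class means from Normal(0, 100), per-class standard deviations from HalfNormal(100), and each feature vector from the resulting class-conditional Normals.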
20 | """ 21 | 22 | self.num_cats = 3 23 | self.num_pred = 10 24 | self.num_samples = 50000 25 | 26 | # Set random seed for repeatability 27 | np.random.seed(27) 28 | 29 | # Generate priors 30 | self.alpha = np.ones(self.num_cats) 31 | self.pi = np.random.dirichlet(self.alpha) 32 | self.mu = np.random.normal(0, 100, size=(self.num_cats, self.num_pred)) 33 | self.sigma = scipy.stats.halfnorm(loc=0, scale=100).rvs(size=(self.num_cats, self.num_pred)) 34 | # Generate data 35 | Y = np.random.choice(range(self.num_cats), self.num_samples, p=self.pi) 36 | x_vectors = [] 37 | for i in Y: 38 | x_vectors.append(np.random.normal(self.mu[i], self.sigma[i])) 39 | X = np.vstack(x_vectors) 40 | 41 | # Split into train/test sets 42 | self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(X, Y, test_size=0.4) 43 | 44 | self.num_training_samples = self.Y_train.shape[0] 45 | 46 | self.test_GNB = GaussianNaiveBayes() 47 | # Fit the model once 48 | inference_args = { 49 | 'n': 10000, 50 | 'callbacks': [pm.callbacks.CheckParametersConvergence()] 51 | } 52 | # Note: print is here so PyMC3 output won't overwrite the test name 53 | print('') 54 | self.test_GNB.fit( 55 | self.X_train, 56 | self.Y_train, 57 | num_advi_sample_draws=1000, 58 | minibatch_size=2000, 59 | inference_args=inference_args 60 | ) 61 | 62 | self.test_dir = tempfile.mkdtemp() 63 | 64 | def tearDown(self): 65 | """ 66 | Tear down the testing environment. 67 | """ 68 | shutil.rmtree(self.test_dir) 69 | 70 | 71 | class GaussianNaiveBayesFitTestCase(GaussianNaiveBayesTestCase): 72 | def test_fit_returns_correct_model(self): 73 | """ 74 | Test the model initialization and fit. 75 | 76 | Currently, only the sign of inferred parameters is checked 77 | against the sign of the parameters used to generate the data. 78 | 79 | TOOO: Find better strategies to test probabilistic code. 80 | """ 81 | # Check that the model correctly infers dimensions 82 | self.assertEqual(self.num_cats, self.test_GNB.num_cats) 83 | self.assertEqual(self.num_training_samples, self.test_GNB.num_training_samples) 84 | self.assertEqual(self.num_pred, self.test_GNB.num_pred) 85 | 86 | # TODO: How do you write tests for a stochastic model? 87 | # TODO: Diagnose the sampling with a reasonable sampling size? 88 | np.testing.assert_equal( 89 | np.sign(self.pi), 90 | np.sign(self.test_GNB.trace['pi'].mean(axis=0)) 91 | ) 92 | np.testing.assert_equal( 93 | np.sign(self.sigma), 94 | np.sign(self.test_GNB.trace['sigma'].mean(axis=0)) 95 | ) 96 | 97 | 98 | class GaussianNaiveBayesPredictProbaTest(GaussianNaiveBayesTestCase): 99 | def test_predict_proba_returns_probabilities(self): 100 | probs = self.test_GNB.predict_proba(self.X_test) 101 | self.assertEqual(probs.shape[0], self.Y_test.shape[0]) 102 | 103 | def test_predict_proba_raises_error_if_not_fit(self): 104 | with self.assertRaises(PyMC3ModelsError) as no_fit_error: 105 | test_GNB = GaussianNaiveBayes() 106 | test_GNB.predict_proba(self.X_train) 107 | expected = 'Run fit on the model before predict' 108 | self.assertEqual(str(no_fit_error.exception), expected) 109 | 110 | 111 | class GaussianNaiveBayesPredictionTestCase(GaussianNaiveBayesTestCase): 112 | def test_predict_returns_predictions(self): 113 | """ 114 | Test that the predict() function's output has the correct shape. 
115 | """ 116 | preds = self.test_GNB.predict(self.X_test) 117 | self.assertEqual(preds.shape, self.Y_test.shape) 118 | 119 | 120 | @unittest.skip('test not implemented yet') 121 | class GaussianNaiveBayesScoreTestCase(GaussianNaiveBayesTestCase): 122 | def test_score_scores(self): 123 | # TODO: Figure out how to test the score function 124 | score = self.test_GNB.score(self.X_test, self.Y_test) 125 | 126 | 127 | class GaussianNaiveBayesSaveAndLoadTestCase(GaussianNaiveBayesTestCase): 128 | def test_save_and_load_work_correctly(self): 129 | probs1 = self.test_GNB.predict_proba(self.X_test) 130 | self.test_GNB.save(self.test_dir) 131 | 132 | GNB2 = GaussianNaiveBayes() 133 | GNB2.load(self.test_dir) 134 | self.assertEqual(self.test_GNB.num_cats, GNB2.num_cats) 135 | self.assertEqual(self.test_GNB.num_pred, GNB2.num_pred) 136 | self.assertEqual(self.test_GNB.num_training_samples, GNB2.num_training_samples) 137 | pd.testing.assert_frame_equal(summary(self.test_GNB.trace), summary(GNB2.trace)) 138 | 139 | probs2 = GNB2.predict_proba(self.X_test) 140 | np.testing.assert_almost_equal(probs2, probs1, decimal=1) 141 | --------------------------------------------------------------------------------