├── .github └── workflows │ └── python-publish.yml ├── .gitignore ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.md ├── bayesian_bootstrap ├── __init__.py ├── demos │ ├── __init__.py │ ├── demos.py │ ├── group_mean_secret_weapon.py │ ├── linear_regression.py │ ├── readme_exponential.png │ └── readme_regression.png └── tests │ └── test_bootstrap.py ├── docs ├── bootstrap_documentation.html └── build.py ├── requirements.txt └── setup.py /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | jobs: 16 | deploy: 17 | 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - uses: actions/checkout@v2 22 | - name: Set up Python 23 | uses: actions/setup-python@v2 24 | with: 25 | python-version: '3.x' 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install build 30 | - name: Build package 31 | run: python -m build 32 | - name: Publish package 33 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 34 | with: 35 | user: __token__ 36 | password: ${{ secrets.PYPI_API_TOKEN }} 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.toptal.com/developers/gitignore/api/macos,python,vim 3 | # Edit at https://www.toptal.com/developers/gitignore?templates=macos,python,vim 4 | 5 | ### macOS ### 6 | # General 7 | .DS_Store 8 | .AppleDouble 9 | .LSOverride 10 | 11 | # Icon must end with two \r 12 | Icon 13 | 14 | # Thumbnails 15 | ._* 16 | 17 | # Files that might appear in the root of a volume 18 | .DocumentRevisions-V100 19 | .fseventsd 20 | .Spotlight-V100 21 | .TemporaryItems 22 | .Trashes 23 | .VolumeIcon.icns 24 | .com.apple.timemachine.donotpresent 25 | 26 | # Directories potentially created on remote AFP share 27 | .AppleDB 28 | .AppleDesktop 29 | Network Trash Folder 30 | Temporary Items 31 | .apdisk 32 | 33 | ### Python ### 34 | # Byte-compiled / optimized / DLL files 35 | __pycache__/ 36 | *.py[cod] 37 | *$py.class 38 | 39 | # C extensions 40 | *.so 41 | 42 | # Distribution / packaging 43 | .Python 44 | build/ 45 | develop-eggs/ 46 | dist/ 47 | downloads/ 48 | eggs/ 49 | .eggs/ 50 | lib/ 51 | lib64/ 52 | parts/ 53 | sdist/ 54 | var/ 55 | wheels/ 56 | share/python-wheels/ 57 | *.egg-info/ 58 | .installed.cfg 59 | *.egg 60 | MANIFEST 61 | 62 | # PyInstaller 63 | # Usually these files are written by a python script from a template 64 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
65 | *.manifest 66 | *.spec 67 | 68 | # Installer logs 69 | pip-log.txt 70 | pip-delete-this-directory.txt 71 | 72 | # Unit test / coverage reports 73 | htmlcov/ 74 | .tox/ 75 | .nox/ 76 | .coverage 77 | .coverage.* 78 | .cache 79 | nosetests.xml 80 | coverage.xml 81 | *.cover 82 | *.py,cover 83 | .hypothesis/ 84 | .pytest_cache/ 85 | cover/ 86 | 87 | # Translations 88 | *.mo 89 | *.pot 90 | 91 | # Django stuff: 92 | *.log 93 | local_settings.py 94 | db.sqlite3 95 | db.sqlite3-journal 96 | 97 | # Flask stuff: 98 | instance/ 99 | .webassets-cache 100 | 101 | # Scrapy stuff: 102 | .scrapy 103 | 104 | # Sphinx documentation 105 | docs/_build/ 106 | 107 | # PyBuilder 108 | .pybuilder/ 109 | target/ 110 | 111 | # Jupyter Notebook 112 | .ipynb_checkpoints 113 | 114 | # IPython 115 | profile_default/ 116 | ipython_config.py 117 | 118 | # pyenv 119 | # For a library or package, you might want to ignore these files since the code is 120 | # intended to run in multiple environments; otherwise, check them in: 121 | # .python-version 122 | 123 | # pipenv 124 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 125 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 126 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 127 | # install all needed dependencies. 128 | #Pipfile.lock 129 | 130 | # poetry 131 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 132 | # This is especially recommended for binary packages to ensure reproducibility, and is more 133 | # commonly ignored for libraries. 134 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 135 | #poetry.lock 136 | 137 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 138 | __pypackages__/ 139 | 140 | # Celery stuff 141 | celerybeat-schedule 142 | celerybeat.pid 143 | 144 | # SageMath parsed files 145 | *.sage.py 146 | 147 | # Environments 148 | .env 149 | .venv 150 | env/ 151 | venv/ 152 | ENV/ 153 | env.bak/ 154 | venv.bak/ 155 | 156 | # Spyder project settings 157 | .spyderproject 158 | .spyproject 159 | 160 | # Rope project settings 161 | .ropeproject 162 | 163 | # mkdocs documentation 164 | /site 165 | 166 | # mypy 167 | .mypy_cache/ 168 | .dmypy.json 169 | dmypy.json 170 | 171 | # Pyre type checker 172 | .pyre/ 173 | 174 | # pytype static type analyzer 175 | .pytype/ 176 | 177 | # Cython debug symbols 178 | cython_debug/ 179 | 180 | # PyCharm 181 | # JetBrains specific template is maintainted in a separate JetBrains.gitignore that can 182 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 183 | # and can be added to the global gitignore or merged into this file. For a more nuclear 184 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
185 | #.idea/ 186 | 187 | ### Vim ### 188 | # Swap 189 | [._]*.s[a-v][a-z] 190 | !*.svg # comment out if you don't need vector files 191 | [._]*.sw[a-p] 192 | [._]s[a-rt-v][a-z] 193 | [._]ss[a-gi-z] 194 | [._]sw[a-p] 195 | 196 | # Session 197 | Session.vim 198 | Sessionx.vim 199 | 200 | # Temporary 201 | .netrwhist 202 | *~ 203 | # Auto-generated tag files 204 | tags 205 | # Persistent undo 206 | [._]*.un~ 207 | 208 | # End of https://www.toptal.com/developers/gitignore/api/macos,python,vim 209 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.5" 4 | - "3.6" 5 | - "3.7" 6 | - "3.8" 7 | addons: 8 | apt: 9 | packages: 10 | - libblas-dev 11 | - liblapack-dev 12 | - gfortran 13 | - graphviz 14 | before_install: 15 | - pip install -U pip setuptools wheel 16 | install: 17 | - travis_wait travis_retry pip install -r requirements.txt 18 | script: "nosetests bayesian_bootstrap/tests" 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Louis Cialdella 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # file GENERATED by distutils, do NOT edit 2 | include setup.py 3 | include bayesian_bootstrap\__init__.py 4 | include bayesian_bootstrap\bootstrap.py 5 | include README.md 6 | include requirements.txt 7 | include bayesian_bootstrap\docs\bootstrap_documentation.html -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # `bayesian_bootstrap` ![test badge](https://travis-ci.org/lmc2179/bayesian_bootstrap.svg?branch=master) [![PyPI version](https://badge.fury.io/py/bayesian_bootstrap.svg)](https://badge.fury.io/py/bayesian_bootstrap) 2 | 3 | `bayesian_bootstrap` is a package for Bayesian bootstrapping in Python. For an overview of the Bayesian bootstrap, I highly recommend reading [Rasmus Bååth's writeup](http://www.sumsar.net/blog/2015/04/the-non-parametric-bootstrap-as-a-bayesian-model/). 
This Python package is similar to his [R package](http://www.sumsar.net/blog/2016/02/bayesboot-an-r-package/). 4 | 5 | This README contains some examples below. For the documentation of the package's API, see the [docs](http://htmlpreview.github.io/?https://github.com/lmc2179/bayesian_bootstrap/blob/master/docs/bootstrap_documentation.html). 6 | 7 | This package is on PyPI - you can install it with `pip install bayesian_bootstrap`. 8 | 9 | # Overview of the `bayesian_bootstrap` module 10 | 11 | This module contains tools for doing approximate Bayesian inference using the Bayesian Bootstrap introduced in [Rubin's _The Bayesian Bootstrap_](https://projecteuclid.org/euclid.aos/1176345338). 12 | 13 | It contains the following: 14 | 15 | * The `mean` and `var` functions, which simulate the posterior distributions of the mean and variance 16 | 17 | * The `bayesian_bootstrap` function, which simulates the posterior distribution of an arbitrary statistic 18 | 19 | * The `BayesianBootstrapBagging` class, a wrapper allowing users to generate ensembles of regressors/classifiers 20 | using Bayesian Bootstrap resampling. A base estimator with a scikit-learn-like interface needs to be provided. See also 21 | the `bayesian_bootstrap_regression` function. 22 | 23 | * The `central_credible_interval` and `highest_density_interval` functions, which compute credible intervals from 24 | posterior samples. 25 | 26 | For more information about the function signatures above, see the examples below or the docstrings of each function/class. 27 | 28 | One thing that's worth making clear is the interpretation of the parameters of the `bayesian_bootstrap`, `BayesianBootstrapBagging`, and `bayesian_bootstrap_regression` functions, which all do sampling within each bootstrap replication: 29 | 30 | * The number of replications is the number of times the statistic of interest will be replicated. If we think about the classical bootstrap, this is the number of times your dataset is resampled. If we think about it from a Bayesian point of view, this is the number of draws from the posterior distribution. 31 | 32 | * The resample size is the size of the dataset used to calculate the statistic of interest in each replication. More is better - you'll probably want this to be at least as large as your original dataset. 33 | 34 | # Example: Estimating the mean 35 | Let's say that we observe some data points, and we wish to simulate the posterior distribution of their mean. 36 | 37 | The following code draws four data points from an exponential distribution: 38 | ``` 39 | X = np.random.exponential(7, 4) 40 | ``` 41 | Now, we are going to simulate draws from the posterior of the mean. `bayesian_bootstrap` includes a `mean` function 42 | that will do this for you. 43 | 44 | The code below performs the simulation and calculates the 95% highest density interval using 10,000 bootstrap replications. It also uses the wonderful 45 | `seaborn` library to visualize the histogram with a kernel density estimate. 46 | 47 | Included for reference in the image is the same dataset used in a classical bootstrap, to illustrate the comparative 48 | smoothness of the Bayesian version.
49 | ``` 50 | from bayesian_bootstrap import mean, highest_density_interval 51 | posterior_samples = mean(X, 10000) 52 | l, r = highest_density_interval(posterior_samples) 53 | 54 | plt.title('Bayesian Bootstrap of mean') 55 | sns.distplot(posterior_samples, label='Bayesian Bootstrap Samples') 56 | plt.plot([l, r], [0, 0], linewidth=5.0, marker='o', label='95% HDI') 57 | ``` 58 | 59 | The above code uses the `mean` function to simulate the posterior distribution of the mean. However, it is a special 60 | (if very common) case, along with `var` - all other statistics should use the `bayesian_bootstrap` function. The 61 | following code demonstrates doing this for the posterior of the mean: 62 | 63 | ``` 64 | from bayesian_bootstrap import bayesian_bootstrap 65 | posterior_samples = bayesian_bootstrap(X, np.mean, 10000, 100) 66 | ``` 67 | 68 | ![Posterior](bayesian_bootstrap/demos/readme_exponential.png) 69 | 70 | # Example: Regression modelling 71 | 82 | Let's take another example - fitting a linear regression model. The following code samples a few points in the plane. 83 | The mean is y = x, and normally distributed noise is added. 84 | ``` 85 | X = np.random.normal(0, 1, 5).reshape(-1, 1) 86 | y = X.reshape(1, -1).reshape(5) + np.random.normal(0, 1, 5) 87 | ``` 88 | We build models via bootstrap resampling, creating an ensemble of models via bootstrap aggregating. A 89 | `BayesianBootstrapBagging` wrapper class is available in the library, which is a Bayesian analogue to scikit-learn's 90 | `BaggingRegressor` and `BaggingClassifier` classes. 91 | ``` 92 | m = BayesianBootstrapBagging(LinearRegression(), 10000, 1000) 93 | m.fit(X, y) 94 | ``` 95 | Once we've got our ensemble trained, we can make interval predictions for new inputs by calculating their HDIs under the 96 | ensemble: 97 | ``` 98 | X_plot = np.linspace(min(X), max(X)) 99 | y_predicted = m.predict(X_plot.reshape(-1, 1)) 100 | y_predicted_interval = m.predict_highest_density_interval(X_plot.reshape(-1, 1), 0.05) 101 | 102 | plt.scatter(X.reshape(1, -1), y) 103 | plt.plot(X_plot, y_predicted, label='Mean') 104 | plt.plot(X_plot, y_predicted_interval[:,0], label='95% HDI Lower bound') 105 | plt.plot(X_plot, y_predicted_interval[:,1], label='95% HDI Upper bound') 106 | plt.legend() 107 | plt.savefig('readme_regression.png', bbox_inches='tight') 108 | ``` 109 | ![Posterior](bayesian_bootstrap/demos/readme_regression.png) 110 | 111 | Users interested in accessing the base models can do so via the `base_models_` attribute of the object. 112 | 113 | # Contributions 114 | 115 | Interested in contributing? We'd love to have your help! Please keep the following in mind: 116 | 117 | * Bug fixes are welcome! Make sure you reference the issue number that is being resolved, and that all test cases in `tests` pass. 118 | 119 | * New features are welcome as well! Any new features should include docstrings and unit tests in the `tests` directory. 120 | 121 | * If you want to contribute a case study or other documentation, feel free to write up a GitHub-flavored Markdown document or IPython notebook and put it in the `examples` folder before issuing a pull request. 122 | 123 | Credit for past contributions: 124 | 125 | * [roya0045](https://github.com/roya0045) implemented the original version of the low-memory optimizations. 126 | * [JulianWgs](https://github.com/JulianWgs) implemented the Bayesian machine learning model using weight distributions instead of resampling and a weighted Pearson correlation coefficient.
He also refactored the weighted mean and covariance function to accept weight matrices. 127 | * [genos](https://github.com/genos) simplified importing and updated the RNG usage to the current numpy standard. 128 | 129 | # Further reading 130 | 131 | * [_The Bayesian Bootstrap_, Rubin, 1981](https://projecteuclid.org/euclid.aos/1176345338) 132 | 133 | * [Rasmus Bååth's original writeup on the Bayesian Bootstrap](http://www.sumsar.net/blog/2015/04/the-non-parametric-bootstrap-as-a-bayesian-model/) 134 | -------------------------------------------------------------------------------- /bayesian_bootstrap/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from copy import deepcopy 3 | 4 | 5 | def mean(X, n_replications, seed=None): 6 |     """Simulate the posterior distribution of the mean. 7 | 8 |     Parameter X: The observed data (array like) 9 | 10 |     Parameter n_replications: The number of bootstrap replications to perform (positive integer) 11 | 12 |     Parameter seed: Seed for PRNG (default None) 13 | 14 |     Returns: Samples from the posterior 15 |     """ 16 |     weights = np.random.default_rng(seed).dirichlet(np.ones(len(X)), n_replications) 17 |     return np.dot(X, weights.T) 18 | 19 | 20 | def var(X, n_replications, seed=None): 21 |     """Simulate the posterior distribution of the variance. 22 | 23 |     Parameter X: The observed data (array like) 24 | 25 |     Parameter n_replications: The number of bootstrap replications to perform (positive integer) 26 | 27 |     Parameter seed: Seed for PRNG (default None) 28 | 29 |     Returns: Samples from the posterior 30 |     """ 31 |     samples = [] 32 |     weights = np.random.default_rng(seed).dirichlet([1] * len(X), n_replications) 33 |     for w in weights: 34 |         samples.append(np.dot([x ** 2 for x in X], w) - np.dot(X, w) ** 2) 35 |     return samples 36 | 37 | 38 | def covar(X, Y, n_replications, seed=None): 39 |     """Simulate the posterior distribution of the covariance. 40 | 41 |     Parameter X: The observed data, first variable (array like) 42 | 43 |     Parameter Y: The observed data, second variable (array like) 44 | 45 |     Parameter n_replications: The number of bootstrap replications to perform (positive integer) 46 | 47 |     Parameter seed: Seed for PRNG (default None) 48 | 49 |     Returns: Samples from the posterior 50 |     """ 51 |     samples = [] 52 |     weights = np.random.default_rng(seed).dirichlet([1] * len(X), n_replications) 53 |     for w in weights: 54 |         cv = _weighted_covariance(X, Y, w) 55 |         samples.append(cv) 56 |     return samples 57 | 58 | 59 | def pearsonr(X, Y, n_replications, seed=None): 60 |     """ 61 |     Simulate the posterior distribution of the Pearson correlation coefficient. 62 | 63 |     https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.pearsonr.html 64 | 65 |     """ 66 |     weights = np.random.default_rng(seed).dirichlet(np.ones(len(X)), n_replications) 67 |     return _weighted_pearsonr(X, Y, weights) 68 | 69 | 70 | def _weighted_covariance(X, Y, w): 71 |     X_mean = np.dot(X, w.T).reshape(-1, 1) 72 |     Y_mean = np.dot(Y, w.T).reshape(-1, 1) 73 |     # Another approach, but less efficient 74 |     # np.diag(np.dot(w, (x - X_mean) * (y - Y_mean)).T) 75 |     # https://stackoverflow.com/a/14759273 76 |     return (w * ((X - X_mean) * (Y - Y_mean))).sum(-1) 77 | 78 | 79 | def _weighted_pearsonr(X, Y, w): 80 |     """ 81 |     Weighted Pearson correlation.
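
    Parameter X: The observed data, first variable (array like)

    Parameter Y: The observed data, second variable (array like)

    Parameter w: The Dirichlet weights, one row per replication (matrix like)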
82 | 83 | """ 84 | return _weighted_covariance(X, Y, w) / np.sqrt(_weighted_covariance(X, X, w) * _weighted_covariance(Y, Y, w)) 85 | 86 | 87 | def _weighted_ls(X, w, y): 88 | x_rows, x_cols = X.shape 89 | w_matrix = np.array(w) * np.eye(x_rows) 90 | coef = np.dot( 91 | np.dot(np.dot(np.linalg.inv(np.dot(np.dot(X.T, w_matrix), X)), X.T), w_matrix), 92 | y, 93 | ) 94 | return coef 95 | 96 | 97 | def linear_regression(X, y, n_replications, seed=None): 98 | coef_samples = [] 99 | weights = np.random.default_rng(seed).dirichlet([1] * len(X), n_replications) 100 | for w in weights: 101 | coef_samples.append(_weighted_ls(X, w, y)) 102 | return np.vstack(coef_samples) 103 | 104 | 105 | def bayesian_bootstrap(X, statistic, n_replications, resample_size, low_mem=False, seed=None): 106 | """Simulate the posterior distribution of the given statistic. 107 | 108 | Parameter X: The observed data (array like) 109 | 110 | Parameter statistic: A function of the data to use in simulation (Function mapping array-like to number) 111 | 112 | Parameter n_replications: The number of bootstrap replications to perform (positive integer) 113 | 114 | Parameter resample_size: The size of the dataset in each replication 115 | 116 | Parameter low_mem(bool): Generate the weights for each iteration lazily instead of in a single batch. Will use 117 | less memory, but will run slower as a result. 118 | 119 | Parameter seed: Seed for PRNG (default None) 120 | 121 | Returns: Samples from the posterior 122 | """ 123 | if isinstance(X, list): 124 | X = np.array(X) 125 | samples = [] 126 | rng = np.random.default_rng(seed) 127 | if low_mem: 128 | weights = (rng.dirichlet([1] * len(X)) for _ in range(n_replications)) 129 | else: 130 | weights = rng.dirichlet([1] * len(X), n_replications) 131 | for w in weights: 132 | sample_index = rng.choice(range(len(X)), p=w, size=resample_size) 133 | resample_X = X[sample_index] 134 | s = statistic(resample_X) 135 | samples.append(s) 136 | return samples 137 | 138 | 139 | def bayesian_bootstrap_regression(X, y, statistic, n_replications, resample_size, low_mem=False, seed=None): 140 | """Simulate the posterior distribution of a statistic that uses dependent and independent variables. 
141 | 142 | Parameter X: The observed data, independent variables (matrix like) 143 | 144 | Parameter y: The observed data, dependent variable (array like) 145 | 146 | Parameter statistic: A function of the data to use in simulation (Function mapping array-like to number) 147 | 148 | Parameter n_replications: The number of bootstrap replications to perform (positive integer) 149 | 150 | Parameter resample_size: The size of the dataset in each replication 151 | 152 | Parameter low_mem(bool): Use looping instead of generating all the dirichlet, use if program use too much memory 153 | 154 | Parameter seed: Seed for PRNG (default None) 155 | 156 | Returns: Samples from the posterior 157 | """ 158 | samples = [] 159 | X_arr = np.array(X) 160 | y_arr = np.array(y) 161 | rng = np.random.default_rng(seed) 162 | if low_mem: 163 | weights = (rng.dirichlet([1] * len(X)) for _ in range(n_replications)) 164 | else: 165 | weights = rng.dirichlet([1] * len(X), n_replications) 166 | for w in weights: 167 | if resample_size is None: 168 | s = statistic(X, y, w) 169 | else: 170 | resample_i = rng.choice(range(len(X_arr)), p=w, size=resample_size) 171 | resample_X = X_arr[resample_i] 172 | resample_y = y_arr[resample_i] 173 | s = statistic(resample_X, resample_y) 174 | samples.append(s) 175 | 176 | return samples 177 | 178 | 179 | class BayesianBootstrapBagging: 180 | """A bootstrap aggregating model using the bayesian bootstrap. Similar to scikit-learn's BaggingRegressor.""" 181 | 182 | def __init__(self, base_learner, n_replications, resample_size=None, low_mem=False, seed=None): 183 | """Initialize the base learners of the ensemble. 184 | 185 | Parameter base_learner: A scikit-learn like estimator. This object should implement a fit() and predict() 186 | method. 187 | 188 | Parameter n_replications: The number of bootstrap replications to perform (positive integer) 189 | 190 | Parameter resample_size: The size of the dataset in each replication 191 | 192 | Parameter low_mem(bool): Generate the weights for each iteration lazily instead of in a single batch. Will use 193 | less memory, but will run slower as a result. 194 | 195 | Parameter seed: Seed for PRNG (default None) 196 | """ 197 | self.base_learner = base_learner 198 | self.n_replications = n_replications 199 | self.resample_size = resample_size 200 | self.memo = low_mem 201 | self.seed = seed 202 | 203 | def fit(self, X, y): 204 | """Fit the base learners of the ensemble on a dataset. 205 | 206 | Parameter X: The observed data, independent variables (matrix like) 207 | 208 | Parameter y: The observed data, dependent variable (array like) 209 | 210 | Returns: Fitted model 211 | """ 212 | if self.resample_size is None: 213 | statistic = lambda X, y, w: deepcopy(self.base_learner).fit(X, y, w) # noqa: E731 214 | else: 215 | statistic = lambda X, y: deepcopy(self.base_learner).fit(X, y) # noqa: E731 216 | self.base_models_ = bayesian_bootstrap_regression( 217 | X, y, statistic, self.n_replications, self.resample_size, low_mem=self.memo, seed=self.seed 218 | ) 219 | return self 220 | 221 | def predict(self, X): 222 | """Make average predictions for a collection of observations. 
223 | 224 | Parameter X: The observed data, independent variables (matrix like) 225 | 226 | Returns: The predicted dependent variable values (array like) 227 | """ 228 | y_posterior_samples = self.predict_posterior_samples(X) 229 | return np.array([np.mean(r) for r in y_posterior_samples]) 230 | 231 | def predict_posterior_samples(self, X): 232 | """Simulate posterior samples for a collection of observations. 233 | 234 | Parameter X: The observed data, independent variables (matrix like) 235 | 236 | Returns: The simulated posterior mean (matrix like) 237 | """ 238 | # Return a X_r x self.n_replications matrix 239 | y_posterior_samples = np.zeros((len(X), self.n_replications)) 240 | for i, m in enumerate(self.base_models_): 241 | y_posterior_samples[:, i] = m.predict(X) 242 | return y_posterior_samples 243 | 244 | def predict_central_interval(self, X, alpha=0.05): 245 | """The equal-tailed interval prediction containing a (1-alpha) fraction of the posterior samples. 246 | 247 | Parameter X: The observed data, independent variables (matrix like) 248 | 249 | Parameter alpha: The total size of the tails (Float between 0 and 1) 250 | 251 | Returns: Left and right interval bounds for each input (matrix like) 252 | """ 253 | y_posterior_samples = self.predict_posterior_samples(X) 254 | return np.array([central_credible_interval(r, alpha=alpha) for r in y_posterior_samples]) 255 | 256 | def predict_highest_density_interval(self, X, alpha=0.05): 257 | """The highest density interval prediction containing a (1-alpha) fraction of the posterior samples. 258 | 259 | Parameter X: The observed data, independent variables (matrix like) 260 | 261 | Parameter alpha: The total size of the tails (Float between 0 and 1) 262 | 263 | Returns: Left and right interval bounds for each input (matrix like): 264 | """ 265 | y_posterior_samples = self.predict_posterior_samples(X) 266 | return np.array([highest_density_interval(r, alpha=alpha) for r in y_posterior_samples]) 267 | 268 | 269 | def central_credible_interval(samples, alpha=0.05): 270 | """The equal-tailed interval containing a (1-alpha) fraction of the posterior samples. 271 | 272 | Parameter samples: The posterior samples (array like) 273 | 274 | Parameter alpha: The total size of the tails (Float between 0 and 1) 275 | 276 | Returns: Left and right interval bounds (tuple) 277 | """ 278 | return np.quantile(samples, alpha / 2), np.quantile(samples, 1 - alpha / 2) 279 | 280 | 281 | def highest_density_interval(samples, alpha=0.05): 282 | """The highest-density interval containing a (1-alpha) fraction of the posterior samples. 
283 | 284 | Parameter samples: The posterior samples (array like) 285 | 286 | Parameter alpha: The total size of the tails (Float between 0 and 1) 287 | 288 | Returns: Left and right interval bounds (tuple) 289 | """ 290 | samples_sorted = sorted(samples) 291 | window_size = int(len(samples) - round(len(samples) * alpha)) 292 | smallest_window = (None, None) 293 | smallest_window_length = float("inf") 294 | for i in range(len(samples_sorted) - window_size): 295 | window = samples_sorted[i + window_size - 1], samples_sorted[i] 296 | window_length = samples_sorted[i + window_size - 1] - samples_sorted[i] 297 | if window_length < smallest_window_length: 298 | smallest_window_length = window_length 299 | smallest_window = window 300 | return smallest_window[1], smallest_window[0] 301 | 302 | 303 | def _bootstrap_replicate(X, seed=None): 304 | random_points = sorted(np.random.default_rng(seed).uniform(0, 1, len(X) - 1)) 305 | random_points.append(1) 306 | random_points.insert(0, 0) 307 | gaps = [right - left for left, right in zip(random_points[:-1], random_points[1:])] 308 | return np.array(gaps) 309 | -------------------------------------------------------------------------------- /bayesian_bootstrap/demos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmc2179/bayesian_bootstrap/93b8cf41b0675ec24a18e554f5011cdd07de7d91/bayesian_bootstrap/demos/__init__.py -------------------------------------------------------------------------------- /bayesian_bootstrap/demos/demos.py: -------------------------------------------------------------------------------- 1 | from matplotlib import pyplot as plt 2 | import seaborn as sns 3 | from sklearn.linear_model import LinearRegression 4 | from sklearn.utils import resample 5 | from bayesian_bootstrap import ( 6 | mean, 7 | var, 8 | bayesian_bootstrap, 9 | bayesian_bootstrap_regression, 10 | BayesianBootstrapBagging, 11 | highest_density_interval, 12 | covar, 13 | ) 14 | from tqdm import tqdm 15 | import numpy as np 16 | 17 | 18 | def plot_mean_bootstrap(): 19 | X = [-1, 0, 1] 20 | posterior_samples = mean(X, 10000) 21 | sns.distplot(posterior_samples) 22 | classical_samples = [np.mean(resample(X)) for _ in range(10000)] 23 | sns.distplot(classical_samples) 24 | plt.show() 25 | 26 | 27 | def plot_mean_resample_bootstrap(): 28 | X = [-1, 0, 1] 29 | posterior_samples = bayesian_bootstrap(X, np.mean, 10000, 100) 30 | sns.distplot(posterior_samples) 31 | classical_samples = [np.mean(resample(X)) for _ in range(10000)] 32 | sns.distplot(classical_samples) 33 | plt.show() 34 | 35 | 36 | def plot_median(): 37 | X = np.random.uniform(-1, 1, 10) 38 | posterior_samples = bayesian_bootstrap(X, np.median, 10000, 100) 39 | sns.distplot(posterior_samples) 40 | classical_samples = [np.median(resample(X)) for _ in range(10000)] 41 | sns.distplot(classical_samples) 42 | plt.show() 43 | 44 | 45 | def plot_var_bootstrap(): 46 | X = np.random.uniform(-1, 1, 100) 47 | posterior_samples = var(X, 10000) 48 | sns.distplot(posterior_samples) 49 | classical_samples = [np.var(resample(X)) for _ in range(10000)] 50 | sns.distplot(classical_samples) 51 | plt.show() 52 | 53 | 54 | def plot_self_covar_bootstrap(): 55 | X = np.random.uniform(-1, 1, 100) 56 | posterior_samples = covar(X, X, 10000) 57 | sns.distplot(posterior_samples) 58 | plt.show() 59 | 60 | 61 | def plot_covar_bootstrap(): 62 | X = np.random.normal(0, 1, 100) 63 | Y = np.random.normal(0, 1, 100) 64 | posterior_samples = covar(X, Y, 10000) 65 | 
sns.distplot(posterior_samples) 66 | plt.show() 67 | 68 | 69 | def plot_var_resample_bootstrap(): 70 | X = np.random.uniform(-1, 1, 100) 71 | posterior_samples = bayesian_bootstrap(X, np.var, 10000, 500) 72 | sns.distplot(posterior_samples) 73 | classical_samples = [np.var(resample(X)) for _ in range(10000)] 74 | sns.distplot(classical_samples) 75 | plt.show() 76 | 77 | 78 | def plot_mean_method_comparison(): 79 | X = np.random.exponential(scale=1, size=8) 80 | classical_samples = [np.mean(resample(X)) for _ in range(10000)] 81 | posterior_samples_resample = bayesian_bootstrap(X, np.mean, 10000, 1000) 82 | posterior_samples_weighted = mean(X, 10000) 83 | sns.distplot(classical_samples) 84 | sns.distplot(posterior_samples_resample) 85 | sns.distplot(posterior_samples_weighted) 86 | plt.show() 87 | 88 | 89 | def plot_regression_bootstrap(): 90 | X = np.array([[0], [1], [2], [3]]) 91 | y = np.array([0, 1, 2, 3]) + np.random.normal(0, 1, 4) 92 | classical_samples = [LinearRegression().fit(*resample(X, y)).coef_ for _ in tqdm(range(10000))] 93 | posterior_samples = bayesian_bootstrap_regression( 94 | X, y, lambda X, y: LinearRegression().fit(X, y).coef_, 10000, 1000 95 | ) 96 | plt.scatter(X.reshape(-1, 1), y) 97 | plt.show() 98 | sns.distplot(classical_samples) 99 | sns.distplot(posterior_samples) 100 | plt.show() 101 | 102 | 103 | def plot_regression_wrapper_bootstrap(): 104 | X = np.array([[0], [1], [2], [3]]) 105 | y = np.array([0, 1, 2, 3]) + np.random.normal(0, 1, 4) 106 | m = BayesianBootstrapBagging(LinearRegression(), 10000, 1000) 107 | m.fit(X, y) 108 | y_predicted = m.predict(X) 109 | y_predicted_interval = m.predict_central_interval(X, 0.05) 110 | plt.scatter(X.reshape(-1, 1), y) 111 | plt.plot(X.reshape(-1, 1), y_predicted) 112 | plt.plot(X.reshape(-1, 1), y_predicted_interval[:, 0]) 113 | plt.plot(X.reshape(-1, 1), y_predicted_interval[:, 1]) 114 | plt.show() 115 | 116 | 117 | def plot_mean_bootstrap_exponential_readme(): 118 | X = np.random.exponential(7, 4) 119 | classical_samples = [np.mean(resample(X)) for _ in range(10000)] 120 | posterior_samples = mean(X, 10000) 121 | l, r = highest_density_interval(posterior_samples) 122 | classical_l, classical_r = highest_density_interval(classical_samples) 123 | plt.subplot(2, 1, 1) 124 | plt.title("Bayesian Bootstrap of mean") 125 | sns.distplot(posterior_samples, label="Bayesian Bootstrap Samples") 126 | plt.plot([l, r], [0, 0], linewidth=5.0, marker="o", label="95% HDI") 127 | plt.xlim(-1, 18) 128 | plt.legend() 129 | plt.subplot(2, 1, 2) 130 | plt.title("Classical Bootstrap of mean") 131 | sns.distplot(classical_samples, label="Classical Bootstrap Samples") 132 | plt.plot([classical_l, classical_r], [0, 0], linewidth=5.0, marker="o", label="95% HDI") 133 | plt.xlim(-1, 18) 134 | plt.legend() 135 | plt.savefig("readme_exponential.png", bbox_inches="tight") 136 | 137 | 138 | def plot_regression_slope_distribution_readme(): 139 | X = np.random.normal(0, 1, 5).reshape(-1, 1) 140 | y = X.reshape(1, -1).reshape(5) + np.random.normal(0, 1, 5) 141 | m = BayesianBootstrapBagging(LinearRegression(), 10000, 1000) 142 | m.fit(X, y) 143 | X_plot = np.linspace(min(X), max(X)) 144 | y_predicted = m.predict(X_plot.reshape(-1, 1)) 145 | y_predicted_interval = m.predict_highest_density_interval(X_plot.reshape(-1, 1), 0.05) 146 | plt.scatter(X.reshape(1, -1), y) 147 | plt.plot(X_plot, y_predicted, label="Mean") 148 | plt.plot(X_plot, y_predicted_interval[:, 0], label="95% HDI Lower bound") 149 | plt.plot(X_plot, y_predicted_interval[:, 1], label="95% 
HDI Upper bound") 150 | plt.legend() 151 | plt.savefig("readme_regression.png", bbox_inches="tight") 152 | 153 | 154 | if __name__ == "__main__": 155 | # plot_mean_bootstrap() 156 | # plot_mean_resample_bootstrap() 157 | # plot_median() 158 | # plot_var_bootstrap() 159 | # plot_self_covar_bootstrap() 160 | plot_covar_bootstrap() 161 | # plot_var_resample_bootstrap() 162 | # plot_mean_method_comparison() 163 | # plot_regression_bootstrap() 164 | # plot_regression_wrapper_bootstrap() 165 | # plot_mean_bootstrap_exponential_readme() 166 | # plot_regression_slope_distribution_readme() 167 | -------------------------------------------------------------------------------- /bayesian_bootstrap/demos/group_mean_secret_weapon.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from bayesian_bootstrap import mean, highest_density_interval 3 | from matplotlib import pyplot as plt 4 | import seaborn as sns # noqa: F401 5 | 6 | 7 | def plot_group_hdis(samples, labels, alpha, n_replications): 8 | for i, (s, l) in enumerate(zip(samples, labels)): 9 | posterior = mean(s, n_replications) 10 | l, r = highest_density_interval(posterior) 11 | plt.plot([i, i], [l, r]) 12 | plt.plot([i], [np.mean(posterior)], marker="o") 13 | plt.xticks(range(len(labels)), labels) 14 | 15 | 16 | if __name__ == "__main__": 17 | samples = [ 18 | np.random.normal(0, 1, 100), 19 | np.random.normal(0, 2, 100), 20 | np.random.normal(1, 1, 100), 21 | ] 22 | labels = ["0,1", "0,2", "1,1"] 23 | plot_group_hdis(samples, labels, 0.05, 10000) 24 | plt.show() 25 | -------------------------------------------------------------------------------- /bayesian_bootstrap/demos/linear_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | import seaborn as sns 4 | from bayesian_bootstrap import linear_regression 5 | 6 | X = np.linspace(-5, 5, 50) 7 | y = 2 * X + np.random.normal(0, 1, 50) 8 | results = linear_regression(X.reshape(-1, 1), y, 1000) 9 | sns.distplot(results[:, 0]) 10 | plt.show() 11 | -------------------------------------------------------------------------------- /bayesian_bootstrap/demos/readme_exponential.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmc2179/bayesian_bootstrap/93b8cf41b0675ec24a18e554f5011cdd07de7d91/bayesian_bootstrap/demos/readme_exponential.png -------------------------------------------------------------------------------- /bayesian_bootstrap/demos/readme_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmc2179/bayesian_bootstrap/93b8cf41b0675ec24a18e554f5011cdd07de7d91/bayesian_bootstrap/demos/readme_regression.png -------------------------------------------------------------------------------- /bayesian_bootstrap/tests/test_bootstrap.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import scipy 4 | import bayesian_bootstrap as bb 5 | from bayesian_bootstrap import ( 6 | mean, 7 | var, 8 | bayesian_bootstrap, 9 | central_credible_interval, 10 | highest_density_interval, 11 | BayesianBootstrapBagging, 12 | covar, 13 | ) 14 | from sklearn.linear_model import LinearRegression 15 | 16 | RNG = np.random.default_rng(1337) # repeatable pseudorandomness 17 | 18 | 19 | class TestMoments(unittest.TestCase): 20 | def 
test_mean(self): 21 | X = [-1, 0, 1] 22 | posterior_samples = mean(X, 10000) 23 | self.assertAlmostEqual(np.mean(posterior_samples), 0, delta=0.015) 24 | self.assertAlmostEqual(len([s for s in posterior_samples if s < 0]), 5000, delta=1000) 25 | 26 | def test_variance(self): 27 | X = RNG.uniform(-1, 1, 500) 28 | posterior_samples = var(X, 10000) 29 | self.assertAlmostEqual(np.mean(posterior_samples), 1 / 3.0, delta=0.05) 30 | 31 | def test_self_covar(self): 32 | X = RNG.uniform(-1, 1, 500) 33 | posterior_samples = covar(X, X, 10000) 34 | self.assertAlmostEqual(np.mean(posterior_samples), np.var(X), delta=0.05) 35 | 36 | def test_covar(self): 37 | X = RNG.uniform(-1, 1, 500) 38 | Y = RNG.uniform(-1, 1, 500) 39 | posterior_samples = covar(X, Y, 10000) 40 | self.assertAlmostEqual(np.mean(posterior_samples), 0, delta=0.05) 41 | 42 | def test_mean_resample(self): 43 | X = [-1, 0, 1] 44 | posterior_samples = bayesian_bootstrap(X, np.mean, 10000, 100, low_mem=True) 45 | self.assertAlmostEqual(np.mean(posterior_samples), 0, delta=0.01) 46 | self.assertAlmostEqual(len([s for s in posterior_samples if s < 0]), 5000, delta=1000) 47 | posterior_samples = bayesian_bootstrap(X, np.mean, 10000, 100, low_mem=False) 48 | self.assertAlmostEqual(np.mean(posterior_samples), 0, delta=0.01) 49 | self.assertAlmostEqual(len([s for s in posterior_samples if s < 0]), 5000, delta=1000) 50 | 51 | def test_var_resample(self): 52 | X = RNG.uniform(-1, 1, 500) 53 | posterior_samples = bayesian_bootstrap(X, np.var, 10000, 5000, low_mem=True) 54 | self.assertAlmostEqual(np.mean(posterior_samples), 1 / 3.0, delta=0.05) 55 | X = RNG.uniform(-1, 1, 500) 56 | posterior_samples = bayesian_bootstrap(X, np.var, 10000, 5000, low_mem=False) 57 | self.assertAlmostEqual(np.mean(posterior_samples), 1 / 3.0, delta=0.05) 58 | 59 | 60 | class TestIntervals(unittest.TestCase): 61 | def test_central_credible_interval(self): 62 | l, r = central_credible_interval(self._shuffle(range(10)), alpha=0.2) 63 | self.assertEqual(l, 0.9) 64 | self.assertEqual(r, 8.1) 65 | l, r = central_credible_interval(self._shuffle(range(10)), alpha=0.19) 66 | self.assertEqual(l, 0.855) 67 | self.assertEqual(r, 8.145) 68 | l, r = central_credible_interval(self._shuffle(range(20)), alpha=0.1) 69 | self.assertAlmostEqual(l, 0.95) 70 | self.assertEqual(r, 18.05) 71 | 72 | def test_hpdi(self): 73 | l, r = highest_density_interval(self._shuffle([0, 10, 1] + [1.1] * 7), alpha=0.2) 74 | self.assertEqual(l, 1) 75 | self.assertEqual(r, 1.1) 76 | l, r = highest_density_interval(self._shuffle([0, 10, 1.1, 1]), alpha=0.5) 77 | self.assertEqual(l, 1) 78 | self.assertEqual(r, 1.1) 79 | 80 | def _shuffle(self, x): 81 | x = list(x) 82 | RNG.shuffle(x) 83 | return x 84 | 85 | 86 | class TestRegression(unittest.TestCase): 87 | def test_parameter_estimation_resampling_low_memory(self): 88 | X = RNG.uniform(0, 4, 1000) 89 | y = X + RNG.normal(0, 1, 1000) 90 | m = BayesianBootstrapBagging(LinearRegression(), 10000, 1000, low_mem=True) 91 | m.fit(X.reshape(-1, 1), y) 92 | coef_samples = [b.coef_ for b in m.base_models_] 93 | intercept_samples = [b.intercept_ for b in m.base_models_] 94 | self.assertAlmostEqual(np.mean(coef_samples), 1, delta=0.3) 95 | l, r = central_credible_interval(coef_samples, alpha=0.05) 96 | self.assertLess(l, 1) 97 | self.assertGreater(r, 1) 98 | l, r = highest_density_interval(coef_samples, alpha=0.05) 99 | self.assertLess(l, 1) 100 | self.assertGreater(r, 1) 101 | self.assertAlmostEqual(np.mean(intercept_samples), 0, delta=0.3) 102 | l, r = 
central_credible_interval(intercept_samples, alpha=0.05) 103 | self.assertLess(l, 0) 104 | self.assertGreater(r, 0) 105 | self.assertAlmostEqual(np.mean(intercept_samples), 0, delta=0.3) 106 | l, r = highest_density_interval(intercept_samples, alpha=0.05) 107 | self.assertLess(l, 0) 108 | self.assertGreater(r, 0) 109 | 110 | def test_parameter_estimation_resampling(self): 111 | X = RNG.uniform(0, 4, 1000) 112 | y = X + RNG.normal(0, 1, 1000) 113 | m = BayesianBootstrapBagging(LinearRegression(), 10000, 1000, low_mem=False) 114 | m.fit(X.reshape(-1, 1), y) 115 | coef_samples = [b.coef_ for b in m.base_models_] 116 | intercept_samples = [b.intercept_ for b in m.base_models_] 117 | self.assertAlmostEqual(np.mean(coef_samples), 1, delta=0.3) 118 | l, r = central_credible_interval(coef_samples, alpha=0.05) 119 | self.assertLess(l, 1) 120 | self.assertGreater(r, 1) 121 | l, r = highest_density_interval(coef_samples, alpha=0.05) 122 | self.assertLess(l, 1) 123 | self.assertGreater(r, 1) 124 | self.assertAlmostEqual(np.mean(intercept_samples), 0, delta=0.3) 125 | l, r = central_credible_interval(intercept_samples, alpha=0.05) 126 | self.assertLess(l, 0) 127 | self.assertGreater(r, 0) 128 | self.assertAlmostEqual(np.mean(intercept_samples), 0, delta=0.3) 129 | l, r = highest_density_interval(intercept_samples, alpha=0.05) 130 | self.assertLess(l, 0) 131 | self.assertGreater(r, 0) 132 | 133 | def test_parameter_estimation_bayes(self): 134 | X = RNG.uniform(0, 4, 1000) 135 | y = X + RNG.normal(0, 1, 1000) 136 | m = BayesianBootstrapBagging(LinearRegression(), 10000, low_mem=False) 137 | m.fit(X.reshape(-1, 1), y) 138 | coef_samples = [b.coef_ for b in m.base_models_] 139 | intercept_samples = [b.intercept_ for b in m.base_models_] 140 | self.assertAlmostEqual(np.mean(coef_samples), 1, delta=0.3) 141 | l, r = central_credible_interval(coef_samples, alpha=0.05) 142 | self.assertLess(l, 1) 143 | self.assertGreater(r, 1) 144 | l, r = highest_density_interval(coef_samples, alpha=0.05) 145 | self.assertLess(l, 1) 146 | self.assertGreater(r, 1) 147 | self.assertAlmostEqual(np.mean(intercept_samples), 0, delta=0.3) 148 | l, r = central_credible_interval(intercept_samples, alpha=0.05) 149 | self.assertLess(l, 0) 150 | self.assertGreater(r, 0) 151 | self.assertAlmostEqual(np.mean(intercept_samples), 0, delta=0.3) 152 | l, r = highest_density_interval(intercept_samples, alpha=0.05) 153 | self.assertLess(l, 0) 154 | self.assertGreater(r, 0) 155 | 156 | def test_parameter_estimation_bayes_low_memory(self): 157 | X = RNG.uniform(0, 4, 1000) 158 | y = X + RNG.normal(0, 1, 1000) 159 | m = BayesianBootstrapBagging(LinearRegression(), 10000, low_mem=True) 160 | m.fit(X.reshape(-1, 1), y) 161 | coef_samples = [b.coef_ for b in m.base_models_] 162 | intercept_samples = [b.intercept_ for b in m.base_models_] 163 | self.assertAlmostEqual(np.mean(coef_samples), 1, delta=0.3) 164 | l, r = central_credible_interval(coef_samples, alpha=0.05) 165 | self.assertLess(l, 1) 166 | self.assertGreater(r, 1) 167 | l, r = highest_density_interval(coef_samples, alpha=0.05) 168 | self.assertLess(l, 1) 169 | self.assertGreater(r, 1) 170 | self.assertAlmostEqual(np.mean(intercept_samples), 0, delta=0.3) 171 | l, r = central_credible_interval(intercept_samples, alpha=0.05) 172 | self.assertLess(l, 0) 173 | self.assertGreater(r, 0) 174 | self.assertAlmostEqual(np.mean(intercept_samples), 0, delta=0.3) 175 | l, r = highest_density_interval(intercept_samples, alpha=0.05) 176 | self.assertLess(l, 0) 177 | self.assertGreater(r, 0) 178 | 179 | 
180 | def test_pearsonr(): 181 |     x = np.linspace(0, 5, 10) 182 |     y = np.linspace(0, 5, 10) 183 |     assert np.mean(bb.pearsonr(x, y, 10000)) == 1 184 |     assert np.mean(bb.pearsonr(x, -y, 10000)) == -1 185 | 186 |     x = [0, 1, 3, 6] 187 |     y = [1, 2, 5, 7] 188 |     assert np.isclose(np.mean(bb.pearsonr(x, y, 10000)), scipy.stats.pearsonr(x, y)[0], atol=0.001) 189 | 190 |     x = np.linspace(-10, 10, 10000) 191 |     y = np.abs(x) 192 |     assert np.isclose(scipy.stats.pearsonr(x, y)[0], np.mean(bb.pearsonr(x, y, 1000)), atol=0.001) 193 | 194 | 195 | if __name__ == "__main__": 196 |     unittest.main() 197 | -------------------------------------------------------------------------------- /docs/bootstrap_documentation.html: -------------------------------------------------------------------------------- bayesian_bootstrap.bootstrap API documentation

bayesian_bootstrap.bootstrap module

1089 | import numpy as np
1090 | from copy import deepcopy
1091 | 
1092 | def mean(X, n_replications):
1093 |     """Simulate the posterior distribution of the mean.
1094 | 
1095 |     Parameter X: The observed data (array like)
1096 | 
1097 |     Parameter n_replications: The number of bootstrap replications to perform (positive integer)
1098 | 
1099 |     Returns: Samples from the posterior
1100 |     """
1101 |     samples = []
1102 |     weights = np.random.dirichlet([1]*len(X), n_replications)
1103 |     for w in weights:
1104 |         samples.append(np.dot(X, w))
1105 |     return samples
1106 | 
1107 | def var(X, n_replications):
1108 |     """Simulate the posterior distribution of the variance.
1109 | 
1110 |     Parameter X: The observed data (array like)
1111 | 
1112 |     Parameter n_replications: The number of bootstrap replications to perform (positive integer)
1113 | 
1114 |     Returns: Samples from the posterior
1115 |     """
1116 |     samples = []
1117 |     weights = np.random.dirichlet([1]*len(X), n_replications)
1118 |     for w in weights:
1119 |         samples.append(np.dot([x ** 2 for x in X], w) - np.dot(X, w) ** 2)
1120 |     return samples
1121 | 
1122 | def covar(X, Y, n_replications):
1123 |     """Simulate the posterior distribution of the covariance.
1124 | 
1125 |         Parameter X: The observed data, first variable (array like)
1126 | 
1127 |         Parameter Y: The observed data, second variable (array like)
1128 | 
1129 |         Parameter n_replications: The number of bootstrap replications to perform (positive integer)
1130 | 
1131 |         Returns: Samples from the posterior
1132 |     """
1133 |     samples = []
1134 |     weights = np.random.dirichlet([1]*len(X), n_replications)
1135 |     for w in weights:
1136 |         X_mean = np.dot(X, w)
1137 |         Y_mean = np.dot(Y, w)
1138 |         samples.append(np.dot(w, (X - X_mean)*(Y - Y_mean)))
1139 |     return samples
1140 | 
1141 | def bayesian_bootstrap(X, statistic, n_replications, resample_size,low_mem=False):
1142 |     """Simulate the posterior distribution of the given statistic.
1143 | 
1144 |     Parameter X: The observed data (array like)
1145 | 
1146 |     Parameter statistic: A function of the data to use in simulation (Function mapping array-like to number)
1147 | 
1148 |     Parameter n_replications: The number of bootstrap replications to perform (positive integer)
1149 | 
1150 |     Parameter resample_size: The size of the dataset in each replication
1151 |     
1152 |     Parameter low_mem(bool): Generate the weights for each iteration lazily instead of in a single batch. Will use
1153 |     less memory, but will run slower as a result.
1154 | 
1155 |     Returns: Samples from the posterior
1156 |     """
1157 |     if isinstance(X, list):
1158 |         X = np.array(X)
1159 |     samples = []
1160 |     if low_mem:
1161 |         weights = (np.random.dirichlet([1] * len(X)) for _ in range(n_replications))
1162 |     else:
1163 |         weights = np.random.dirichlet([1] * len(X), n_replications)
1164 |     for w in weights:
1165 |         sample_index = np.random.choice(range(len(X)), p=w, size=resample_size)
1166 |         resample_X = X[sample_index]
1167 |         s = statistic(resample_X)
1168 |         samples.append(s)
1169 |     return samples
1170 | 
1171 | def bayesian_bootstrap_regression(X, y, statistic, n_replications, resample_size,low_mem=False):
1172 |     """Simulate the posterior distribution of a statistic that uses dependent and independent variables.
1173 | 
1174 |     Parameter X: The observed data, independent variables (matrix like)
1175 | 
1176 |     Parameter y: The observed data, dependent variable (array like)
1177 | 
1178 |     Parameter statistic: A function of the data to use in simulation (Function mapping array-like to number)
1179 | 
1180 |     Parameter n_replications: The number of bootstrap replications to perform (positive integer)
1181 | 
1182 |     Parameter resample_size: The size of the dataset in each replication
1183 |     
1184 |     Parameter low_mem (bool): Generate the Dirichlet weight vectors in a loop instead of all at once; use this if the program uses too much memory
1185 | 
1186 |     Returns: Samples from the posterior
1187 |     """
1188 |     samples = []
1189 |     X_arr = np.array(X)
1190 |     y_arr = np.array(y)
1191 |     if low_mem:
1192 |         weights = (np.random.dirichlet([1] * len(X)) for _ in range(n_replications))
1193 |     else:
1194 |         weights = np.random.dirichlet([1] * len(X), n_replications)
1195 |     for w in weights:
1196 |         resample_i = np.random.choice(range(len(X_arr)), p=w, size=resample_size)
1197 |         resample_X = X_arr[resample_i]
1198 |         resample_y = y_arr[resample_i]
1199 |         s = statistic(resample_X, resample_y)
1200 |         samples.append(s)
1201 | 
1202 |     return samples
1203 | 
1204 | class BayesianBootstrapBagging(object):
1205 |     """A bootstrap aggregating model using the bayesian bootstrap. Similar to scikit-learn's BaggingRegressor."""
1206 |     def __init__(self, base_learner, n_replications, resample_size, low_mem=False):
1207 |         """Initialize the base learners of the ensemble.
1208 | 
1209 |         Parameter base_learner: A scikit-learn like estimator. This object should implement a fit() and predict()
1210 |         method.
1211 | 
1212 |         Parameter n_replications: The number of bootstrap replications to perform (positive integer)
1213 | 
1214 |         Parameter resample_size: The size of the dataset in each replication
1215 |         
1216 |         Parameter low_mem(bool): Generate the weights for each iteration lazily instead of in a single batch. Will use
1217 |         less memory, but will run slower as a result.
1218 |         """
1219 |         self.base_learner = base_learner
1220 |         self.n_replications = n_replications
1221 |         self.resample_size = resample_size
1222 |         self.memo = low_mem
1223 | 
1224 |     def fit(self, X, y):
1225 |         """Fit the base learners of the ensemble on a dataset.
1226 | 
1227 |         Parameter X: The observed data, independent variables (matrix like)
1228 | 
1229 |         Parameter y: The observed data, dependent variable (array like)
1230 | 
1231 |         Returns: Fitted model
1232 |         """
1233 |         self.base_models_ = bayesian_bootstrap_regression(X,
1234 |                                                           y,
1235 |                                                           lambda X, y: deepcopy(self.base_learner).fit(X, y),
1236 |                                                           self.n_replications,
1237 |                                                           self.resample_size,
1238 |                                                           low_mem=self.memo)
1239 |         return self
1240 | 
1241 |     def predict(self, X):
1242 |         """Make average predictions for a collection of observations.
1243 | 
1244 |         Parameter X: The observed data, independent variables (matrix like)
1245 | 
1246 |         Returns: The predicted dependent variable values (array like)
1247 |         """
1248 |         y_posterior_samples = self.predict_posterior_samples(X)
1249 |         return np.array([np.mean(r) for r in y_posterior_samples])
1250 | 
1251 |     def predict_posterior_samples(self, X):
1252 |         """Simulate posterior samples for a collection of observations.
1253 | 
1254 |         Parameter X: The observed data, independent variables (matrix like)
1255 | 
1256 |         Returns: The simulated posterior samples, one column per replication (matrix like)
1257 |         """
1258 |         # Return a X_r x self.n_replications matrix
1259 |         y_posterior_samples = np.zeros((len(X), self.n_replications))
1260 |         for i, m in enumerate(self.base_models_):
1261 |             y_posterior_samples[:,i] = m.predict(X)
1262 |         return y_posterior_samples
1263 | 
1264 |     def predict_central_interval(self, X, alpha=0.05):
1265 |         """The equal-tailed interval prediction containing a (1-alpha) fraction of the posterior samples.
1266 | 
1267 |         Parameter X: The observed data, independent variables (matrix like)
1268 | 
1269 |         Parameter alpha: The total size of the tails (Float between 0 and 1)
1270 | 
1271 |         Returns: Left and right interval bounds for each input (matrix like)
1272 |         """
1273 |         y_posterior_samples = self.predict_posterior_samples(X)
1274 |         return np.array([central_credible_interval(r, alpha=alpha) for r in y_posterior_samples])
1275 | 
1276 |     def predict_highest_density_interval(self, X, alpha=0.05):
1277 |         """The highest density interval prediction containing a (1-alpha) fraction of the posterior samples.
1278 | 
1279 |         Parameter X: The observed data, independent variables (matrix like)
1280 | 
1281 |         Parameter alpha: The total size of the tails (Float between 0 and 1)
1282 | 
1283 |         Returns: Left and right interval bounds for each input (matrix like):
1284 |         """
1285 |         y_posterior_samples = self.predict_posterior_samples(X)
1286 |         return np.array([highest_density_interval(r, alpha=alpha) for r in y_posterior_samples])
1287 | 
1288 | def central_credible_interval(samples, alpha=0.05):
1289 |     """The equal-tailed interval containing a (1-alpha) fraction of the posterior samples.
1290 | 
1291 |     Parameter samples: The posterior samples (array like)
1292 | 
1293 |     Parameter alpha: The total size of the tails (Float between 0 and 1)
1294 | 
1295 |     Returns: Left and right interval bounds (tuple)
1296 |     """
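    # Cut alpha/2 of the sorted posterior draws from each tail and return the boundary order statistics.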
1297 |     tail_size = int(round(len(samples)*(alpha/2)))
1298 |     samples_sorted = sorted(samples)
1299 |     return samples_sorted[tail_size],samples_sorted[-tail_size-1]
1300 | 
1301 | def highest_density_interval(samples, alpha=0.05):
1302 |     """The highest-density interval containing a (1-alpha) fraction of the posterior samples.
1303 | 
1304 |     Parameter samples: The posterior samples (array like)
1305 | 
1306 |     Parameter alpha: The total size of the tails (Float between 0 and 1)
1307 | 
1308 |     Returns: Left and right interval bounds (tuple)
1309 |     """
1310 |     samples_sorted = sorted(samples)
1311 |     window_size = int(len(samples) - round(len(samples)*alpha))
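    # Slide a window containing (1 - alpha) of the sorted samples and keep the narrowest one found.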
1312 |     smallest_window = (None, None)
1313 |     smallest_window_length = float('inf')
1314 |     for i in range(len(samples_sorted) - window_size + 1):
1315 |         window = samples_sorted[i], samples_sorted[i + window_size - 1]
1316 |         window_length = window[1] - window[0]
1317 |         if window_length < smallest_window_length:
1318 |             smallest_window_length = window_length
1319 |             smallest_window = window
1320 |     return smallest_window
1321 | 
1322 | def _bootstrap_replicate(X):
1323 |     # Gaps between sorted Uniform(0, 1) draws are jointly Dirichlet([1] * len(X)) weights.
1324 |     random_points = [0] + sorted(np.random.uniform(0, 1, len(X) - 1)) + [1]
1325 |     gaps = [r - l for l, r in zip(random_points[:-1], random_points[1:])]
1326 |     return gaps
1327 |

Functions


def bayesian_bootstrap(X, statistic, n_replications, resample_size, low_mem=False)

Simulate the posterior distribution of the given statistic.

Parameter X: The observed data (array like)

Parameter statistic: A function of the data to use in simulation (Function mapping array-like to number)

Parameter n_replications: The number of bootstrap replications to perform (positive integer)

Parameter resample_size: The size of the dataset in each replication

Parameter low_mem (bool): Generate the weights for each iteration lazily instead of in a single batch. Will use less memory, but will run slower as a result.

Returns: Samples from the posterior

1355 | def bayesian_bootstrap(X, statistic, n_replications, resample_size, low_mem=False):
1356 |     """Simulate the posterior distribution of the given statistic.
1357 | 
1358 |     Parameter X: The observed data (array like)
1359 | 
1360 |     Parameter statistic: A function of the data to use in simulation (Function mapping array-like to number)
1361 | 
1362 |     Parameter n_replications: The number of bootstrap replications to perform (positive integer)
1363 | 
1364 |     Parameter resample_size: The size of the dataset in each replication
1365 |     
1366 |     Parameter low_mem(bool): Generate the weights for each iteration lazily instead of in a single batch. Will use
1367 |     less memory, but will run slower as a result.
1368 | 
1369 |     Returns: Samples from the posterior
1370 |     """
1371 |     if isinstance(X, list):
1372 |         X = np.array(X)
1373 |     samples = []
1374 |     if low_mem:
1375 |         weights = (np.random.dirichlet([1] * len(X)) for _ in range(n_replications))
1376 |     else:
1377 |         weights = np.random.dirichlet([1] * len(X), n_replications)
1378 |     for w in weights:
1379 |         sample_index = np.random.choice(range(len(X)), p=w, size=resample_size)
1380 |         resample_X = X[sample_index]
1381 |         s = statistic(resample_X)
1382 |         samples.append(s)
1383 |     return samples
1384 | 
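Example (an illustrative sketch, not part of the generated pdoc output; it assumes the module is importable as bayesian_bootstrap.bootstrap, as docs/build.py does, and uses synthetic data):

import numpy as np
from bayesian_bootstrap.bootstrap import bayesian_bootstrap

X = np.random.exponential(scale=1.0, size=500)  # synthetic observed data
# 1000 posterior draws of the median, each computed on a 500-point resample
posterior_medians = bayesian_bootstrap(X, np.median, n_replications=1000, resample_size=500)
print(np.mean(posterior_medians))  # posterior mean of the median, near log(2) ~ 0.69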

def bayesian_bootstrap_regression(X, y, statistic, n_replications, resample_size, low_mem=False)

Simulate the posterior distribution of a statistic that uses dependent and independent variables.

Parameter X: The observed data, independent variables (matrix like)

Parameter y: The observed data, dependent variable (array like)

Parameter statistic: A function of the data to use in simulation (Function mapping array-like to number)

Parameter n_replications: The number of bootstrap replications to perform (positive integer)

Parameter resample_size: The size of the dataset in each replication

Parameter low_mem (bool): Generate the weights for each iteration lazily instead of in a single batch. Uses less memory, but runs slower.

Returns: Samples from the posterior

1411 | def bayesian_bootstrap_regression(X, y, statistic, n_replications, resample_size, low_mem=False):
1412 |     """Simulate the posterior distribution of a statistic that uses dependent and independent variables.
1413 | 
1414 |     Parameter X: The observed data, independent variables (matrix like)
1415 | 
1416 |     Parameter y: The observed data, dependent variable (array like)
1417 | 
1418 |     Parameter statistic: A function of the data to use in simulation (Function mapping array-like to number)
1419 | 
1420 |     Parameter n_replications: The number of bootstrap replications to perform (positive integer)
1421 | 
1422 |     Parameter resample_size: The size of the dataset in each replication
1423 |     
1424 |     Parameter low_mem (bool): Generate the weights for each iteration lazily instead of in a single batch. Uses less memory, but runs slower.
1425 | 
1426 |     Returns: Samples from the posterior
1427 |     """
1428 |     samples = []
1429 |     X_arr = np.array(X)
1430 |     y_arr = np.array(y)
1431 |     if low_mem:
1432 |         weights = (np.random.dirichlet([1] * len(X)) for _ in range(n_replications))
1433 |     else:
1434 |         weights = np.random.dirichlet([1] * len(X), n_replications)
1435 |     for w in weights:
1436 |         resample_i = np.random.choice(range(len(X_arr)), p=w, size=resample_size)
1437 |         resample_X = X_arr[resample_i]
1438 |         resample_y = y_arr[resample_i]
1439 |         s = statistic(resample_X, resample_y)
1440 |         samples.append(s)
1441 | 
1442 |     return samples
1443 | 
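Example (an illustrative sketch under the same assumptions; ols_slope is a hypothetical helper written for this example, not part of the library):

import numpy as np
from bayesian_bootstrap.bootstrap import bayesian_bootstrap_regression

X = np.random.normal(0, 1, (200, 1))
y = 2 * X[:, 0] + np.random.normal(0, 0.5, 200)

def ols_slope(X, y):
    # The statistic takes a resampled (X, y) pair and returns a number; here, the least-squares slope.
    return np.linalg.lstsq(X, y, rcond=None)[0][0]

# 1000 posterior draws of the slope, concentrated near the true value of 2
posterior_slopes = bayesian_bootstrap_regression(X, y, ols_slope, n_replications=1000, resample_size=200)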

def central_credible_interval(samples, alpha=0.05)

The equal-tailed interval containing a (1-alpha) fraction of the posterior samples.

Parameter samples: The posterior samples (array like)

Parameter alpha: The total size of the tails (Float between 0 and 1)

Returns: Left and right interval bounds (tuple)

1466 | def central_credible_interval(samples, alpha=0.05):
1467 |     """The equal-tailed interval containing a (1-alpha) fraction of the posterior samples.
1468 | 
1469 |     Parameter samples: The posterior samples (array like)
1470 | 
1471 |     Parameter alpha: The total size of the tails (Float between 0 and 1)
1472 | 
1473 |     Returns: Left and right interval bounds (tuple)
1474 |     """
1475 |     tail_size = int(round(len(samples)*(alpha/2)))
1476 |     samples_sorted = sorted(samples)
1477 |     return samples_sorted[tail_size], samples_sorted[-tail_size - 1]
1478 | 
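For instance, a minimal sketch combining this with mean() (same import assumption as above):

import numpy as np
from bayesian_bootstrap.bootstrap import mean, central_credible_interval

posterior_means = mean(np.random.normal(0, 1, 100), n_replications=10000)
left, right = central_credible_interval(posterior_means, alpha=0.05)  # 95% equal-tailed interval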

def covar(X, Y, n_replications)

Simulate the posterior distribution of the covariance.

Parameter X: The observed data, first variable (array like)

Parameter Y: The observed data, second variable (array like)

Parameter n_replications: The number of bootstrap replications to perform (positive integer)

Returns: Samples from the posterior

1502 | def covar(X, Y, n_replications):
1503 |     """Simulate the posterior distribution of the covariance.
1504 | 
1505 |         Parameter X: The observed data, first variable (array like)
1506 | 
1507 |         Parameter Y: The observed data, second variable (array like)
1508 | 
1509 |         Parameter n_replications: The number of bootstrap replications to perform (positive integer)
1510 | 
1511 |         Returns: Samples from the posterior
1512 |     """
1513 |     samples = []
1514 |     weights = np.random.dirichlet([1]*len(X), n_replications)
1515 |     for w in weights:
1516 |         X_mean = np.dot(X, w)
1517 |         Y_mean = np.dot(Y, w)
1518 |         samples.append(np.dot(w, (X - X_mean)*(Y - Y_mean)))
1519 |     return samples
1520 | 
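Example (an illustrative sketch with synthetic data, same import assumption as above):

import numpy as np
from bayesian_bootstrap.bootstrap import covar

X = np.random.normal(0, 1, 300)
Y = 0.5 * X + np.random.normal(0, 1, 300)
# Posterior draws of Cov(X, Y); the draws concentrate near the true covariance of 0.5
posterior_cov = covar(X, Y, n_replications=2000)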

def highest_density_interval(samples, alpha=0.05)

The highest-density interval containing a (1-alpha) fraction of the posterior samples.

Parameter samples: The posterior samples (array like)

Parameter alpha: The total size of the tails (Float between 0 and 1)

Returns: Left and right interval bounds (tuple)

1543 | def highest_density_interval(samples, alpha=0.05):
1544 |     """The highest-density interval containing a (1-alpha) fraction of the posterior samples.
1545 | 
1546 |     Parameter samples: The posterior samples (array like)
1547 | 
1548 |     Parameter alpha: The total size of the tails (Float between 0 and 1)
1549 | 
1550 |     Returns: Left and right interval bounds (tuple)
1551 |     """
1552 |     samples_sorted = sorted(samples)
1553 |     window_size = int(len(samples) - round(len(samples)*alpha))
1554 |     smallest_window = (None, None)
1555 |     smallest_window_length = float('inf')
1556 |     for i in range(len(samples_sorted) - window_size + 1):
1557 |         window = samples_sorted[i], samples_sorted[i + window_size - 1]
1558 |         window_length = window[1] - window[0]
1559 |         if window_length < smallest_window_length:
1560 |             smallest_window_length = window_length
1561 |             smallest_window = window
1562 |     return smallest_window
1563 | 
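For skewed posteriors the highest-density interval can be narrower than the equal-tailed one. A minimal sketch (same import assumption as above):

import numpy as np
from bayesian_bootstrap.bootstrap import mean, highest_density_interval

posterior = mean(np.random.exponential(1.0, 100), n_replications=10000)
left, right = highest_density_interval(posterior, alpha=0.05)  # narrowest 95% interval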

def mean(X, n_replications)

Simulate the posterior distribution of the mean.

Parameter X: The observed data (array like)

Parameter n_replications: The number of bootstrap replications to perform (positive integer)

Returns: Samples from the posterior

1586 | def mean(X, n_replications):
1587 |     """Simulate the posterior distribution of the mean.
1588 | 
1589 |     Parameter X: The observed data (array like)
1590 | 
1591 |     Parameter n_replications: The number of bootstrap replications to perform (positive integer)
1592 | 
1593 |     Returns: Samples from the posterior
1594 |     """
1595 |     samples = []
1596 |     weights = np.random.dirichlet([1]*len(X), n_replications)
1597 |     for w in weights:
1598 |         samples.append(np.dot(X, w))
1599 |     return samples
1600 | 
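Example (an illustrative sketch with synthetic data, same import assumption as above):

import numpy as np
from bayesian_bootstrap.bootstrap import mean

X = np.random.normal(10, 2, 50)
posterior_means = mean(X, n_replications=5000)
# The draws center on np.mean(X), with spread close to the classical
# standard error 2 / sqrt(50) ~ 0.28
print(np.mean(posterior_means), np.std(posterior_means))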

def var(X, n_replications)

Simulate the posterior distribution of the variance.

Parameter X: The observed data (array like)

Parameter n_replications: The number of bootstrap replications to perform (positive integer)

Returns: Samples from the posterior

1623 | def var(X, n_replications):
1624 |     """Simulate the posterior distribution of the variance.
1625 | 
1626 |     Parameter X: The observed data (array like)
1627 | 
1628 |     Parameter n_replications: The number of bootstrap replications to perform (positive integer)
1629 | 
1630 |     Returns: Samples from the posterior
1631 |     """
1632 |     samples = []
1633 |     weights = np.random.dirichlet([1]*len(X), n_replications)
1634 |     for w in weights:
1635 |         samples.append(np.dot([x ** 2 for x in X], w) - np.dot(X, w) ** 2)
1636 |     return samples
1637 | 
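Example (an illustrative sketch with synthetic data, same import assumption as above):

import numpy as np
from bayesian_bootstrap.bootstrap import var

X = np.random.normal(0, 2, 200)
posterior_variances = var(X, n_replications=5000)  # draws concentrate near the true variance of 4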

Classes


class BayesianBootstrapBagging

A bootstrap aggregating model using the bayesian bootstrap. Similar to scikit-learn's BaggingRegressor.

1655 | class BayesianBootstrapBagging(object):
1656 |     """A bootstrap aggregating model using the bayesian bootstrap. Similar to scikit-learn's BaggingRegressor."""
1657 |     def __init__(self, base_learner, n_replications, resample_size, low_mem=False):
1658 |         """Initialize the base learners of the ensemble.
1659 | 
1660 |         Parameter base_learner: A scikit-learn like estimator. This object should implement a fit() and predict()
1661 |         method.
1662 | 
1663 |         Parameter n_replications: The number of bootstrap replications to perform (positive integer)
1664 | 
1665 |         Parameter resample_size: The size of the dataset in each replication
1666 |         
1667 |         Parameter low_mem(bool): Generate the weights for each iteration lazily instead of in a single batch. Will use
1668 |         less memory, but will run slower as a result.
1669 |         """
1670 |         self.base_learner = base_learner
1671 |         self.n_replications = n_replications
1672 |         self.resample_size = resample_size
1673 |         self.memo = low_mem
1674 | 
1675 |     def fit(self, X, y):
1676 |         """Fit the base learners of the ensemble on a dataset.
1677 | 
1678 |         Parameter X: The observed data, independent variables (matrix like)
1679 | 
1680 |         Parameter y: The observed data, dependent variable (array like)
1681 | 
1682 |         Returns: Fitted model
1683 |         """
1684 |         self.base_models_ = bayesian_bootstrap_regression(X,
1685 |                                                           y,
1686 |                                                           lambda X, y: deepcopy(self.base_learner).fit(X, y),
1687 |                                                           self.n_replications,
1688 |                                                           self.resample_size,
1689 |                                                           low_mem=self.memo)
1690 |         return self
1691 | 
1692 |     def predict(self, X):
1693 |         """Make average predictions for a collection of observations.
1694 | 
1695 |         Parameter X: The observed data, independent variables (matrix like)
1696 | 
1697 |         Returns: The predicted dependent variable values (array like)
1698 |         """
1699 |         y_posterior_samples = self.predict_posterior_samples(X)
1700 |         return np.array([np.mean(r) for r in y_posterior_samples])
1701 | 
1702 |     def predict_posterior_samples(self, X):
1703 |         """Simulate posterior samples for a collection of observations.
1704 | 
1705 |         Parameter X: The observed data, independent variables (matrix like)
1706 | 
1707 |         Returns: Posterior samples of the prediction for each observation (matrix like)
1708 |         """
1709 |         # Returns a len(X) x self.n_replications matrix
1710 |         y_posterior_samples = np.zeros((len(X), self.n_replications))
1711 |         for i, m in enumerate(self.base_models_):
1712 |             y_posterior_samples[:,i] = m.predict(X)
1713 |         return y_posterior_samples
1714 | 
1715 |     def predict_central_interval(self, X, alpha=0.05):
1716 |         """The equal-tailed interval prediction containing a (1-alpha) fraction of the posterior samples.
1717 | 
1718 |         Parameter X: The observed data, independent variables (matrix like)
1719 | 
1720 |         Parameter alpha: The total size of the tails (Float between 0 and 1)
1721 | 
1722 |         Returns: Left and right interval bounds for each input (matrix like)
1723 |         """
1724 |         y_posterior_samples = self.predict_posterior_samples(X)
1725 |         return np.array([central_credible_interval(r, alpha=alpha) for r in y_posterior_samples])
1726 | 
1727 |     def predict_highest_density_interval(self, X, alpha=0.05):
1728 |         """The highest density interval prediction containing a (1-alpha) fraction of the posterior samples.
1729 | 
1730 |         Parameter X: The observed data, independent variables (matrix like)
1731 | 
1732 |         Parameter alpha: The total size of the tails (Float between 0 and 1)
1733 | 
1734 |         Returns: Left and right interval bounds for each input (matrix like)
1735 |         """
1736 |         y_posterior_samples = self.predict_posterior_samples(X)
1737 |         return np.array([highest_density_interval(r, alpha=alpha) for r in y_posterior_samples])
1738 | 
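Example (an illustrative end-to-end sketch, not part of the generated pdoc output; it assumes the module is importable as bayesian_bootstrap.bootstrap and uses synthetic data with a scikit-learn base learner):

import numpy as np
from sklearn.linear_model import LinearRegression
from bayesian_bootstrap.bootstrap import BayesianBootstrapBagging

X = np.random.normal(0, 1, (100, 1))
y = 3 * X[:, 0] + np.random.normal(0, 1, 100)

model = BayesianBootstrapBagging(LinearRegression(), n_replications=100, resample_size=100)
model.fit(X, y)
y_pred = model.predict(X)                               # posterior-mean predictions
bounds = model.predict_central_interval(X, alpha=0.05)  # 95% interval per observation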
Ancestors (in MRO)

builtins.object

Static methods

def __init__(self, base_learner, n_replications, resample_size, low_mem=False)

Initialize the base learners of the ensemble.

Parameter base_learner: A scikit-learn like estimator. This object should implement a fit() and predict() method.

Parameter n_replications: The number of bootstrap replications to perform (positive integer)

Parameter resample_size: The size of the dataset in each replication

Parameter low_mem (bool): Generate the weights for each iteration lazily instead of in a single batch. Will use less memory, but will run slower as a result.

1770 | def __init__(self, base_learner, n_replications, resample_size, low_mem=False):
1771 |     """Initialize the base learners of the ensemble.
1772 |     Parameter base_learner: A scikit-learn like estimator. This object should implement a fit() and predict()
1773 |     method.
1774 |     Parameter n_replications: The number of bootstrap replications to perform (positive integer)
1775 |     Parameter resample_size: The size of the dataset in each replication
1776 |     
1777 |     Parameter low_mem(bool): Generate the weights for each iteration lazily instead of in a single batch. Will use
1778 |     less memory, but will run slower as a result.
1779 |     """
1780 |     self.base_learner = base_learner
1781 |     self.n_replications = n_replications
1782 |     self.resample_size = resample_size
1783 |     self.memo = low_mem
1784 | 

def fit(self, X, y)

Fit the base learners of the ensemble on a dataset.

Parameter X: The observed data, independent variables (matrix like)

Parameter y: The observed data, dependent variable (array like)

Returns: Fitted model

1807 | def fit(self, X, y):
1808 |     """Fit the base learners of the ensemble on a dataset.
1809 |     Parameter X: The observed data, independent variables (matrix like)
1810 |     Parameter y: The observed data, dependent variable (array like)
1811 |     Returns: Fitted model
1812 |     """
1813 |     self.base_models_ = bayesian_bootstrap_regression(X,
1814 |                                                       y,
1815 |                                                       lambda X, y: deepcopy(self.base_learner).fit(X, y),
1816 |                                                       self.n_replications,
1817 |                                                       self.resample_size,
1818 |                                                       low_mem=self.memo)
1819 |     return self
1820 | 

def predict(self, X)

Make average predictions for a collection of observations.

Parameter X: The observed data, independent variables (matrix like)

Returns: The predicted dependent variable values (array like)

1842 | def predict(self, X):
1843 |     """Make average predictions for a collection of observations.
1844 |     Parameter X: The observed data, independent variables (matrix like)
1845 |     Returns: The predicted dependent variable values (array like)
1846 |     """
1847 |     y_posterior_samples = self.predict_posterior_samples(X)
1848 |     return np.array([np.mean(r) for r in y_posterior_samples])
1849 | 

def predict_central_interval(self, X, alpha=0.05)

The equal-tailed interval prediction containing a (1-alpha) fraction of the posterior samples.

Parameter X: The observed data, independent variables (matrix like)

Parameter alpha: The total size of the tails (Float between 0 and 1)

Returns: Left and right interval bounds for each input (matrix like)

1872 | def predict_central_interval(self, X, alpha=0.05):
1873 |     """The equal-tailed interval prediction containing a (1-alpha) fraction of the posterior samples.
1874 |     Parameter X: The observed data, independent variables (matrix like)
1875 |     Parameter alpha: The total size of the tails (Float between 0 and 1)
1876 |     Returns: Left and right interval bounds for each input (matrix like)
1877 |     """
1878 |     y_posterior_samples = self.predict_posterior_samples(X)
1879 |     return np.array([central_credible_interval(r, alpha=alpha) for r in y_posterior_samples])
1880 | 

def predict_highest_density_interval(self, X, alpha=0.05)

The highest density interval prediction containing a (1-alpha) fraction of the posterior samples.

Parameter X: The observed data, independent variables (matrix like)

Parameter alpha: The total size of the tails (Float between 0 and 1)

Returns: Left and right interval bounds for each input (matrix like)

1903 | def predict_highest_density_interval(self, X, alpha=0.05):
1904 |     """The highest density interval prediction containing a (1-alpha) fraction of the posterior samples.
1905 |     Parameter X: The observed data, independent variables (matrix like)
1906 |     Parameter alpha: The total size of the tails (Float between 0 and 1)
1907 |     Returns: Left and right interval bounds for each input (matrix like)
1908 |     """
1909 |     y_posterior_samples = self.predict_posterior_samples(X)
1910 |     return np.array([highest_density_interval(r, alpha=alpha) for r in y_posterior_samples])
1911 | 

def predict_posterior_samples(self, X)

Simulate posterior samples for a collection of observations.

Parameter X: The observed data, independent variables (matrix like)

Returns: Posterior samples of the prediction for each observation (matrix like)

1933 | def predict_posterior_samples(self, X):
1934 |     """Simulate posterior samples for a collection of observations.
1935 |     Parameter X: The observed data, independent variables (matrix like)
1936 |     Returns: Posterior samples of the prediction for each observation (matrix like)
1937 |     """
1938 |     # Returns a len(X) x self.n_replications matrix
1939 |     y_posterior_samples = np.zeros((len(X), self.n_replications))
1940 |     for i, m in enumerate(self.base_models_):
1941 |         y_posterior_samples[:,i] = m.predict(X)
1942 |     return y_posterior_samples
1943 | 
Instance variables

var base_learner

var memo

var n_replications

var resample_size
--------------------------------------------------------------------------------
/docs/build.py:
--------------------------------------------------------------------------------
1 | import pdoc
2 | s = pdoc.html('bayesian_bootstrap.bootstrap')
3 | with open('bootstrap_documentation.html', 'w') as f:
4 |     f.write(s)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.22.1
2 | scipy>=1.7.3
3 | scikit-learn>=1.0.2
4 | tqdm>=4.62.3
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup  # setuptools, not distutils: install_requires is a setuptools feature
2 | 
3 | with open("./requirements.txt") as f:
4 |     REQUIRES = [line.strip() for line in f]
5 | 
6 | setup(
7 |     name = "bayesian_bootstrap",
8 |     packages = ["bayesian_bootstrap"],
9 |     version = "1.1.0",
10 |     description = "Bayesian Bootstrapping for statistics and regression models",
11 |     author = "Louis Cialdella",
12 |     author_email = "louiscialdella@gmail.com",
13 |     url = "https://github.com/lmc2179/bayesian_bootstrap",
14 |     download_url = "https://github.com/lmc2179/bayesian_bootstrap/archive/master.zip",
15 |     keywords = ["statistics", "bayesian", "machine learning", "bootstrap", "bayes", "probability", "inference"],
16 |     install_requires=REQUIRES,
17 |     classifiers = [
18 |         "Programming Language :: Python",
19 |         "Programming Language :: Python :: 3",
20 |         "Intended Audience :: Developers",
21 |         "Intended Audience :: Science/Research",
22 |         "Topic :: Software Development :: Libraries :: Python Modules",
23 |         "Topic :: Scientific/Engineering",
24 |         "Operating System :: OS Independent",
25 |         "Topic :: Scientific/Engineering :: Mathematics",
26 |         "Topic :: Scientific/Engineering :: Artificial Intelligence"
27 |     ],
28 |     long_description = """bayesian_bootstrap is a package for Bayesian bootstrapping in Python. For more information about this package and its usage, visit https://github.com/lmc2179/bayesian_bootstrap."""
29 | )
30 | 
--------------------------------------------------------------------------------