├── .github
│   └── workflows
│       └── python-publish.yml
├── .gitignore
├── .travis.yml
├── LICENSE
├── MANIFEST.in
├── README.md
├── bayesian_bootstrap
│   ├── __init__.py
│   ├── demos
│   │   ├── __init__.py
│   │   ├── demos.py
│   │   ├── group_mean_secret_weapon.py
│   │   ├── linear_regression.py
│   │   ├── readme_exponential.png
│   │   └── readme_regression.png
│   └── tests
│       └── test_bootstrap.py
├── docs
│   ├── bootstrap_documentation.html
│   └── build.py
├── requirements.txt
└── setup.py
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
3 |
4 | # This workflow uses actions that are not certified by GitHub.
5 | # They are provided by a third-party and are governed by
6 | # separate terms of service, privacy policy, and support
7 | # documentation.
8 |
9 | name: Upload Python Package
10 |
11 | on:
12 | release:
13 | types: [published]
14 |
15 | jobs:
16 | deploy:
17 |
18 | runs-on: ubuntu-latest
19 |
20 | steps:
21 | - uses: actions/checkout@v2
22 | - name: Set up Python
23 | uses: actions/setup-python@v2
24 | with:
25 | python-version: '3.x'
26 | - name: Install dependencies
27 | run: |
28 | python -m pip install --upgrade pip
29 | pip install build
30 | - name: Build package
31 | run: python -m build
32 | - name: Publish package
33 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
34 | with:
35 | user: __token__
36 | password: ${{ secrets.PYPI_API_TOKEN }}
37 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # Created by https://www.toptal.com/developers/gitignore/api/macos,python,vim
3 | # Edit at https://www.toptal.com/developers/gitignore?templates=macos,python,vim
4 |
5 | ### macOS ###
6 | # General
7 | .DS_Store
8 | .AppleDouble
9 | .LSOverride
10 |
11 | # Icon must end with two \r
12 | Icon
13 |
14 | # Thumbnails
15 | ._*
16 |
17 | # Files that might appear in the root of a volume
18 | .DocumentRevisions-V100
19 | .fseventsd
20 | .Spotlight-V100
21 | .TemporaryItems
22 | .Trashes
23 | .VolumeIcon.icns
24 | .com.apple.timemachine.donotpresent
25 |
26 | # Directories potentially created on remote AFP share
27 | .AppleDB
28 | .AppleDesktop
29 | Network Trash Folder
30 | Temporary Items
31 | .apdisk
32 |
33 | ### Python ###
34 | # Byte-compiled / optimized / DLL files
35 | __pycache__/
36 | *.py[cod]
37 | *$py.class
38 |
39 | # C extensions
40 | *.so
41 |
42 | # Distribution / packaging
43 | .Python
44 | build/
45 | develop-eggs/
46 | dist/
47 | downloads/
48 | eggs/
49 | .eggs/
50 | lib/
51 | lib64/
52 | parts/
53 | sdist/
54 | var/
55 | wheels/
56 | share/python-wheels/
57 | *.egg-info/
58 | .installed.cfg
59 | *.egg
60 | MANIFEST
61 |
62 | # PyInstaller
63 | # Usually these files are written by a python script from a template
64 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
65 | *.manifest
66 | *.spec
67 |
68 | # Installer logs
69 | pip-log.txt
70 | pip-delete-this-directory.txt
71 |
72 | # Unit test / coverage reports
73 | htmlcov/
74 | .tox/
75 | .nox/
76 | .coverage
77 | .coverage.*
78 | .cache
79 | nosetests.xml
80 | coverage.xml
81 | *.cover
82 | *.py,cover
83 | .hypothesis/
84 | .pytest_cache/
85 | cover/
86 |
87 | # Translations
88 | *.mo
89 | *.pot
90 |
91 | # Django stuff:
92 | *.log
93 | local_settings.py
94 | db.sqlite3
95 | db.sqlite3-journal
96 |
97 | # Flask stuff:
98 | instance/
99 | .webassets-cache
100 |
101 | # Scrapy stuff:
102 | .scrapy
103 |
104 | # Sphinx documentation
105 | docs/_build/
106 |
107 | # PyBuilder
108 | .pybuilder/
109 | target/
110 |
111 | # Jupyter Notebook
112 | .ipynb_checkpoints
113 |
114 | # IPython
115 | profile_default/
116 | ipython_config.py
117 |
118 | # pyenv
119 | # For a library or package, you might want to ignore these files since the code is
120 | # intended to run in multiple environments; otherwise, check them in:
121 | # .python-version
122 |
123 | # pipenv
124 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
125 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
126 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
127 | # install all needed dependencies.
128 | #Pipfile.lock
129 |
130 | # poetry
131 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
132 | # This is especially recommended for binary packages to ensure reproducibility, and is more
133 | # commonly ignored for libraries.
134 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
135 | #poetry.lock
136 |
137 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
138 | __pypackages__/
139 |
140 | # Celery stuff
141 | celerybeat-schedule
142 | celerybeat.pid
143 |
144 | # SageMath parsed files
145 | *.sage.py
146 |
147 | # Environments
148 | .env
149 | .venv
150 | env/
151 | venv/
152 | ENV/
153 | env.bak/
154 | venv.bak/
155 |
156 | # Spyder project settings
157 | .spyderproject
158 | .spyproject
159 |
160 | # Rope project settings
161 | .ropeproject
162 |
163 | # mkdocs documentation
164 | /site
165 |
166 | # mypy
167 | .mypy_cache/
168 | .dmypy.json
169 | dmypy.json
170 |
171 | # Pyre type checker
172 | .pyre/
173 |
174 | # pytype static type analyzer
175 | .pytype/
176 |
177 | # Cython debug symbols
178 | cython_debug/
179 |
180 | # PyCharm
181 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
182 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
183 | # and can be added to the global gitignore or merged into this file. For a more nuclear
184 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
185 | #.idea/
186 |
187 | ### Vim ###
188 | # Swap
189 | [._]*.s[a-v][a-z]
190 | !*.svg # comment out if you don't need vector files
191 | [._]*.sw[a-p]
192 | [._]s[a-rt-v][a-z]
193 | [._]ss[a-gi-z]
194 | [._]sw[a-p]
195 |
196 | # Session
197 | Session.vim
198 | Sessionx.vim
199 |
200 | # Temporary
201 | .netrwhist
202 | *~
203 | # Auto-generated tag files
204 | tags
205 | # Persistent undo
206 | [._]*.un~
207 |
208 | # End of https://www.toptal.com/developers/gitignore/api/macos,python,vim
209 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - "3.5"
4 | - "3.6"
5 | - "3.7"
6 | - "3.8"
7 | addons:
8 | apt:
9 | packages:
10 | - libblas-dev
11 | - liblapack-dev
12 | - gfortran
13 | - graphviz
14 | before_install:
15 | - pip install -U pip setuptools wheel
16 | install:
17 | - travis_wait travis_retry pip install -r requirements.txt
18 | script: "nosetests bayesian_bootstrap/tests"
19 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Louis Cialdella
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | # file GENERATED by distutils, do NOT edit
2 | include setup.py
3 | include bayesian_bootstrap/__init__.py
4 | include bayesian_bootstrap/bootstrap.py
5 | include README.md
6 | include requirements.txt
7 | include docs/bootstrap_documentation.html
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # `bayesian_bootstrap`  [![PyPI version](https://badge.fury.io/py/bayesian_bootstrap.svg)](https://badge.fury.io/py/bayesian_bootstrap)
2 |
3 | `bayesian_bootstrap` is a package for Bayesian bootstrapping in Python. For an overview of the Bayesian bootstrap, I highly recommend reading [Rasmus Bååth's writeup](http://www.sumsar.net/blog/2015/04/the-non-parametric-bootstrap-as-a-bayesian-model/). This Python package is similar to his [R package](http://www.sumsar.net/blog/2016/02/bayesboot-an-r-package/).
4 |
5 | This README contains some examples below. For the documentation of the package's API, see the [docs](http://htmlpreview.github.io/?https://github.com/lmc2179/bayesian_bootstrap/blob/master/docs/bootstrap_documentation.html).
6 |
7 | This package is on PyPI - you can install it with `pip install bayesian_bootstrap`.
8 |
9 | # Overview of the `bayesian_bootstrap` module
10 |
11 | This module contains tools for doing approximate Bayesian inference using the Bayesian Bootstrap introduced in [Rubin's _The Bayesian Bootstrap_](https://projecteuclid.org/euclid.aos/1176345338).
12 |
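At its core, the Bayesian bootstrap replaces the classical bootstrap's integer resampling counts with continuous weights drawn from a flat Dirichlet distribution. A minimal sketch of a single posterior draw, mirroring the implementation in `bayesian_bootstrap/__init__.py`:

```
import numpy as np

X = np.array([2.1, 1.7, 3.2, 0.9])
w = np.random.default_rng().dirichlet(np.ones(len(X)))  # one draw of the posterior weights
posterior_mean_draw = np.dot(X, w)                      # one draw from the posterior of the mean
```
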
13 | It contains the following:
14 |
15 | * The `mean` and `var` functions, which simulate the posterior distributions of the mean and variance
16 |
17 | * The `bayesian_bootstrap` function, which simulates the posterior distribution of an arbitrary statistic
18 |
19 | * The `BayesianBootstrapBagging` class, a wrapper allowing users to generate ensembles of regressors/classifiers
20 | using Bayesian Bootstrap resampling. A scikit-learn-like base estimator needs to be provided. See also
21 | the `bayesian_bootstrap_regression` function.
22 |
23 | * The `central_credible_interval` and `highest_density_interval` functions, which compute credible intervals from
24 | posterior samples.
25 |
26 | For more information about the function signatures above, see the examples below or the docstrings of each function/class.
27 |
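For instance, both interval helpers take posterior samples and return a `(left, right)` tuple - a quick sketch, assuming the package is installed:

```
from bayesian_bootstrap import mean, central_credible_interval, highest_density_interval

posterior = mean([2.1, 1.7, 3.2, 0.9], 10000)
l, r = central_credible_interval(posterior, alpha=0.05)   # equal-tailed 95% interval
l, r = highest_density_interval(posterior, alpha=0.05)    # narrowest 95% interval
```
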
28 | One thing that's worth making clear is the interpretation of the parameters of the `bayesian_bootstrap`, `BayesianBootstrapBagging`, and `bayesian_bootstrap_regression` functions, which all do sampling within each bootstrap replication:
29 |
30 | * The number of replications is the number of times the statistic of interest will be replicated. If we think about the classical bootstrap, this is the number of times your dataset is resampled. If we think about it from a Bayesian point of view, this is the number of draws from the posterior distribution.
31 |
32 | * The resample size is the size of the dataset used to calculate the statistic of interest in each replication. More is better - you'll probably want this to be at least as large as your original dataset. The sketch below shows how the two parameters fit together.
33 |
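As a concrete sketch of how the two parameters are passed (the data here is made up):

```
import numpy as np
from bayesian_bootstrap import bayesian_bootstrap

X = np.random.exponential(7, 100)
# 10,000 posterior draws; each draw recomputes the median on a resample of size 100
posterior_medians = bayesian_bootstrap(X, np.median, n_replications=10000, resample_size=100)
```
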
34 | # Example: Estimating the mean
35 | Let's say that we observe some data points, and we wish to simulate the posterior distribution of their mean.
36 |
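The snippets in this example and the regression example below assume the following imports:

```
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression         # regression example
from bayesian_bootstrap import BayesianBootstrapBagging   # regression example
```
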
37 | The following code draws four data points from an exponential distribution:
38 | ```
39 | X = np.random.exponential(7, 4)
40 | ```
41 | Now, we are going to simulate draws from the posterior of the mean. `bayesian_bootstrap` includes a `mean` function
42 | that will do this for you.
43 |
44 | The code below performs the simulation and calculates the 95% highest density interval using 10,000 bootstrap replications. It also uses the wonderful
45 | `seaborn` library to visualize the histogram with a kernel density estimate.
46 |
47 | Included for reference in the image is the same dataset used in a classical bootstrap, to illustrate the comparative
48 | smoothness of the Bayesian version.
49 | ```
50 | from bayesian_bootstrap import mean, highest_density_interval
51 | posterior_samples = mean(X, 10000)
52 | l, r = highest_density_interval(posterior_samples)
53 |
54 | plt.title('Bayesian Bootstrap of mean')
55 | sns.distplot(posterior_samples, label='Bayesian Bootstrap Samples')
56 | plt.plot([l, r], [0, 0], linewidth=5.0, marker='o', label='95% HDI')
57 | ```
58 |
59 | The above code uses the `mean` function to simulate the posterior distribution of the mean. However, it is a special
60 | (if very common) case, along with `var` - all other statistics should be simulated via the `bayesian_bootstrap` function. The
61 | following code demonstrates doing this for the posterior of the mean:
62 |
63 | ```
64 | from bayesian_bootstrap import bayesian_bootstrap
65 | posterior_samples = bayesian_bootstrap(X, np.mean, 10000, 100)
66 | ```
67 |
68 | ![Posterior distribution of the mean, Bayesian vs. classical bootstrap](bayesian_bootstrap/demos/readme_exponential.png)
69 |
70 | # Example: Regression modelling
71 |
82 | Let's take another example - fitting a linear regression model. The following code samples a few points in the plane.
83 | The mean is y = x, and normally distributed noise is added.
84 | ```
85 | X = np.random.normal(0, 1, 5).reshape(-1, 1)
86 | y = X.ravel() + np.random.normal(0, 1, 5)
87 | ```
88 | We build an ensemble of models via bootstrap aggregating: each base model is trained on a Bayesian bootstrap resample. A
89 | `BayesianBootstrapBagging` wrapper class is available in the library, which is a Bayesian analogue to scikit-learn's
90 | `BaggingRegressor` and `BaggingClassifier` classes.
91 | ```
92 | m = BayesianBootstrapBagging(LinearRegression(), 10000, 1000)
93 | m.fit(X, y)
94 | ```
95 | Once we've got our ensemble trained, we can make interval predictions for new inputs by calculating their HDIs under the
96 | ensemble:
97 | ```
98 | X_plot = np.linspace(min(X), max(X))
99 | y_predicted = m.predict(X_plot.reshape(-1, 1))
100 | y_predicted_interval = m.predict_highest_density_interval(X_plot.reshape(-1, 1), 0.05)
101 |
102 | plt.scatter(X.reshape(1, -1), y)
103 | plt.plot(X_plot, y_predicted, label='Mean')
104 | plt.plot(X_plot, y_predicted_interval[:,0], label='95% HDI Lower bound')
105 | plt.plot(X_plot, y_predicted_interval[:,1], label='95% HDI Upper bound')
106 | plt.legend()
107 | plt.savefig('readme_regression.png', bbox_inches='tight')
108 | ```
109 | ![Posterior mean predictions with 95% HDI bounds](bayesian_bootstrap/demos/readme_regression.png)
110 |
111 | Users interested in accessing the base models can do so via the `base_models_` attribute of the object.
112 |
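For example, to inspect the posterior over the fitted parameters (a sketch reusing the fitted ensemble `m` from the snippet above):

```
coef_samples = [model.coef_[0] for model in m.base_models_]
intercept_samples = [model.intercept_ for model in m.base_models_]
```
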
113 | # Contributions
114 |
115 | Interested in contributing? We'd love to have your help! Please keep the following in mind:
116 |
117 | * Bug fixes are welcome! Make sure you reference the issue number that is being resolved, and that all test cases in `tests` pass.
118 |
119 | * New features are welcome as well! Any new features should include docstrings and unit tests in the `tests` directory.
120 |
121 | * If you want to contribute a case study or other documentation, feel free to write up a github-flavored markdown document or ipython notebook and put it in the `examples` folder before issuing a pull request.
122 |
123 | Credit for past contributions:
124 |
125 | * [roya0045](https://github.com/roya0045) implemented the original version of the low-memory optimizations.
126 | * [JulianWgs](https://github.com/JulianWgs) implemented the Bayesian machine learning model using weight distributions instead of resampling and a weighted Pearson correlation coefficient. He also refactored the weighted mean and covariance function to accept weight matrices.
127 | * [genos](https://github.com/genos) simplified importing and updated the RNG usage to the current numpy standard.
128 |
129 | # Further reading
130 |
131 | * [_The Bayesian Bootstrap_, Rubin, 1981](https://projecteuclid.org/euclid.aos/1176345338)
132 |
133 | * [Rasmus Bååth's original writeup on the Bayesian Bootstrap](http://www.sumsar.net/blog/2015/04/the-non-parametric-bootstrap-as-a-bayesian-model/)
134 |
--------------------------------------------------------------------------------
/bayesian_bootstrap/__init__.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from copy import deepcopy
3 |
4 |
5 | def mean(X, n_replications, seed=None):
6 | """Simulate the posterior distribution of the mean.
7 |
8 | Parameter X: The observed data (array like)
9 |
10 | Parameter n_replications: The number of bootstrap replications to perform (positive integer)
11 |
12 | Parameter seed: Seed for PRNG (default None)
13 |
14 | Returns: Samples from the posterior
15 | """
16 | weights = np.random.default_rng(seed).dirichlet(np.ones(len(X)), n_replications)
17 | return np.dot(X, weights.T)
18 |
19 |
20 | def var(X, n_replications, seed=None):
21 | """Simulate the posterior distribution of the variance.
22 |
23 | Parameter X: The observed data (array like)
24 |
25 | Parameter n_replications: The number of bootstrap replications to perform (positive integer)
26 |
27 | Parameter seed: Seed for PRNG (default None)
28 |
29 | Returns: Samples from the posterior
30 | """
31 | samples = []
32 | weights = np.random.default_rng(seed).dirichlet([1] * len(X), n_replications)
33 | for w in weights:
34 |         samples.append(np.dot(np.asarray(X) ** 2, w) - np.dot(X, w) ** 2)
35 | return samples
36 |
37 |
38 | def covar(X, Y, n_replications, seed=None):
39 | """Simulate the posterior distribution of the covariance.
40 |
41 | Parameter X: The observed data, first variable (array like)
42 |
43 |     Parameter Y: The observed data, second variable (array like)
44 |
45 | Parameter n_replications: The number of bootstrap replications to perform (positive integer)
46 |
47 | Parameter seed: Seed for PRNG (default None)
48 |
49 | Returns: Samples from the posterior
50 | """
51 | samples = []
52 | weights = np.random.default_rng(seed).dirichlet([1] * len(X), n_replications)
53 | for w in weights:
54 | cv = _weighted_covariance(X, Y, w)
55 | samples.append(cv)
56 | return samples
57 |
58 |
59 | def pearsonr(X, Y, n_replications, seed=None):
60 | """
61 |     Simulate the posterior distribution of the Pearson correlation coefficient of X and Y (returns posterior samples, not the point estimate and p-value of scipy.stats.pearsonr).
62 |
63 | https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.pearsonr.html
64 |
65 | """
66 | weights = np.random.default_rng(seed).dirichlet(np.ones(len(X)), n_replications)
67 | return _weighted_pearsonr(X, Y, weights)
68 |
69 |
70 | def _weighted_covariance(X, Y, w):
71 | X_mean = np.dot(X, w.T).reshape(-1, 1)
72 | Y_mean = np.dot(Y, w.T).reshape(-1, 1)
73 | # Another approach, but less efficient
74 | # np.diag(np.dot(w, (x - X_mean) * (y - Y_mean)).T)
75 | # https://stackoverflow.com/a/14759273
76 | return (w * ((X - X_mean) * (Y - Y_mean))).sum(-1)
77 |
78 |
79 | def _weighted_pearsonr(X, Y, w):
80 | """
81 | Weighted Pearson correlation.
82 |
83 | """
84 | return _weighted_covariance(X, Y, w) / np.sqrt(_weighted_covariance(X, X, w) * _weighted_covariance(Y, Y, w))
85 |
86 |
87 | def _weighted_ls(X, w, y):
88 |     # Weighted least squares: solve (X^T W X) beta = X^T W y.
89 |     # np.linalg.solve is more numerically stable than forming the inverse explicitly.
90 |     w_matrix = np.diag(w)
91 |     xtw = np.dot(X.T, w_matrix)
92 |     normal_matrix = np.dot(xtw, X)
93 |     coef = np.linalg.solve(normal_matrix, np.dot(xtw, y))
94 |     return coef
95 |
96 |
97 | def linear_regression(X, y, n_replications, seed=None):  # posterior over weighted-least-squares coefficients; no intercept column is added
98 | coef_samples = []
99 | weights = np.random.default_rng(seed).dirichlet([1] * len(X), n_replications)
100 | for w in weights:
101 | coef_samples.append(_weighted_ls(X, w, y))
102 | return np.vstack(coef_samples)
103 |
104 |
105 | def bayesian_bootstrap(X, statistic, n_replications, resample_size, low_mem=False, seed=None):
106 | """Simulate the posterior distribution of the given statistic.
107 |
108 | Parameter X: The observed data (array like)
109 |
110 | Parameter statistic: A function of the data to use in simulation (Function mapping array-like to number)
111 |
112 | Parameter n_replications: The number of bootstrap replications to perform (positive integer)
113 |
114 | Parameter resample_size: The size of the dataset in each replication
115 |
116 | Parameter low_mem(bool): Generate the weights for each iteration lazily instead of in a single batch. Will use
117 | less memory, but will run slower as a result.
118 |
119 | Parameter seed: Seed for PRNG (default None)
120 |
121 | Returns: Samples from the posterior
122 | """
123 | if isinstance(X, list):
124 | X = np.array(X)
125 | samples = []
126 | rng = np.random.default_rng(seed)
127 | if low_mem:
128 | weights = (rng.dirichlet([1] * len(X)) for _ in range(n_replications))
129 | else:
130 | weights = rng.dirichlet([1] * len(X), n_replications)
131 | for w in weights:
132 | sample_index = rng.choice(range(len(X)), p=w, size=resample_size)
133 | resample_X = X[sample_index]
134 | s = statistic(resample_X)
135 | samples.append(s)
136 | return samples
137 |
138 |
139 | def bayesian_bootstrap_regression(X, y, statistic, n_replications, resample_size, low_mem=False, seed=None):
140 | """Simulate the posterior distribution of a statistic that uses dependent and independent variables.
141 |
142 | Parameter X: The observed data, independent variables (matrix like)
143 |
144 | Parameter y: The observed data, dependent variable (array like)
145 |
146 | Parameter statistic: A function of the data to use in simulation (Function mapping array-like to number)
147 |
148 | Parameter n_replications: The number of bootstrap replications to perform (positive integer)
149 |
150 | Parameter resample_size: The size of the dataset in each replication
151 |
152 |     Parameter low_mem(bool): Generate the weights for each replication lazily instead of in a single batch; uses less memory but runs slower
153 |
154 | Parameter seed: Seed for PRNG (default None)
155 |
156 | Returns: Samples from the posterior
157 | """
158 | samples = []
159 | X_arr = np.array(X)
160 | y_arr = np.array(y)
161 | rng = np.random.default_rng(seed)
162 | if low_mem:
163 | weights = (rng.dirichlet([1] * len(X)) for _ in range(n_replications))
164 | else:
165 | weights = rng.dirichlet([1] * len(X), n_replications)
166 | for w in weights:
167 | if resample_size is None:
168 | s = statistic(X, y, w)
169 | else:
170 | resample_i = rng.choice(range(len(X_arr)), p=w, size=resample_size)
171 | resample_X = X_arr[resample_i]
172 | resample_y = y_arr[resample_i]
173 | s = statistic(resample_X, resample_y)
174 | samples.append(s)
175 |
176 | return samples
177 |
178 |
179 | class BayesianBootstrapBagging:
180 | """A bootstrap aggregating model using the bayesian bootstrap. Similar to scikit-learn's BaggingRegressor."""
181 |
182 | def __init__(self, base_learner, n_replications, resample_size=None, low_mem=False, seed=None):
183 | """Initialize the base learners of the ensemble.
184 |
185 | Parameter base_learner: A scikit-learn like estimator. This object should implement a fit() and predict()
186 | method.
187 |
188 | Parameter n_replications: The number of bootstrap replications to perform (positive integer)
189 |
190 | Parameter resample_size: The size of the dataset in each replication
191 |
192 | Parameter low_mem(bool): Generate the weights for each iteration lazily instead of in a single batch. Will use
193 | less memory, but will run slower as a result.
194 |
195 | Parameter seed: Seed for PRNG (default None)
196 | """
197 | self.base_learner = base_learner
198 | self.n_replications = n_replications
199 | self.resample_size = resample_size
200 | self.memo = low_mem
201 | self.seed = seed
202 |
203 | def fit(self, X, y):
204 | """Fit the base learners of the ensemble on a dataset.
205 |
206 | Parameter X: The observed data, independent variables (matrix like)
207 |
208 | Parameter y: The observed data, dependent variable (array like)
209 |
210 | Returns: Fitted model
211 | """
212 | if self.resample_size is None:
213 | statistic = lambda X, y, w: deepcopy(self.base_learner).fit(X, y, w) # noqa: E731
214 | else:
215 | statistic = lambda X, y: deepcopy(self.base_learner).fit(X, y) # noqa: E731
216 | self.base_models_ = bayesian_bootstrap_regression(
217 | X, y, statistic, self.n_replications, self.resample_size, low_mem=self.memo, seed=self.seed
218 | )
219 | return self
220 |
221 | def predict(self, X):
222 | """Make average predictions for a collection of observations.
223 |
224 | Parameter X: The observed data, independent variables (matrix like)
225 |
226 | Returns: The predicted dependent variable values (array like)
227 | """
228 | y_posterior_samples = self.predict_posterior_samples(X)
229 | return np.array([np.mean(r) for r in y_posterior_samples])
230 |
231 | def predict_posterior_samples(self, X):
232 | """Simulate posterior samples for a collection of observations.
233 |
234 | Parameter X: The observed data, independent variables (matrix like)
235 |
236 | Returns: The simulated posterior mean (matrix like)
237 | """
238 | # Return a X_r x self.n_replications matrix
239 | y_posterior_samples = np.zeros((len(X), self.n_replications))
240 | for i, m in enumerate(self.base_models_):
241 | y_posterior_samples[:, i] = m.predict(X)
242 | return y_posterior_samples
243 |
244 | def predict_central_interval(self, X, alpha=0.05):
245 | """The equal-tailed interval prediction containing a (1-alpha) fraction of the posterior samples.
246 |
247 | Parameter X: The observed data, independent variables (matrix like)
248 |
249 | Parameter alpha: The total size of the tails (Float between 0 and 1)
250 |
251 | Returns: Left and right interval bounds for each input (matrix like)
252 | """
253 | y_posterior_samples = self.predict_posterior_samples(X)
254 | return np.array([central_credible_interval(r, alpha=alpha) for r in y_posterior_samples])
255 |
256 | def predict_highest_density_interval(self, X, alpha=0.05):
257 | """The highest density interval prediction containing a (1-alpha) fraction of the posterior samples.
258 |
259 | Parameter X: The observed data, independent variables (matrix like)
260 |
261 | Parameter alpha: The total size of the tails (Float between 0 and 1)
262 |
263 | Returns: Left and right interval bounds for each input (matrix like):
264 | """
265 | y_posterior_samples = self.predict_posterior_samples(X)
266 | return np.array([highest_density_interval(r, alpha=alpha) for r in y_posterior_samples])
267 |
268 |
269 | def central_credible_interval(samples, alpha=0.05):
270 | """The equal-tailed interval containing a (1-alpha) fraction of the posterior samples.
271 |
272 | Parameter samples: The posterior samples (array like)
273 |
274 | Parameter alpha: The total size of the tails (Float between 0 and 1)
275 |
276 | Returns: Left and right interval bounds (tuple)
277 | """
278 | return np.quantile(samples, alpha / 2), np.quantile(samples, 1 - alpha / 2)
279 |
280 |
281 | def highest_density_interval(samples, alpha=0.05):
282 | """The highest-density interval containing a (1-alpha) fraction of the posterior samples.
283 |
284 | Parameter samples: The posterior samples (array like)
285 |
286 | Parameter alpha: The total size of the tails (Float between 0 and 1)
287 |
288 | Returns: Left and right interval bounds (tuple)
289 | """
290 | samples_sorted = sorted(samples)
291 | window_size = int(len(samples) - round(len(samples) * alpha))
292 | smallest_window = (None, None)
293 | smallest_window_length = float("inf")
294 |     for i in range(len(samples_sorted) - window_size + 1):  # + 1 so the right-most window is also considered
295 |         window = samples_sorted[i], samples_sorted[i + window_size - 1]
296 |         window_length = window[1] - window[0]
297 |         if window_length < smallest_window_length:
298 |             smallest_window_length = window_length
299 |             smallest_window = window
300 |     return smallest_window
301 |
302 |
303 | def _bootstrap_replicate(X, seed=None):  # one draw of Dirichlet(1, ..., 1) weights via sorted-uniform spacings
304 | random_points = sorted(np.random.default_rng(seed).uniform(0, 1, len(X) - 1))
305 | random_points.append(1)
306 | random_points.insert(0, 0)
307 | gaps = [right - left for left, right in zip(random_points[:-1], random_points[1:])]
308 | return np.array(gaps)
309 |
--------------------------------------------------------------------------------
/bayesian_bootstrap/demos/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lmc2179/bayesian_bootstrap/93b8cf41b0675ec24a18e554f5011cdd07de7d91/bayesian_bootstrap/demos/__init__.py
--------------------------------------------------------------------------------
/bayesian_bootstrap/demos/demos.py:
--------------------------------------------------------------------------------
1 | from matplotlib import pyplot as plt
2 | import seaborn as sns
3 | from sklearn.linear_model import LinearRegression
4 | from sklearn.utils import resample
5 | from bayesian_bootstrap import (
6 | mean,
7 | var,
8 | bayesian_bootstrap,
9 | bayesian_bootstrap_regression,
10 | BayesianBootstrapBagging,
11 | highest_density_interval,
12 | covar,
13 | )
14 | from tqdm import tqdm
15 | import numpy as np
16 |
17 |
18 | def plot_mean_bootstrap():
19 | X = [-1, 0, 1]
20 | posterior_samples = mean(X, 10000)
21 | sns.distplot(posterior_samples)
22 | classical_samples = [np.mean(resample(X)) for _ in range(10000)]
23 | sns.distplot(classical_samples)
24 | plt.show()
25 |
26 |
27 | def plot_mean_resample_bootstrap():
28 | X = [-1, 0, 1]
29 | posterior_samples = bayesian_bootstrap(X, np.mean, 10000, 100)
30 | sns.distplot(posterior_samples)
31 | classical_samples = [np.mean(resample(X)) for _ in range(10000)]
32 | sns.distplot(classical_samples)
33 | plt.show()
34 |
35 |
36 | def plot_median():
37 | X = np.random.uniform(-1, 1, 10)
38 | posterior_samples = bayesian_bootstrap(X, np.median, 10000, 100)
39 | sns.distplot(posterior_samples)
40 | classical_samples = [np.median(resample(X)) for _ in range(10000)]
41 | sns.distplot(classical_samples)
42 | plt.show()
43 |
44 |
45 | def plot_var_bootstrap():
46 | X = np.random.uniform(-1, 1, 100)
47 | posterior_samples = var(X, 10000)
48 | sns.distplot(posterior_samples)
49 | classical_samples = [np.var(resample(X)) for _ in range(10000)]
50 | sns.distplot(classical_samples)
51 | plt.show()
52 |
53 |
54 | def plot_self_covar_bootstrap():
55 | X = np.random.uniform(-1, 1, 100)
56 | posterior_samples = covar(X, X, 10000)
57 | sns.distplot(posterior_samples)
58 | plt.show()
59 |
60 |
61 | def plot_covar_bootstrap():
62 | X = np.random.normal(0, 1, 100)
63 | Y = np.random.normal(0, 1, 100)
64 | posterior_samples = covar(X, Y, 10000)
65 | sns.distplot(posterior_samples)
66 | plt.show()
67 |
68 |
69 | def plot_var_resample_bootstrap():
70 | X = np.random.uniform(-1, 1, 100)
71 | posterior_samples = bayesian_bootstrap(X, np.var, 10000, 500)
72 | sns.distplot(posterior_samples)
73 | classical_samples = [np.var(resample(X)) for _ in range(10000)]
74 | sns.distplot(classical_samples)
75 | plt.show()
76 |
77 |
78 | def plot_mean_method_comparison():
79 | X = np.random.exponential(scale=1, size=8)
80 | classical_samples = [np.mean(resample(X)) for _ in range(10000)]
81 | posterior_samples_resample = bayesian_bootstrap(X, np.mean, 10000, 1000)
82 | posterior_samples_weighted = mean(X, 10000)
83 | sns.distplot(classical_samples)
84 | sns.distplot(posterior_samples_resample)
85 | sns.distplot(posterior_samples_weighted)
86 | plt.show()
87 |
88 |
89 | def plot_regression_bootstrap():
90 | X = np.array([[0], [1], [2], [3]])
91 | y = np.array([0, 1, 2, 3]) + np.random.normal(0, 1, 4)
92 | classical_samples = [LinearRegression().fit(*resample(X, y)).coef_ for _ in tqdm(range(10000))]
93 | posterior_samples = bayesian_bootstrap_regression(
94 | X, y, lambda X, y: LinearRegression().fit(X, y).coef_, 10000, 1000
95 | )
96 | plt.scatter(X.reshape(-1, 1), y)
97 | plt.show()
98 | sns.distplot(classical_samples)
99 | sns.distplot(posterior_samples)
100 | plt.show()
101 |
102 |
103 | def plot_regression_wrapper_bootstrap():
104 | X = np.array([[0], [1], [2], [3]])
105 | y = np.array([0, 1, 2, 3]) + np.random.normal(0, 1, 4)
106 | m = BayesianBootstrapBagging(LinearRegression(), 10000, 1000)
107 | m.fit(X, y)
108 | y_predicted = m.predict(X)
109 | y_predicted_interval = m.predict_central_interval(X, 0.05)
110 | plt.scatter(X.reshape(-1, 1), y)
111 | plt.plot(X.reshape(-1, 1), y_predicted)
112 | plt.plot(X.reshape(-1, 1), y_predicted_interval[:, 0])
113 | plt.plot(X.reshape(-1, 1), y_predicted_interval[:, 1])
114 | plt.show()
115 |
116 |
117 | def plot_mean_bootstrap_exponential_readme():
118 | X = np.random.exponential(7, 4)
119 | classical_samples = [np.mean(resample(X)) for _ in range(10000)]
120 | posterior_samples = mean(X, 10000)
121 | l, r = highest_density_interval(posterior_samples)
122 | classical_l, classical_r = highest_density_interval(classical_samples)
123 | plt.subplot(2, 1, 1)
124 | plt.title("Bayesian Bootstrap of mean")
125 | sns.distplot(posterior_samples, label="Bayesian Bootstrap Samples")
126 | plt.plot([l, r], [0, 0], linewidth=5.0, marker="o", label="95% HDI")
127 | plt.xlim(-1, 18)
128 | plt.legend()
129 | plt.subplot(2, 1, 2)
130 | plt.title("Classical Bootstrap of mean")
131 | sns.distplot(classical_samples, label="Classical Bootstrap Samples")
132 | plt.plot([classical_l, classical_r], [0, 0], linewidth=5.0, marker="o", label="95% HDI")
133 | plt.xlim(-1, 18)
134 | plt.legend()
135 | plt.savefig("readme_exponential.png", bbox_inches="tight")
136 |
137 |
138 | def plot_regression_slope_distribution_readme():
139 | X = np.random.normal(0, 1, 5).reshape(-1, 1)
140 |     y = X.ravel() + np.random.normal(0, 1, 5)
141 | m = BayesianBootstrapBagging(LinearRegression(), 10000, 1000)
142 | m.fit(X, y)
143 | X_plot = np.linspace(min(X), max(X))
144 | y_predicted = m.predict(X_plot.reshape(-1, 1))
145 | y_predicted_interval = m.predict_highest_density_interval(X_plot.reshape(-1, 1), 0.05)
146 | plt.scatter(X.reshape(1, -1), y)
147 | plt.plot(X_plot, y_predicted, label="Mean")
148 | plt.plot(X_plot, y_predicted_interval[:, 0], label="95% HDI Lower bound")
149 | plt.plot(X_plot, y_predicted_interval[:, 1], label="95% HDI Upper bound")
150 | plt.legend()
151 | plt.savefig("readme_regression.png", bbox_inches="tight")
152 |
153 |
154 | if __name__ == "__main__":
155 | # plot_mean_bootstrap()
156 | # plot_mean_resample_bootstrap()
157 | # plot_median()
158 | # plot_var_bootstrap()
159 | # plot_self_covar_bootstrap()
160 | plot_covar_bootstrap()
161 | # plot_var_resample_bootstrap()
162 | # plot_mean_method_comparison()
163 | # plot_regression_bootstrap()
164 | # plot_regression_wrapper_bootstrap()
165 | # plot_mean_bootstrap_exponential_readme()
166 | # plot_regression_slope_distribution_readme()
167 |
--------------------------------------------------------------------------------
/bayesian_bootstrap/demos/group_mean_secret_weapon.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from bayesian_bootstrap import mean, highest_density_interval
3 | from matplotlib import pyplot as plt
4 | import seaborn as sns # noqa: F401
5 |
6 |
7 | def plot_group_hdis(samples, labels, alpha, n_replications):
8 |     for i, s in enumerate(samples):
9 | posterior = mean(s, n_replications)
10 |         l, r = highest_density_interval(posterior, alpha=alpha)
11 | plt.plot([i, i], [l, r])
12 | plt.plot([i], [np.mean(posterior)], marker="o")
13 | plt.xticks(range(len(labels)), labels)
14 |
15 |
16 | if __name__ == "__main__":
17 | samples = [
18 | np.random.normal(0, 1, 100),
19 | np.random.normal(0, 2, 100),
20 | np.random.normal(1, 1, 100),
21 | ]
22 | labels = ["0,1", "0,2", "1,1"]
23 | plot_group_hdis(samples, labels, 0.05, 10000)
24 | plt.show()
25 |
--------------------------------------------------------------------------------
/bayesian_bootstrap/demos/linear_regression.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from matplotlib import pyplot as plt
3 | import seaborn as sns
4 | from bayesian_bootstrap import linear_regression
5 |
6 | X = np.linspace(-5, 5, 50)
7 | y = 2 * X + np.random.normal(0, 1, 50)
8 | results = linear_regression(X.reshape(-1, 1), y, 1000)
9 | sns.distplot(results[:, 0])
10 | plt.show()
11 |
--------------------------------------------------------------------------------
/bayesian_bootstrap/demos/readme_exponential.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lmc2179/bayesian_bootstrap/93b8cf41b0675ec24a18e554f5011cdd07de7d91/bayesian_bootstrap/demos/readme_exponential.png
--------------------------------------------------------------------------------
/bayesian_bootstrap/demos/readme_regression.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lmc2179/bayesian_bootstrap/93b8cf41b0675ec24a18e554f5011cdd07de7d91/bayesian_bootstrap/demos/readme_regression.png
--------------------------------------------------------------------------------
/bayesian_bootstrap/tests/test_bootstrap.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import numpy as np
3 | import scipy
4 | import bayesian_bootstrap as bb
5 | from bayesian_bootstrap import (
6 | mean,
7 | var,
8 | bayesian_bootstrap,
9 | central_credible_interval,
10 | highest_density_interval,
11 | BayesianBootstrapBagging,
12 | covar,
13 | )
14 | from sklearn.linear_model import LinearRegression
15 |
16 | RNG = np.random.default_rng(1337) # repeatable pseudorandomness
17 |
18 |
19 | class TestMoments(unittest.TestCase):
20 | def test_mean(self):
21 | X = [-1, 0, 1]
22 | posterior_samples = mean(X, 10000)
23 | self.assertAlmostEqual(np.mean(posterior_samples), 0, delta=0.015)
24 | self.assertAlmostEqual(len([s for s in posterior_samples if s < 0]), 5000, delta=1000)
25 |
26 | def test_variance(self):
27 | X = RNG.uniform(-1, 1, 500)
28 | posterior_samples = var(X, 10000)
29 | self.assertAlmostEqual(np.mean(posterior_samples), 1 / 3.0, delta=0.05)
30 |
31 | def test_self_covar(self):
32 | X = RNG.uniform(-1, 1, 500)
33 | posterior_samples = covar(X, X, 10000)
34 | self.assertAlmostEqual(np.mean(posterior_samples), np.var(X), delta=0.05)
35 |
36 | def test_covar(self):
37 | X = RNG.uniform(-1, 1, 500)
38 | Y = RNG.uniform(-1, 1, 500)
39 | posterior_samples = covar(X, Y, 10000)
40 | self.assertAlmostEqual(np.mean(posterior_samples), 0, delta=0.05)
41 |
42 | def test_mean_resample(self):
43 | X = [-1, 0, 1]
44 | posterior_samples = bayesian_bootstrap(X, np.mean, 10000, 100, low_mem=True)
45 | self.assertAlmostEqual(np.mean(posterior_samples), 0, delta=0.01)
46 | self.assertAlmostEqual(len([s for s in posterior_samples if s < 0]), 5000, delta=1000)
47 | posterior_samples = bayesian_bootstrap(X, np.mean, 10000, 100, low_mem=False)
48 | self.assertAlmostEqual(np.mean(posterior_samples), 0, delta=0.01)
49 | self.assertAlmostEqual(len([s for s in posterior_samples if s < 0]), 5000, delta=1000)
50 |
51 | def test_var_resample(self):
52 | X = RNG.uniform(-1, 1, 500)
53 | posterior_samples = bayesian_bootstrap(X, np.var, 10000, 5000, low_mem=True)
54 | self.assertAlmostEqual(np.mean(posterior_samples), 1 / 3.0, delta=0.05)
55 | X = RNG.uniform(-1, 1, 500)
56 | posterior_samples = bayesian_bootstrap(X, np.var, 10000, 5000, low_mem=False)
57 | self.assertAlmostEqual(np.mean(posterior_samples), 1 / 3.0, delta=0.05)
58 |
59 |
60 | class TestIntervals(unittest.TestCase):
61 | def test_central_credible_interval(self):
62 | l, r = central_credible_interval(self._shuffle(range(10)), alpha=0.2)
63 | self.assertEqual(l, 0.9)
64 | self.assertEqual(r, 8.1)
65 | l, r = central_credible_interval(self._shuffle(range(10)), alpha=0.19)
66 | self.assertEqual(l, 0.855)
67 | self.assertEqual(r, 8.145)
68 | l, r = central_credible_interval(self._shuffle(range(20)), alpha=0.1)
69 | self.assertAlmostEqual(l, 0.95)
70 | self.assertEqual(r, 18.05)
71 |
72 | def test_hpdi(self):
73 | l, r = highest_density_interval(self._shuffle([0, 10, 1] + [1.1] * 7), alpha=0.2)
74 | self.assertEqual(l, 1)
75 | self.assertEqual(r, 1.1)
76 | l, r = highest_density_interval(self._shuffle([0, 10, 1.1, 1]), alpha=0.5)
77 | self.assertEqual(l, 1)
78 | self.assertEqual(r, 1.1)
79 |
80 | def _shuffle(self, x):
81 | x = list(x)
82 | RNG.shuffle(x)
83 | return x
84 |
85 |
86 | class TestRegression(unittest.TestCase):
87 | def test_parameter_estimation_resampling_low_memory(self):
88 | X = RNG.uniform(0, 4, 1000)
89 | y = X + RNG.normal(0, 1, 1000)
90 | m = BayesianBootstrapBagging(LinearRegression(), 10000, 1000, low_mem=True)
91 | m.fit(X.reshape(-1, 1), y)
92 | coef_samples = [b.coef_ for b in m.base_models_]
93 | intercept_samples = [b.intercept_ for b in m.base_models_]
94 | self.assertAlmostEqual(np.mean(coef_samples), 1, delta=0.3)
95 | l, r = central_credible_interval(coef_samples, alpha=0.05)
96 | self.assertLess(l, 1)
97 | self.assertGreater(r, 1)
98 | l, r = highest_density_interval(coef_samples, alpha=0.05)
99 | self.assertLess(l, 1)
100 | self.assertGreater(r, 1)
101 | self.assertAlmostEqual(np.mean(intercept_samples), 0, delta=0.3)
102 | l, r = central_credible_interval(intercept_samples, alpha=0.05)
103 | self.assertLess(l, 0)
104 | self.assertGreater(r, 0)
105 | self.assertAlmostEqual(np.mean(intercept_samples), 0, delta=0.3)
106 | l, r = highest_density_interval(intercept_samples, alpha=0.05)
107 | self.assertLess(l, 0)
108 | self.assertGreater(r, 0)
109 |
110 | def test_parameter_estimation_resampling(self):
111 | X = RNG.uniform(0, 4, 1000)
112 | y = X + RNG.normal(0, 1, 1000)
113 | m = BayesianBootstrapBagging(LinearRegression(), 10000, 1000, low_mem=False)
114 | m.fit(X.reshape(-1, 1), y)
115 | coef_samples = [b.coef_ for b in m.base_models_]
116 | intercept_samples = [b.intercept_ for b in m.base_models_]
117 | self.assertAlmostEqual(np.mean(coef_samples), 1, delta=0.3)
118 | l, r = central_credible_interval(coef_samples, alpha=0.05)
119 | self.assertLess(l, 1)
120 | self.assertGreater(r, 1)
121 | l, r = highest_density_interval(coef_samples, alpha=0.05)
122 | self.assertLess(l, 1)
123 | self.assertGreater(r, 1)
124 | self.assertAlmostEqual(np.mean(intercept_samples), 0, delta=0.3)
125 | l, r = central_credible_interval(intercept_samples, alpha=0.05)
126 | self.assertLess(l, 0)
127 | self.assertGreater(r, 0)
128 | self.assertAlmostEqual(np.mean(intercept_samples), 0, delta=0.3)
129 | l, r = highest_density_interval(intercept_samples, alpha=0.05)
130 | self.assertLess(l, 0)
131 | self.assertGreater(r, 0)
132 |
133 | def test_parameter_estimation_bayes(self):
134 | X = RNG.uniform(0, 4, 1000)
135 | y = X + RNG.normal(0, 1, 1000)
136 | m = BayesianBootstrapBagging(LinearRegression(), 10000, low_mem=False)
137 | m.fit(X.reshape(-1, 1), y)
138 | coef_samples = [b.coef_ for b in m.base_models_]
139 | intercept_samples = [b.intercept_ for b in m.base_models_]
140 | self.assertAlmostEqual(np.mean(coef_samples), 1, delta=0.3)
141 | l, r = central_credible_interval(coef_samples, alpha=0.05)
142 | self.assertLess(l, 1)
143 | self.assertGreater(r, 1)
144 | l, r = highest_density_interval(coef_samples, alpha=0.05)
145 | self.assertLess(l, 1)
146 | self.assertGreater(r, 1)
147 | self.assertAlmostEqual(np.mean(intercept_samples), 0, delta=0.3)
148 | l, r = central_credible_interval(intercept_samples, alpha=0.05)
149 | self.assertLess(l, 0)
150 | self.assertGreater(r, 0)
151 | self.assertAlmostEqual(np.mean(intercept_samples), 0, delta=0.3)
152 | l, r = highest_density_interval(intercept_samples, alpha=0.05)
153 | self.assertLess(l, 0)
154 | self.assertGreater(r, 0)
155 |
156 | def test_parameter_estimation_bayes_low_memory(self):
157 | X = RNG.uniform(0, 4, 1000)
158 | y = X + RNG.normal(0, 1, 1000)
159 | m = BayesianBootstrapBagging(LinearRegression(), 10000, low_mem=True)
160 | m.fit(X.reshape(-1, 1), y)
161 | coef_samples = [b.coef_ for b in m.base_models_]
162 | intercept_samples = [b.intercept_ for b in m.base_models_]
163 | self.assertAlmostEqual(np.mean(coef_samples), 1, delta=0.3)
164 | l, r = central_credible_interval(coef_samples, alpha=0.05)
165 | self.assertLess(l, 1)
166 | self.assertGreater(r, 1)
167 | l, r = highest_density_interval(coef_samples, alpha=0.05)
168 | self.assertLess(l, 1)
169 | self.assertGreater(r, 1)
170 | self.assertAlmostEqual(np.mean(intercept_samples), 0, delta=0.3)
171 | l, r = central_credible_interval(intercept_samples, alpha=0.05)
172 | self.assertLess(l, 0)
173 | self.assertGreater(r, 0)
174 | self.assertAlmostEqual(np.mean(intercept_samples), 0, delta=0.3)
175 | l, r = highest_density_interval(intercept_samples, alpha=0.05)
176 | self.assertLess(l, 0)
177 | self.assertGreater(r, 0)
178 |
179 |
180 | def test_pearsonr():
181 | x = np.linspace(0, 5, 10)
182 | y = np.linspace(0, 5, 10)
183 |     assert np.isclose(np.mean(bb.pearsonr(x, y, 10000)), 1)
184 |     assert np.isclose(np.mean(bb.pearsonr(x, -y, 10000)), -1)
185 |
186 | x = [0, 1, 3, 6]
187 | y = [1, 2, 5, 7]
188 | assert np.isclose(np.mean(bb.pearsonr(x, y, 10000)), scipy.stats.pearsonr(x, y)[0], atol=0.001)
189 |
190 | x = np.linspace(-10, 10, 10000)
191 | y = np.abs(x)
192 | assert np.isclose(scipy.stats.pearsonr(x, y)[0], np.mean(bb.pearsonr(x, y, 1000)), atol=0.001)
193 |
194 |
195 | if __name__ == "__main__":
196 | unittest.main()
197 |
--------------------------------------------------------------------------------
/docs/bootstrap_documentation.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | bayesian_bootstrap.bootstrap API documentation
7 |
8 |
9 |
10 |
11 |
551 |
552 |
853 |
854 |
921 |
922 |
1014 |
1015 |
1029 |
1030 |
1031 | Top
1032 |
1033 |
1034 |
1035 |
1036 |
1075 |
1076 |
1077 |
1078 |
1079 |
1080 |
1081 |
1082 |
1083 |
1084 | bayesian_bootstrap.bootstrap module
1085 |
1086 |
1087 | Show source ≡
1088 |
1089 |
import numpy as np
1090 | from copy import deepcopy
1091 |
1092 | def mean ( X , n_replications ):
1093 | """Simulate the posterior distribution of the mean.
1094 |
1095 | Parameter X: The observed data (array like)
1096 |
1097 | Parameter n_replications: The number of bootstrap replications to perform (positive integer)
1098 |
1099 | Returns: Samples from the posterior
1100 | """
1101 | samples = []
1102 | weights = np . random . dirichlet ([ 1 ] * len ( X ), n_replications )
1103 | for w in weights :
1104 | samples . append ( np . dot ( X , w ))
1105 | return samples
1106 |
1107 | def var ( X , n_replications ):
1108 | """Simulate the posterior distribution of the variance.
1109 |
1110 | Parameter X: The observed data (array like)
1111 |
1112 | Parameter n_replications: The number of bootstrap replications to perform (positive integer)
1113 |
1114 | Returns: Samples from the posterior
1115 | """
1116 | samples = []
1117 | weights = np . random . dirichlet ([ 1 ] * len ( X ), n_replications )
1118 | for w in weights :
1119 | samples . append ( np . dot ([ x ** 2 for x in X ], w ) - np . dot ( X , w ) ** 2 )
1120 | return samples
1121 |
1122 | def covar ( X , Y , n_replications ):
1123 | """Simulate the posterior distribution of the covariance.
1124 |
1125 | Parameter X: The observed data, first variable (array like)
1126 |
1127 | Parameter Y: The observed data, second (array like)
1128 |
1129 | Parameter n_replications: The number of bootstrap replications to perform (positive integer)
1130 |
1131 | Returns: Samples from the posterior
1132 | """
1133 | samples = []
1134 | weights = np . random . dirichlet ([ 1 ] * len ( X ), n_replications )
1135 | for w in weights :
1136 | X_mean = np . dot ( X , w )
1137 | Y_mean = np . dot ( Y , w )
1138 | samples . append ( np . dot ( w , ( X - X_mean ) * ( Y - Y_mean )))
1139 | return samples
1140 |
1141 | def bayesian_bootstrap ( X , statistic , n_replications , resample_size , low_mem = False ):
1142 | """Simulate the posterior distribution of the given statistic.
1143 |
1144 | Parameter X: The observed data (array like)
1145 |
1146 | Parameter statistic: A function of the data to use in simulation (Function mapping array-like to number)
1147 |
1148 | Parameter n_replications: The number of bootstrap replications to perform (positive integer)
1149 |
1150 | Parameter resample_size: The size of the dataset in each replication
1151 |
1152 | Parameter low_mem(bool): Generate the weights for each iteration lazily instead of in a single batch. Will use
1153 | less memory, but will run slower as a result.
1154 |
1155 | Returns: Samples from the posterior
1156 | """
1157 | if isinstance ( X , list ):
1158 | X = np . array ( X )
1159 | samples = []
1160 | if low_mem :
1161 | weights = ( np . random . dirichlet ([ 1 ] * len ( X )) for _ in range ( n_replications ))
1162 | else :
1163 | weights = np . random . dirichlet ([ 1 ] * len ( X ), n_replications )
1164 | for w in weights :
1165 | sample_index = np . random . choice ( range ( len ( X )), p = w , size = resample_size )
1166 | resample_X = X [ sample_index ]
1167 | s = statistic ( resample_X )
1168 | samples . append ( s )
1169 | return samples
1170 |
1171 | def bayesian_bootstrap_regression ( X , y , statistic , n_replications , resample_size , low_mem = False ):
1172 | """Simulate the posterior distribution of a statistic that uses dependent and independent variables.
1173 |
1174 | Parameter X: The observed data, independent variables (matrix like)
1175 |
1176 | Parameter y: The observed data, dependent variable (array like)
1177 |
1178 | Parameter statistic: A function of the data to use in simulation (Function mapping array-like to number)
1179 |
1180 | Parameter n_replications: The number of bootstrap replications to perform (positive integer)
1181 |
1182 | Parameter resample_size: The size of the dataset in each replication
1183 |
1184 | Parameter low_mem(bool): Use looping instead of generating all the dirichlet, use if program use too much memory
1185 |
1186 | Returns: Samples from the posterior
1187 | """
1188 | samples = []
1189 | X_arr = np . array ( X )
1190 | y_arr = np . array ( y )
1191 | if low_mem :
1192 | weights = ( np . random . dirichlet ([ 1 ] * len ( X )) for _ in range ( n_replications ))
1193 | else :
1194 | weights = np . random . dirichlet ([ 1 ] * len ( X ), n_replications )
1195 | for w in weights :
1196 | resample_i = np . random . choice ( range ( len ( X_arr )), p = w , size = resample_size )
1197 | resample_X = X_arr [ resample_i ]
1198 | resample_y = y_arr [ resample_i ]
1199 | s = statistic ( resample_X , resample_y )
1200 | samples . append ( s )
1201 |
1202 | return samples
1203 |
1204 | class BayesianBootstrapBagging ( object ):
1205 | """A bootstrap aggregating model using the bayesian bootstrap. Similar to scikit-learn's BaggingRegressor."""
1206 | def __init__ ( self , base_learner , n_replications , resample_size , low_mem = False ):
1207 | """Initialize the base learners of the ensemble.
1208 |
1209 | Parameter base_learner: A scikit-learn like estimator. This object should implement a fit() and predict()
1210 | method.
1211 |
1212 | Parameter n_replications: The number of bootstrap replications to perform (positive integer)
1213 |
1214 | Parameter resample_size: The size of the dataset in each replication
1215 |
1216 | Parameter low_mem(bool): Generate the weights for each iteration lazily instead of in a single batch. Will use
1217 | less memory, but will run slower as a result.
1218 | """
1219 | self . base_learner = base_learner
1220 | self . n_replications = n_replications
1221 | self . resample_size = resample_size
1222 | self . memo = low_mem
1223 |
1224 | def fit ( self , X , y ):
1225 | """Fit the base learners of the ensemble on a dataset.
1226 |
1227 | Parameter X: The observed data, independent variables (matrix like)
1228 |
1229 | Parameter y: The observed data, dependent variable (array like)
1230 |
1231 | Returns: Fitted model
1232 | """
1233 | self . base_models_ = bayesian_bootstrap_regression ( X ,
1234 | y ,
1235 | lambda X , y : deepcopy ( self . base_learner ) . fit ( X , y ),
1236 | self . n_replications ,
1237 | self . resample_size ,
1238 | low_mem = self . memo )
1239 | return self
1240 |
1241 | def predict ( self , X ):
1242 | """Make average predictions for a collection of observations.
1243 |
1244 | Parameter X: The observed data, independent variables (matrix like)
1245 |
1246 | Returns: The predicted dependent variable values (array like)
1247 | """
1248 | y_posterior_samples = self . predict_posterior_samples ( X )
1249 | return np . array ([ np . mean ( r ) for r in y_posterior_samples ])
1250 |
1251 | def predict_posterior_samples ( self , X ):
1252 | """Simulate posterior samples for a collection of observations.
1253 |
1254 | Parameter X: The observed data, independent variables (matrix like)
1255 |
1256 | Returns: The simulated posterior mean (matrix like)
1257 | """
1258 | # Return a X_r x self.n_replications matrix
1259 | y_posterior_samples = np . zeros (( len ( X ), self . n_replications ))
1260 | for i , m in enumerate ( self . base_models_ ):
1261 | y_posterior_samples [:, i ] = m . predict ( X )
1262 | return y_posterior_samples
1263 |
1264 | def predict_central_interval ( self , X , alpha = 0.05 ):
1265 | """The equal-tailed interval prediction containing a (1-alpha) fraction of the posterior samples.
1266 |
1267 | Parameter X: The observed data, independent variables (matrix like)
1268 |
1269 | Parameter alpha: The total size of the tails (Float between 0 and 1)
1270 |
1271 | Returns: Left and right interval bounds for each input (matrix like)
1272 | """
1273 | y_posterior_samples = self . predict_posterior_samples ( X )
1274 | return np . array ([ central_credible_interval ( r , alpha = alpha ) for r in y_posterior_samples ])
1275 |
1276 | def predict_highest_density_interval ( self , X , alpha = 0.05 ):
1277 | """The highest density interval prediction containing a (1-alpha) fraction of the posterior samples.
1278 |
1279 | Parameter X: The observed data, independent variables (matrix like)
1280 |
1281 | Parameter alpha: The total size of the tails (Float between 0 and 1)
1282 |
1283 | Returns: Left and right interval bounds for each input (matrix like):
1284 | """
1285 | y_posterior_samples = self . predict_posterior_samples ( X )
1286 | return np . array ([ highest_density_interval ( r , alpha = alpha ) for r in y_posterior_samples ])
1287 |
1288 | def central_credible_interval ( samples , alpha = 0.05 ):
1289 | """The equal-tailed interval containing a (1-alpha) fraction of the posterior samples.
1290 |
1291 | Parameter samples: The posterior samples (array like)
1292 |
1293 | Parameter alpha: The total size of the tails (Float between 0 and 1)
1294 |
1295 | Returns: Left and right interval bounds (tuple)
1296 | """
1297 | tail_size = int ( round ( len ( samples ) * ( alpha / 2 )))
1298 | samples_sorted = sorted ( samples )
1299 | return samples_sorted [ tail_size ], samples_sorted [ - tail_size - 1 ]
1300 |
1301 | def highest_density_interval ( samples , alpha = 0.05 ):
1302 | """The highest-density interval containing a (1-alpha) fraction of the posterior samples.
1303 |
1304 | Parameter samples: The posterior samples (array like)
1305 |
1306 | Parameter alpha: The total size of the tails (Float between 0 and 1)
1307 |
1308 | Returns: Left and right interval bounds (tuple)
1309 | """
1310 | samples_sorted = sorted ( samples )
1311 | window_size = int ( len ( samples ) - round ( len ( samples ) * alpha ))
1312 | smallest_window = ( None , None )
1313 | smallest_window_length = float ( 'inf' )
1314 | for i in range ( len ( samples_sorted ) - window_size ):
1315 | window = samples_sorted [ i + window_size - 1 ], samples_sorted [ i ]
1316 | window_length = samples_sorted [ i + window_size - 1 ] - samples_sorted [ i ]
1317 | if window_length < smallest_window_length :
1318 | smallest_window_length = window_length
1319 | smallest_window = window
1320 | return smallest_window [ 1 ], smallest_window [ 0 ]
1321 |
1322 | def _bootstrap_replicate ( X ):
1323 | random_points = [ 0 ] + sorted ( np . random . uniform ( 0 , 1 , len ( X ) - 1 )) + [ 1 ]
1324 | gaps = [ r - l for l , r in zip ( random_points [: - 1 ], random_points [ 1 :])]
1325 | return gaps

def bayesian_bootstrap(X, statistic, n_replications, resample_size, low_mem=False)

Simulate the posterior distribution of the given statistic.

Parameter X: The observed data (array like)

Parameter statistic: A function of the data to use in simulation (Function mapping array-like to number)

Parameter n_replications: The number of bootstrap replications to perform (positive integer)

Parameter resample_size: The size of the dataset in each replication

Parameter low_mem (bool): Generate the weights for each iteration lazily instead of in a single batch. Will use less memory, but will run slower as a result.

Returns: Samples from the posterior

Source:
1355 | def bayesian_bootstrap(X, statistic, n_replications, resample_size, low_mem=False):
1356 |     """Simulate the posterior distribution of the given statistic.
1357 |
1358 |     Parameter X: The observed data (array like)
1359 |
1360 |     Parameter statistic: A function of the data to use in simulation (Function mapping array-like to number)
1361 |
1362 |     Parameter n_replications: The number of bootstrap replications to perform (positive integer)
1363 |
1364 |     Parameter resample_size: The size of the dataset in each replication
1365 |
1366 |     Parameter low_mem (bool): Generate the weights for each iteration lazily instead of in a single batch. Will use
1367 |     less memory, but will run slower as a result.
1368 |
1369 |     Returns: Samples from the posterior
1370 |     """
1371 |     if isinstance(X, list):
1372 |         X = np.array(X)
1373 |     samples = []
1374 |     if low_mem:
1375 |         weights = (np.random.dirichlet([1] * len(X)) for _ in range(n_replications))
1376 |     else:
1377 |         weights = np.random.dirichlet([1] * len(X), n_replications)
1378 |     for w in weights:
1379 |         sample_index = np.random.choice(range(len(X)), p=w, size=resample_size)
1380 |         resample_X = X[sample_index]
1381 |         s = statistic(resample_X)
1382 |         samples.append(s)
1383 |     return samples
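
A minimal usage sketch (the import path assumes the function is exposed at the package top level, and the data are illustrative):

import numpy as np
from bayesian_bootstrap import bayesian_bootstrap

X = np.random.exponential(scale=1.0, size=1000)  # illustrative data
# Posterior over the mean: 10,000 Dirichlet-weighted resamples of size 1,000.
posterior_means = bayesian_bootstrap(X, np.mean, n_replications=10000, resample_size=1000)
print(np.mean(posterior_means))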

def bayesian_bootstrap_regression(X, y, statistic, n_replications, resample_size, low_mem=False)

Simulate the posterior distribution of a statistic that uses dependent and independent variables.

Parameter X: The observed data, independent variables (matrix like)

Parameter y: The observed data, dependent variable (array like)

Parameter statistic: A function of the data to use in simulation (Function mapping array-like to number)

Parameter n_replications: The number of bootstrap replications to perform (positive integer)

Parameter resample_size: The size of the dataset in each replication

Parameter low_mem (bool): Generate the Dirichlet weights lazily in a loop; use this if the program uses too much memory.

Returns: Samples from the posterior

Source:
1411 | def bayesian_bootstrap_regression(X, y, statistic, n_replications, resample_size, low_mem=False):
1412 |     """Simulate the posterior distribution of a statistic that uses dependent and independent variables.
1413 |
1414 |     Parameter X: The observed data, independent variables (matrix like)
1415 |
1416 |     Parameter y: The observed data, dependent variable (array like)
1417 |
1418 |     Parameter statistic: A function of the data to use in simulation (Function mapping array-like to number)
1419 |
1420 |     Parameter n_replications: The number of bootstrap replications to perform (positive integer)
1421 |
1422 |     Parameter resample_size: The size of the dataset in each replication
1423 |
1424 |     Parameter low_mem (bool): Generate the Dirichlet weights lazily in a loop; use this if the program uses too much memory.
1425 |
1426 |     Returns: Samples from the posterior
1427 |     """
1428 |     samples = []
1429 |     X_arr = np.array(X)
1430 |     y_arr = np.array(y)
1431 |     if low_mem:
1432 |         weights = (np.random.dirichlet([1] * len(X)) for _ in range(n_replications))
1433 |     else:
1434 |         weights = np.random.dirichlet([1] * len(X), n_replications)
1435 |     for w in weights:
1436 |         resample_i = np.random.choice(range(len(X_arr)), p=w, size=resample_size)
1437 |         resample_X = X_arr[resample_i]
1438 |         resample_y = y_arr[resample_i]
1439 |         s = statistic(resample_X, resample_y)
1440 |         samples.append(s)
1441 |
1442 |     return samples
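
A usage sketch; the least-squares slope statistic is an illustrative choice, and the import path assumes the function is exposed at the package top level:

import numpy as np
from bayesian_bootstrap import bayesian_bootstrap_regression

X = np.random.uniform(0, 10, size=(200, 1))
y = 2.0 * X[:, 0] + np.random.normal(0, 1, 200)

# Posterior over the slope; the statistic receives the resampled X and y.
slope = lambda X_r, y_r: np.linalg.lstsq(X_r, y_r, rcond=None)[0][0]
posterior_slopes = bayesian_bootstrap_regression(X, y, slope, n_replications=2000, resample_size=200)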

def central_credible_interval(samples, alpha=0.05)

The equal-tailed interval containing a (1-alpha) fraction of the posterior samples.

Parameter samples: The posterior samples (array like)

Parameter alpha: The total size of the tails (Float between 0 and 1)

Returns: Left and right interval bounds (tuple)
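
A short usage sketch (assuming mean and central_credible_interval are importable from the package top level; the data are illustrative):

from bayesian_bootstrap import mean, central_credible_interval

posterior = mean([1.2, 0.7, 3.1, 2.4, 1.9], n_replications=10000)
low, high = central_credible_interval(posterior, alpha=0.05)  # 95% equal-tailed interval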

def covar(X, Y, n_replications)

Simulate the posterior distribution of the covariance.

Parameter X: The observed data, first variable (array like)

Parameter Y: The observed data, second variable (array like)

Parameter n_replications: The number of bootstrap replications to perform (positive integer)

Returns: Samples from the posterior

Source:
1502 | def covar(X, Y, n_replications):
1503 |     """Simulate the posterior distribution of the covariance.
1504 |
1505 |     Parameter X: The observed data, first variable (array like)
1506 |
1507 |     Parameter Y: The observed data, second variable (array like)
1508 |
1509 |     Parameter n_replications: The number of bootstrap replications to perform (positive integer)
1510 |
1511 |     Returns: Samples from the posterior
1512 |     """
1513 |     samples = []
1514 |     weights = np.random.dirichlet([1] * len(X), n_replications)
1515 |     for w in weights:
1516 |         X_mean = np.dot(X, w)
1517 |         Y_mean = np.dot(Y, w)
1518 |         samples.append(np.dot(w, (X - X_mean) * (Y - Y_mean)))
1519 |     return samples
1520 |
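A usage sketch (import path assumed; the correlated pair below is illustrative):

import numpy as np
from bayesian_bootstrap import covar

X = np.random.normal(0, 1, 500)
Y = 0.5 * X + np.random.normal(0, 1, 500)
posterior_cov = covar(X, Y, n_replications=5000)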

def highest_density_interval(samples, alpha=0.05)

The highest-density interval containing a (1-alpha) fraction of the posterior samples.

Parameter samples: The posterior samples (array like)

Parameter alpha: The total size of the tails (Float between 0 and 1)

Returns: Left and right interval bounds (tuple)
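
A sketch contrasting it with the equal-tailed interval on a skewed posterior (imports assumed at the package top level):

import numpy as np
from bayesian_bootstrap import mean, central_credible_interval, highest_density_interval

posterior = mean(np.random.exponential(1.0, 50), n_replications=10000)  # skewed posterior
print(central_credible_interval(posterior, alpha=0.05))
print(highest_density_interval(posterior, alpha=0.05))  # typically narrower when skewed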

def mean(X, n_replications)

Simulate the posterior distribution of the mean.

Parameter X: The observed data (array like)

Parameter n_replications: The number of bootstrap replications to perform (positive integer)

Returns: Samples from the posterior

Source:
1586 | def mean(X, n_replications):
1587 |     """Simulate the posterior distribution of the mean.
1588 |
1589 |     Parameter X: The observed data (array like)
1590 |
1591 |     Parameter n_replications: The number of bootstrap replications to perform (positive integer)
1592 |
1593 |     Returns: Samples from the posterior
1594 |     """
1595 |     samples = []
1596 |     weights = np.random.dirichlet([1] * len(X), n_replications)
1597 |     for w in weights:
1598 |         samples.append(np.dot(X, w))
1599 |     return samples
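
Each returned sample is a Dirichlet-weighted average of the data, so a single draw can be reproduced by hand; a sketch using only numpy:

import numpy as np

X = np.array([1.0, 2.0, 3.0])
w = np.random.dirichlet([1] * len(X))  # one flat-Dirichlet weight vector
draw = np.dot(X, w)                    # one posterior sample of the mean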
1600 |

def var(X, n_replications)

Simulate the posterior distribution of the variance.

Parameter X: The observed data (array like)

Parameter n_replications: The number of bootstrap replications to perform (positive integer)

Returns: Samples from the posterior

Source:
1623 | def var(X, n_replications):
1624 |     """Simulate the posterior distribution of the variance.
1625 |
1626 |     Parameter X: The observed data (array like)
1627 |
1628 |     Parameter n_replications: The number of bootstrap replications to perform (positive integer)
1629 |
1630 |     Returns: Samples from the posterior
1631 |     """
1632 |     samples = []
1633 |     weights = np.random.dirichlet([1] * len(X), n_replications)
1634 |     for w in weights:
1635 |         samples.append(np.dot([x ** 2 for x in X], w) - np.dot(X, w) ** 2)
1636 |     return samples
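
Each draw applies the weighted-moment identity Var(X) = E[X^2] - (E[X])^2 under one Dirichlet weight vector; a sketch using only numpy:

import numpy as np

X = np.array([1.0, 2.0, 4.0])
w = np.random.dirichlet([1] * len(X))
draw = np.dot(X ** 2, w) - np.dot(X, w) ** 2  # weighted E[X^2] minus squared weighted mean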
1637 |

class BayesianBootstrapBagging

A bootstrap aggregating model using the bayesian bootstrap. Similar to scikit-learn's BaggingRegressor.

Source:
1655 | class BayesianBootstrapBagging(object):
1656 |     """A bootstrap aggregating model using the bayesian bootstrap. Similar to scikit-learn's BaggingRegressor."""
1657 |     def __init__(self, base_learner, n_replications, resample_size, low_mem=False):
1658 |         """Initialize the base learners of the ensemble.
1659 |
1660 |         Parameter base_learner: A scikit-learn like estimator. This object should implement a fit() and predict()
1661 |         method.
1662 |
1663 |         Parameter n_replications: The number of bootstrap replications to perform (positive integer)
1664 |
1665 |         Parameter resample_size: The size of the dataset in each replication
1666 |
1667 |         Parameter low_mem (bool): Generate the weights for each iteration lazily instead of in a single batch. Will use
1668 |         less memory, but will run slower as a result.
1669 |         """
1670 |         self.base_learner = base_learner
1671 |         self.n_replications = n_replications
1672 |         self.resample_size = resample_size
1673 |         self.memo = low_mem
1674 |
1675 |     def fit(self, X, y):
1676 |         """Fit the base learners of the ensemble on a dataset.
1677 |
1678 |         Parameter X: The observed data, independent variables (matrix like)
1679 |
1680 |         Parameter y: The observed data, dependent variable (array like)
1681 |
1682 |         Returns: Fitted model
1683 |         """
1684 |         self.base_models_ = bayesian_bootstrap_regression(X,
1685 |                                                           y,
1686 |                                                           lambda X, y: deepcopy(self.base_learner).fit(X, y),
1687 |                                                           self.n_replications,
1688 |                                                           self.resample_size,
1689 |                                                           low_mem=self.memo)
1690 |         return self
1691 |
1692 |     def predict(self, X):
1693 |         """Make average predictions for a collection of observations.
1694 |
1695 |         Parameter X: The observed data, independent variables (matrix like)
1696 |
1697 |         Returns: The predicted dependent variable values (array like)
1698 |         """
1699 |         y_posterior_samples = self.predict_posterior_samples(X)
1700 |         return np.array([np.mean(r) for r in y_posterior_samples])
1701 |
1702 |     def predict_posterior_samples(self, X):
1703 |         """Simulate posterior samples for a collection of observations.
1704 |
1705 |         Parameter X: The observed data, independent variables (matrix like)
1706 |
1707 |         Returns: The simulated posterior samples (matrix like)
1708 |         """
1709 |         # Return a len(X) x self.n_replications matrix
1710 |         y_posterior_samples = np.zeros((len(X), self.n_replications))
1711 |         for i, m in enumerate(self.base_models_):
1712 |             y_posterior_samples[:, i] = m.predict(X)
1713 |         return y_posterior_samples
1714 |
1715 |     def predict_central_interval(self, X, alpha=0.05):
1716 |         """The equal-tailed interval prediction containing a (1-alpha) fraction of the posterior samples.
1717 |
1718 |         Parameter X: The observed data, independent variables (matrix like)
1719 |
1720 |         Parameter alpha: The total size of the tails (Float between 0 and 1)
1721 |
1722 |         Returns: Left and right interval bounds for each input (matrix like)
1723 |         """
1724 |         y_posterior_samples = self.predict_posterior_samples(X)
1725 |         return np.array([central_credible_interval(r, alpha=alpha) for r in y_posterior_samples])
1726 |
1727 |     def predict_highest_density_interval(self, X, alpha=0.05):
1728 |         """The highest density interval prediction containing a (1-alpha) fraction of the posterior samples.
1729 |
1730 |         Parameter X: The observed data, independent variables (matrix like)
1731 |
1732 |         Parameter alpha: The total size of the tails (Float between 0 and 1)
1733 |
1734 |         Returns: Left and right interval bounds for each input (matrix like)
1735 |         """
1736 |         y_posterior_samples = self.predict_posterior_samples(X)
1737 |         return np.array([highest_density_interval(r, alpha=alpha) for r in y_posterior_samples])
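
A usage sketch with a scikit-learn base learner (LinearRegression and the synthetic data are illustrative choices, not part of the library):

import numpy as np
from sklearn.linear_model import LinearRegression
from bayesian_bootstrap import BayesianBootstrapBagging

X = np.random.uniform(0, 10, size=(100, 1))
y = 3.0 * X[:, 0] + np.random.normal(0, 1, 100)

model = BayesianBootstrapBagging(LinearRegression(), n_replications=1000, resample_size=100)
model.fit(X, y)
y_pred = model.predict(X)                               # posterior-mean predictions
bounds = model.predict_central_interval(X, alpha=0.05)  # 95% bounds per observation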

Ancestors (in MRO)

BayesianBootstrapBagging
builtins.object

Methods

def __init__(self, base_learner, n_replications, resample_size, low_mem=False)

Initialize the base learners of the ensemble.

Parameter base_learner: A scikit-learn like estimator. This object should implement a fit() and predict() method.

Parameter n_replications: The number of bootstrap replications to perform (positive integer)

Parameter resample_size: The size of the dataset in each replication

Parameter low_mem (bool): Generate the weights for each iteration lazily instead of in a single batch. Will use less memory, but will run slower as a result.

def fit(self, X, y)

Fit the base learners of the ensemble on a dataset.

Parameter X: The observed data, independent variables (matrix like)

Parameter y: The observed data, dependent variable (array like)

Returns: Fitted model

def predict(self, X)

Make average predictions for a collection of observations.

Parameter X: The observed data, independent variables (matrix like)

Returns: The predicted dependent variable values (array like)

def predict_central_interval(self, X, alpha=0.05)

The equal-tailed interval prediction containing a (1-alpha) fraction of the posterior samples.

Parameter X: The observed data, independent variables (matrix like)

Parameter alpha: The total size of the tails (Float between 0 and 1)

Returns: Left and right interval bounds for each input (matrix like)

def predict_highest_density_interval(self, X, alpha=0.05)

The highest density interval prediction containing a (1-alpha) fraction of the posterior samples.

Parameter X: The observed data, independent variables (matrix like)

Parameter alpha: The total size of the tails (Float between 0 and 1)

Returns: Left and right interval bounds for each input (matrix like)

def predict_posterior_samples(self, X)

Simulate posterior samples for a collection of observations.

Parameter X: The observed data, independent variables (matrix like)

Returns: The simulated posterior samples, one column per replication (matrix like)

Instance variables

var base_learner

var memo

var n_replications

var resample_size