├── .coveragerc ├── .github └── workflows │ ├── docbuild.yml │ └── pythonpackage.yml ├── .gitignore ├── .nojekyll ├── LICENSE ├── MANIFEST.in ├── README.md ├── docs ├── Makefile ├── _static │ ├── css │ │ └── project-template.css │ ├── eScience_Logo_HR.png │ └── js │ │ └── copybutton.js ├── _templates │ ├── class.rst │ ├── function.rst │ └── numpydoc_docstring.py ├── api.rst ├── conf.py ├── contributing.rst ├── index.rst └── installation_guide.rst ├── examples ├── README.txt ├── plot_mpg.py ├── plot_mpg_svr.py └── plot_spam.py ├── forestci ├── __init__.py ├── calibration.py ├── due.py ├── forestci.py ├── tests │ ├── __init__.py │ └── test_forestci.py └── version.py ├── paper ├── paper.bib ├── paper.html ├── paper.md ├── plot_mpg.png ├── plot_mpg_no_variance.png ├── plot_spam.png └── plot_spam_no_variance.png ├── requirements-dev.txt ├── requirements.txt ├── setup.cfg └── setup.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | show_missing = True 3 | -------------------------------------------------------------------------------- /.github/workflows/docbuild.yml: -------------------------------------------------------------------------------- 1 | name: Documentation build 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | max-parallel: 4 11 | matrix: 12 | python-version: [3.9] 13 | 14 | steps: 15 | - name: Checkout repo 16 | uses: actions/checkout@v4 17 | - name: Set up Python ${{ matrix.python-version }} 18 | uses: actions/setup-python@v5 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install -r requirements.txt 25 | pip install -r requirements-dev.txt 26 | pip install . 
27 | - name: Build docs 28 | run: | 29 | cd docs 30 | make html 31 | - name: Upload docs 32 | uses: actions/upload-artifact@v4 33 | with: 34 | name: docs 35 | path: docs/_build/html 36 | - name: Publish docs to Github Pages 37 | if: startsWith(github.event.ref, 'refs/tags') 38 | uses: JamesIves/github-pages-deploy-action@releases/v4 39 | with: 40 | token: ${{ secrets.GITHUB_TOKEN }} 41 | branch: gh-pages # The branch the action should deploy to. 42 | folder: 'docs/_build/html' # The folder the action should deploy. 43 | -------------------------------------------------------------------------------- /.github/workflows/pythonpackage.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | max-parallel: 4 11 | matrix: 12 | python-version: [3.9] 13 | 14 | steps: 15 | - uses: actions/checkout@v4 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | - name: Install dependencies 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install -r requirements.txt 24 | pip install -r requirements-dev.txt 25 | pip install . 
26 | - name: Test with pytest 27 | run: | 28 | pytest forestci --doctest-modules 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # scikit-learn specific 10 | doc/_build/ 11 | doc/auto_examples/ 12 | doc/modules/generated/ 13 | doc/datasets/generated/ 14 | 15 | # Distribution / packaging 16 | 17 | .Python 18 | env/ 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *,cover 53 | .hypothesis/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | 62 | # Sphinx documentation 63 | docs/_build/ 64 | 65 | # PyBuilder 66 | target/ 67 | 68 | # Mac files 69 | .DS_Store 70 | 71 | # Paper files 72 | paper/references/* 73 | paper/*.txt 74 | 75 | # Jupyter notebooks 76 | .ipynb_checkpoints 77 | 78 | # pyenv environment 79 | .python-version -------------------------------------------------------------------------------- /.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/forest-confidence-interval/5241344a342810b3ea21413bf2ebcf4e90ec6976/.nojekyll -------------------------------------------------------------------------------- 
/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2016, Ariel Rokem, Bryna Hazelton, Kivan Polimis (The University of Washington eScience Institute) 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
8 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # `forestci`: confidence intervals for Forest algorithms 2 | 3 | [![Travis Status](https://travis-ci.org/scikit-learn-contrib/forest-confidence-interval.svg?branch=master)](https://travis-ci.org/scikit-learn-contrib/forest-confidence-interval) 4 | [![Coveralls Status](https://coveralls.io/repos/scikit-learn-contrib/forest-confidence-interval/badge.svg?branch=master&service=github)](https://coveralls.io/r/scikit-learn-contrib/forest-confidence-interval) 5 | [![CircleCI Status](https://circleci.com/gh/scikit-learn-contrib/forest-confidence-interval.svg?style=shield&circle-token=:circle-token)](https://circleci.com/gh/scikit-learn-contrib/forest-confidence-interval/tree/master) 6 | [![status](http://joss.theoj.org/papers/b40f03cc069b43b341a92bd26b660f35/status.svg)](http://joss.theoj.org/papers/b40f03cc069b43b341a92bd26b660f35) 7 | 8 | Forest algorithms are powerful [ensemble methods](http://scikit-learn.org/stable/modules/classes.html#module-sklearn.ensemble) for classification and regression. 9 | However, predictions from these algorithms do contain some amount of error. 10 | Prediction variability can illustrate how influential 11 | the training set is for producing the observed random forest predictions. 12 | 13 | `forest-confidence-interval` is a Python module that adds a calculation of 14 | variance and computes confidence intervals to the basic functionality 15 | implemented in scikit-learn random forest regression or classification objects. 16 | The core functions calculate an in-bag and error bars for random forest 17 | objects. 
18 | 19 | This module is based on R code from Stefan Wager 20 | ([`randomForestCI`](https://github.com/swager/randomForestCI) deprecated in favor of [`grf`](https://github.com/swager/grf)) 21 | and is licensed under the MIT open source license (see [LICENSE](LICENSE)). 22 | The present project makes the algorithm compatible with [`scikit-learn`](https://scikit-learn.org/stable/). 23 | 24 | To get the proper confidence interval, you need to use a large number of trees (estimators). 25 | The [calibration routine](https://github.com/scikit-learn-contrib/forest-confidence-interval/pull/114) 26 | (which can be included or excluded on top of the algorithm) tries to extrapolate 27 | the results for an infinite number of trees, but it is unstable and it can cause numerical errors: 28 | if this is the case, the suggestion is to exclude it with `calibrate=False` 29 | and test increasing the number of trees in the model to reach convergence. 30 | 31 | ## Installation and Usage 32 | 33 | Before installing the module you will need `numpy`, `scipy` and `scikit-learn`. 34 | 35 | To install `forest-confidence-interval` execute: 36 | ``` 37 | pip install forestci 38 | ``` 39 | If you would like to install the development version of the software use: 40 | 41 | ```shell 42 | pip install git+https://github.com/scikit-learn-contrib/forest-confidence-interval.git 43 | ``` 44 | 45 | Usage: 46 | 47 | ```python 48 | import forestci as fci 49 | ci = fci.random_forest_error( 50 | forest=model, # scikit-learn Forest model fitted on X_train 51 | X_train_shape=X_train.shape, 52 | X_test=X, # the samples you want to compute the CI 53 | inbag=None, 54 | calibrate=True, 55 | memory_constrained=False, 56 | memory_limit=None, 57 | y_output=0 # in case of multioutput model, consider target 0 58 | ) 59 | ``` 60 | 61 | ## Examples 62 | 63 | The examples (gallery below) demonstrate the package functionality with random forest classifiers and regression models. 
64 | The regression example uses a popular UCI Machine Learning data set on cars while the classifier example simulates how to add measurements of uncertainty to tasks like predicting spam emails. 65 | 66 | [Examples gallery](http://contrib.scikit-learn.org/forest-confidence-interval/auto_examples/index.html) 67 | 68 | ## Contributing 69 | 70 | Contributions are very welcome, but we ask that contributors abide by the 71 | [contributor covenant](http://contributor-covenant.org/version/1/4/). 72 | 73 | To report issues with the software, please post to the 74 | [issue log](https://github.com/scikit-learn-contrib/forest-confidence-interval/issues). 75 | Bug reports are also appreciated; please add them to the issue log after 76 | verifying that the issue does not already exist. 77 | Comments on existing issues are also welcome. 78 | 79 | Please submit improvements as pull requests against the repo after verifying 80 | that the existing tests pass and any new code is well covered by unit tests. 81 | Please write code that complies with the Python style guide, 82 | [PEP8](https://www.python.org/dev/peps/pep-0008/). 83 | 84 | E-mail [Ariel Rokem](mailto:arokem@gmail.com), [Kivan Polimis](mailto:kivan.polimis@gmail.com), or [Bryna Hazelton](mailto:brynah@phys.washington.edu ) if you have any questions, suggestions or feedback. 85 | 86 | ## Testing 87 | 88 | Requires installation of the `pytest` package. 89 | 90 | Tests are located in the `forestci/tests` folder and can be run with this command in the root directory: 91 | 92 | ```shell 93 | pytest forestci --doctest-modules 94 | ``` 95 | 96 | ## Citation 97 | 98 | Click on the JOSS status badge for the Journal of Open Source Software article on this project. 
99 | The BibTeX citation for the JOSS article is below: 100 | 101 | ``` 102 | @article{polimisconfidence, 103 | title={Confidence Intervals for Random Forests in Python}, 104 | author={Polimis, Kivan and Rokem, Ariel and Hazelton, Bryna}, 105 | journal={Journal of Open Source Software}, 106 | volume={2}, 107 | number={1}, 108 | year={2017} 109 | } 110 | ``` 111 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | -rm -rf $(BUILDDIR)/* 51 | -rm -rf auto_examples/ 52 | -rm -rf generated/* 53 | -rm -rf modules/generated/* 54 | 55 | html: 56 | # These two lines make the build a bit more lengthy, and the 57 | # the embedding of images more robust 58 | rm -rf $(BUILDDIR)/html/_images 59 | #rm -rf _build/doctrees/ 60 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 61 | @echo 62 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 
63 | 64 | dirhtml: 65 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 66 | @echo 67 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 68 | 69 | singlehtml: 70 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 71 | @echo 72 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 73 | 74 | pickle: 75 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 76 | @echo 77 | @echo "Build finished; now you can process the pickle files." 78 | 79 | json: 80 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 81 | @echo 82 | @echo "Build finished; now you can process the JSON files." 83 | 84 | htmlhelp: 85 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 86 | @echo 87 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 88 | ".hhp project file in $(BUILDDIR)/htmlhelp." 89 | 90 | qthelp: 91 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 92 | @echo 93 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 94 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 95 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/fracridge.qhcp" 96 | @echo "To view the help file:" 97 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/fracridge.qhc" 98 | 99 | devhelp: 100 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 101 | @echo 102 | @echo "Build finished." 103 | @echo "To view the help file:" 104 | @echo "# mkdir -p $$HOME/.local/share/devhelp/fracridge" 105 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/fracridge" 106 | @echo "# devhelp" 107 | 108 | epub: 109 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 110 | @echo 111 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 112 | 113 | latex: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo 116 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 
117 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 118 | "(use \`make latexpdf' here to do that automatically)." 119 | 120 | latexpdf: 121 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 122 | @echo "Running LaTeX files through pdflatex..." 123 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 124 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 125 | 126 | latexpdfja: 127 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 128 | @echo "Running LaTeX files through platex and dvipdfmx..." 129 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 130 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 131 | 132 | text: 133 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 134 | @echo 135 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 136 | 137 | man: 138 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 139 | @echo 140 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 141 | 142 | texinfo: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo 145 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 146 | @echo "Run \`make' in that directory to run these through makeinfo" \ 147 | "(use \`make info' here to do that automatically)." 148 | 149 | info: 150 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 151 | @echo "Running Texinfo files through makeinfo..." 152 | make -C $(BUILDDIR)/texinfo info 153 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 154 | 155 | gettext: 156 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 157 | @echo 158 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 159 | 160 | changes: 161 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 162 | @echo 163 | @echo "The overview file is in $(BUILDDIR)/changes." 
164 | 165 | linkcheck: 166 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 167 | @echo 168 | @echo "Link check complete; look for any errors in the above output " \ 169 | "or in $(BUILDDIR)/linkcheck/output.txt." 170 | 171 | doctest: 172 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 173 | @echo "Testing of doctests in the sources finished, look at the " \ 174 | "results in $(BUILDDIR)/doctest/output.txt." 175 | 176 | xml: 177 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 178 | @echo 179 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 180 | 181 | pseudoxml: 182 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 183 | @echo 184 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 185 | -------------------------------------------------------------------------------- /docs/_static/css/project-template.css: -------------------------------------------------------------------------------- 1 | @import url("theme.css"); 2 | 3 | .highlight a { 4 | text-decoration: underline; 5 | } 6 | 7 | .deprecated p { 8 | padding: 10px 7px 10px 10px; 9 | color: #b94a48; 10 | background-color: #F3E5E5; 11 | border: 1px solid #eed3d7; 12 | } 13 | 14 | .deprecated p span.versionmodified { 15 | font-weight: bold; 16 | } 17 | -------------------------------------------------------------------------------- /docs/_static/eScience_Logo_HR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/forest-confidence-interval/5241344a342810b3ea21413bf2ebcf4e90ec6976/docs/_static/eScience_Logo_HR.png -------------------------------------------------------------------------------- /docs/_static/js/copybutton.js: -------------------------------------------------------------------------------- 1 | $(document).ready(function() { 2 | /* Add a [>>>] button on the top-right corner of code samples to hide 3 | * the >>> 
and ... prompts and the output and thus make the code 4 | * copyable. */ 5 | var div = $('.highlight-python .highlight,' + 6 | '.highlight-python3 .highlight,' + 7 | '.highlight-pycon .highlight,' + 8 | '.highlight-default .highlight') 9 | var pre = div.find('pre'); 10 | 11 | // get the styles from the current theme 12 | pre.parent().parent().css('position', 'relative'); 13 | var hide_text = 'Hide the prompts and output'; 14 | var show_text = 'Show the prompts and output'; 15 | var border_width = pre.css('border-top-width'); 16 | var border_style = pre.css('border-top-style'); 17 | var border_color = pre.css('border-top-color'); 18 | var button_styles = { 19 | 'cursor':'pointer', 'position': 'absolute', 'top': '0', 'right': '0', 20 | 'border-color': border_color, 'border-style': border_style, 21 | 'border-width': border_width, 'color': border_color, 'text-size': '75%', 22 | 'font-family': 'monospace', 'padding-left': '0.2em', 'padding-right': '0.2em', 23 | 'border-radius': '0 3px 0 0' 24 | } 25 | 26 | // create and add the button to all the code blocks that contain >>> 27 | div.each(function(index) { 28 | var jthis = $(this); 29 | if (jthis.find('.gp').length > 0) { 30 | var button = $('>>>'); 31 | button.css(button_styles) 32 | button.attr('title', hide_text); 33 | button.data('hidden', 'false'); 34 | jthis.prepend(button); 35 | } 36 | // tracebacks (.gt) contain bare text elements that need to be 37 | // wrapped in a span to work with .nextUntil() (see later) 38 | jthis.find('pre:has(.gt)').contents().filter(function() { 39 | return ((this.nodeType == 3) && (this.data.trim().length > 0)); 40 | }).wrap(''); 41 | }); 42 | 43 | // define the behavior of the button when it's clicked 44 | $('.copybutton').click(function(e){ 45 | e.preventDefault(); 46 | var button = $(this); 47 | if (button.data('hidden') === 'false') { 48 | // hide the code output 49 | button.parent().find('.go, .gp, .gt').hide(); 50 | button.next('pre').find('.gt').nextUntil('.gp, 
.go').css('visibility', 'hidden'); 51 | button.css('text-decoration', 'line-through'); 52 | button.attr('title', show_text); 53 | button.data('hidden', 'true'); 54 | } else { 55 | // show the code output 56 | button.parent().find('.go, .gp, .gt').show(); 57 | button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'visible'); 58 | button.css('text-decoration', 'none'); 59 | button.attr('title', hide_text); 60 | button.data('hidden', 'false'); 61 | } 62 | }); 63 | }); 64 | -------------------------------------------------------------------------------- /docs/_templates/class.rst: -------------------------------------------------------------------------------- 1 | :mod:`{{module}}`.{{objname}} 2 | {{ underline }}============== 3 | 4 | .. currentmodule:: {{ module }} 5 | 6 | .. autoclass:: {{ objname }} 7 | 8 | {% block methods %} 9 | .. automethod:: __init__ 10 | {% endblock %} 11 | 12 | .. include:: {{module}}.{{objname}}.examples 13 | 14 | .. raw:: html 15 | 16 |
17 | -------------------------------------------------------------------------------- /docs/_templates/function.rst: -------------------------------------------------------------------------------- 1 | :mod:`{{module}}`.{{objname}} 2 | {{ underline }}==================== 3 | 4 | .. currentmodule:: {{ module }} 5 | 6 | .. autofunction:: {{ objname }} 7 | 8 | .. include:: {{module}}.{{objname}}.examples 9 | 10 | .. raw:: html 11 | 12 |
13 | -------------------------------------------------------------------------------- /docs/_templates/numpydoc_docstring.py: -------------------------------------------------------------------------------- 1 | {{index}} 2 | {{summary}} 3 | {{extended_summary}} 4 | {{parameters}} 5 | {{returns}} 6 | {{yields}} 7 | {{other_parameters}} 8 | {{attributes}} 9 | {{raises}} 10 | {{warns}} 11 | {{warnings}} 12 | {{see_also}} 13 | {{notes}} 14 | {{references}} 15 | {{examples}} 16 | {{methods}} 17 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | #################### 2 | ``forestci`` API 3 | #################### 4 | 5 | 6 | .. autosummary:: 7 | :toctree: generated/ 8 | :template: function.rst 9 | 10 | forestci.random_forest_error 11 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # forestci sphinx configuration file, based on the sklearn 3 | # project-template documentation build configuration file, created by 4 | # sphinx-quickstart on Mon Jan 18 14:44:12 2016. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | 18 | import sphinx_gallery 19 | import sphinx_rtd_theme 20 | 21 | # If extensions (or modules to document with autodoc) are in another directory, 22 | # add these directories to sys.path here. If the directory is relative to the 23 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
24 | #sys.path.insert(0, os.path.abspath('.')) 25 | 26 | # -- General configuration ------------------------------------------------ 27 | 28 | # If your documentation needs a minimal Sphinx version, state it here. 29 | #needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = [ 35 | 'sphinx.ext.autodoc', 36 | 'sphinx.ext.autosummary', 37 | 'sphinx.ext.doctest', 38 | 'sphinx.ext.intersphinx', 39 | 'sphinx.ext.viewcode', 40 | 'numpydoc', 41 | 'sphinx_gallery.gen_gallery', 42 | 'sphinx.ext.githubpages', 43 | ] 44 | 45 | 46 | # this is needed for some reason... 47 | # see https://github.com/numpy/numpydoc/issues/69 48 | numpydoc_show_class_members = False 49 | 50 | # pngmath / imgmath compatibility layer for different sphinx versions 51 | import sphinx 52 | from distutils.version import LooseVersion 53 | if LooseVersion(sphinx.__version__) < LooseVersion('1.4'): 54 | extensions.append('sphinx.ext.pngmath') 55 | else: 56 | extensions.append('sphinx.ext.imgmath') 57 | 58 | # autodoc_default_flags = ['members', 'inherited-members'] 59 | 60 | # autodoc_default_options = True 61 | 62 | autodoc_default_options = { 63 | 'members': True, 64 | 'inherited-members': True} 65 | 66 | # Add any paths that contain templates here, relative to this directory. 67 | templates_path = ['_templates'] 68 | 69 | # generate autosummary even if no references 70 | autosummary_generate = True 71 | 72 | # The suffix of source filenames. 73 | source_suffix = '.rst' 74 | 75 | # The encoding of source files. 76 | #source_encoding = 'utf-8-sig' 77 | 78 | # Generate the plots for the gallery 79 | # plot_gallery = True 80 | 81 | # The master toctree document. 
82 | master_doc = 'index' 83 | 84 | # -- Project information ----------------------------------------------------- 85 | 86 | project = 'forestci' 87 | copyright = '2016--, Kivan Polimis, Ariel Rokem, Bryna Hazelton, The University of Washington' 88 | author = "Kivan Polimis, Ariel Rokem, and Bryna Hazelton" 89 | 90 | # The version info for the project you're documenting, acts as replacement for 91 | # |version| and |release|, also used in various other places throughout the 92 | # built documents. 93 | # 94 | # The short X.Y version. 95 | from forestci import __version__ 96 | version = __version__ 97 | # The full version, including alpha/beta/rc tags. 98 | release = __version__ 99 | 100 | # The language for content autogenerated by Sphinx. Refer to documentation 101 | # for a list of supported languages. 102 | #language = None 103 | 104 | # There are two options for replacing |today|: either, you set today to some 105 | # non-false value, then it is used: 106 | #today = '' 107 | # Else, today_fmt is used as the format for a strftime call. 108 | #today_fmt = '%B %d, %Y' 109 | 110 | # List of patterns, relative to source directory, that match files and 111 | # directories to ignore when looking for source files. 112 | exclude_patterns = ['_build', '_templates'] 113 | 114 | # The reST default role (used for this markup: `text`) to use for all 115 | # documents. 116 | #default_role = None 117 | 118 | # If true, '()' will be appended to :func: etc. cross-reference text. 119 | #add_function_parentheses = True 120 | 121 | # If true, the current module name will be prepended to all description 122 | # unit titles (such as .. function::). 123 | add_module_names = False 124 | 125 | # If true, sectionauthor and moduleauthor directives will be shown in the 126 | # output. They are ignored by default. 127 | #show_authors = False 128 | 129 | # The name of the Pygments (syntax highlighting) style to use. 
130 | pygments_style = 'sphinx' 131 | 132 | # Custom style 133 | html_style = 'css/project-template.css' 134 | 135 | # A list of ignored prefixes for module index sorting. 136 | #modindex_common_prefix = [] 137 | 138 | # If true, keep warnings as "system message" paragraphs in the built documents. 139 | #keep_warnings = False 140 | 141 | 142 | # -- Options for HTML output ---------------------------------------------- 143 | 144 | # The theme to use for HTML and HTML Help pages. See the documentation for 145 | # a list of builtin themes. 146 | html_theme = 'sphinx_rtd_theme' 147 | 148 | # Theme options are theme-specific and customize the look and feel of a theme 149 | # further. For a list of options available for each theme, see the 150 | # documentation. 151 | #html_theme_options = {} 152 | 153 | # Add any paths that contain custom themes here, relative to this directory. 154 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 155 | 156 | # The name for this set of Sphinx documents. If None, it defaults to 157 | # " v documentation". 158 | #html_title = None 159 | 160 | # A shorter title for the navigation bar. Default is the same as html_title. 161 | #html_short_title = None 162 | 163 | # The name of an image file (relative to this directory) to place at the top 164 | # of the sidebar. 165 | #html_logo = None 166 | 167 | # The name of an image file (within the static path) to use as favicon of the 168 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 169 | # pixels large. 170 | #html_favicon = None 171 | 172 | # Add any paths that contain custom static files (such as style sheets) here, 173 | # relative to this directory. They are copied after the builtin static files, 174 | # so a file named "default.css" will overwrite the builtin "default.css". 175 | html_static_path = ['_static'] 176 | 177 | # Add any extra paths that contain custom files (such as robots.txt or 178 | # .htaccess) here, relative to this directory. 
These files are copied 179 | # directly to the root of the documentation. 180 | #html_extra_path = [] 181 | 182 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 183 | # using the given strftime format. 184 | #html_last_updated_fmt = '%b %d, %Y' 185 | 186 | # If true, SmartyPants will be used to convert quotes and dashes to 187 | # typographically correct entities. 188 | #html_use_smartypants = True 189 | 190 | # Custom sidebar templates, maps document names to template names. 191 | #html_sidebars = {} 192 | 193 | # Additional templates that should be rendered to pages, maps page names to 194 | # template names. 195 | #html_additional_pages = {} 196 | 197 | # If false, no module index is generated. 198 | #html_domain_indices = True 199 | 200 | # If false, no index is generated. 201 | #html_use_index = True 202 | 203 | # If true, the index is split into individual pages for each letter. 204 | #html_split_index = False 205 | 206 | # If true, links to the reST sources are added to the pages. 207 | #html_show_sourcelink = True 208 | 209 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 210 | #html_show_sphinx = True 211 | 212 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 213 | #html_show_copyright = True 214 | 215 | # If true, an OpenSearch description file will be output, and all pages will 216 | # contain a tag referring to it. The value of this option must be the 217 | # base URL from which the finished HTML is served. 218 | #html_use_opensearch = '' 219 | 220 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 221 | #html_file_suffix = None 222 | 223 | # Output file base name for HTML help builder. 224 | htmlhelp_basename = 'forestcidoc' 225 | 226 | 227 | # Example configuration for intersphinx: refer to the Python standard library. 
228 | # intersphinx configuration 229 | intersphinx_mapping = { 230 | 'python': ('https://docs.python.org/{.major}'.format( 231 | sys.version_info), None), 232 | 'numpy': ('https://docs.scipy.org/doc/numpy/', None), 233 | 'scipy': ('https://docs.scipy.org/doc/scipy/reference', None), 234 | 'matplotlib': ('https://matplotlib.org/', None), 235 | 'sklearn': ('http://scikit-learn.org/stable', None) 236 | } 237 | 238 | # sphinx-gallery configuration 239 | sphinx_gallery_conf = { 240 | 'doc_module': 'forestci', 241 | 'backreferences_dir': os.path.join('generated'), 242 | 'reference_url': { 243 | 'forestci': None} 244 | } 245 | 246 | def setup(app): 247 | # a copy button to copy snippet of code from the documentation 248 | app.add_js_file('js/copybutton.js') 249 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | ########################################## 2 | Contributing to ``forestci`` development 3 | ########################################## 4 | 5 | We welcome suggestions and contributions. To contribute to the software 6 | please submit a pull request with the proposed changes. If you are planning 7 | a large change to the software, it is a good idea to first submit an issue 8 | to discuss the changes that you are proposing to implement. 9 | 10 | 11 | Code of Conduct 12 | ---------------- 13 | 14 | We aim to make the use of this project and contribution to it a harassment-free 15 | experience for everyone, regardless of age, body size, visible or invisible 16 | disability, ethnicity, sex characteristics, gender identity and expression, 17 | level of experience, education, socio-economic status, nationality, personal 18 | appearance, race, religion, or sexual identity and orientation. 19 | 20 | We pledge to act and interact in ways that contribute to an open, welcoming, 21 | diverse, inclusive, and healthy community. 
22 | 23 | Our Standards 24 | ~~~~~~~~~~~~~~ 25 | 26 | Examples of behavior that contributes to a positive environment for our 27 | community include: 28 | 29 | * Demonstrating empathy and kindness toward other people 30 | * Being respectful of differing opinions, viewpoints, and experiences 31 | * Giving and gracefully accepting constructive feedback 32 | * Accepting responsibility and apologizing to those affected by our mistakes, 33 | and learning from the experience 34 | * Focusing on what is best not just for us as individuals, but for the 35 | overall community 36 | 37 | Examples of unacceptable behavior include: 38 | 39 | * The use of sexualized language or imagery, and sexual attention or 40 | advances of any kind 41 | * Trolling, insulting or derogatory comments, and personal or political attacks 42 | * Public or private harassment 43 | * Publishing others' private information, such as a physical or email 44 | address, without their explicit permission 45 | * Other conduct which could reasonably be considered inappropriate in a 46 | professional setting 47 | 48 | Enforcement Responsibilities 49 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 50 | 51 | Community leaders are responsible for clarifying and enforcing our standards of 52 | acceptable behavior and will take appropriate and fair corrective action in 53 | response to any behavior that they deem inappropriate, threatening, offensive, 54 | or harmful. 55 | 56 | Community leaders have the right and responsibility to remove, edit, or reject 57 | comments, commits, code, wiki edits, issues, and other contributions that are 58 | not aligned to this Code of Conduct, and will communicate reasons for moderation 59 | decisions when appropriate. 60 | 61 | Scope 62 | ~~~~~ 63 | 64 | This Code of Conduct applies within all community spaces, and also applies when 65 | an individual is officially representing the community in public spaces. 
66 | Examples of representing our community include using an official e-mail address, 67 | posting via an official social media account, or acting as an appointed 68 | representative at an online or offline event. 69 | 70 | Enforcement 71 | ~~~~~~~~~~~~~~ 72 | 73 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 74 | reported to the community leaders responsible for enforcement at 75 | `arokem@gmail.com `_ 76 | All complaints will be reviewed and investigated promptly and fairly. 77 | 78 | All community leaders are obligated to respect the privacy and security of the 79 | reporter of any incident. 80 | 81 | Enforcement Guidelines 82 | ~~~~~~~~~~~~~~~~~~~~~~~ 83 | Community leaders will follow these Community Impact Guidelines in determining 84 | the consequences for any action they deem in violation of this Code of Conduct: 85 | 86 | 1. Correction 87 | 88 | **Community Impact**: Use of inappropriate language or other behavior deemed 89 | unprofessional or unwelcome in the community. 90 | 91 | **Consequence**: A private, written warning from community leaders, providing 92 | clarity around the nature of the violation and an explanation of why the 93 | behavior was inappropriate. A public apology may be requested. 94 | 95 | 2. Warning 96 | 97 | **Community Impact**: A violation through a single incident or series 98 | of actions. 99 | 100 | **Consequence**: A warning with consequences for continued behavior. No 101 | interaction with the people involved, including unsolicited interaction with 102 | those enforcing the Code of Conduct, for a specified period of time. This 103 | includes avoiding interactions in community spaces as well as external channels 104 | like social media. Violating these terms may lead to a temporary or 105 | permanent ban. 106 | 107 | 3. Temporary Ban 108 | 109 | **Community Impact**: A serious violation of community standards, including 110 | sustained inappropriate behavior. 
111 | 112 | **Consequence**: A temporary ban from any sort of interaction or public 113 | communication with the community for a specified period of time. No public or 114 | private interaction with the people involved, including unsolicited interaction 115 | with those enforcing the Code of Conduct, is allowed during this period. 116 | Violating these terms may lead to a permanent ban. 117 | 118 | 4. Permanent Ban 119 | 120 | **Community Impact**: Demonstrating a pattern of violation of community 121 | standards, including sustained inappropriate behavior, harassment of an 122 | individual, or aggression toward or disparagement of classes of individuals. 123 | 124 | **Consequence**: A permanent ban from any sort of public interaction within 125 | the community. 126 | 127 | Attribution 128 | ~~~~~~~~~~~~ 129 | This Code of Conduct is adapted from the Contributor Covenant homepage 130 | version 2.0, available at: 131 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 132 | 133 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 134 | enforcement ladder](https://github.com/mozilla/diversity). 135 | 136 | For answers to common questions about this code of conduct, see the FAQ at 137 | https://www.contributor-covenant.org/faq. Translations are available at 138 | https://www.contributor-covenant.org/translations. 139 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | 2 | Confidence Intervals for Scikit Learn Random Forests 3 | ===================================================== 4 | 5 | Random forest algorithms are useful for both classification and regression 6 | problems. 
This package adds to scikit-learn the ability to calculate confidence 7 | intervals of the predictions generated from scikit-learn 8 | :class:`sklearn.ensemble.RandomForestRegressor` and :class:`sklearn.ensemble.RandomForestClassifier` objects. 9 | 10 | This is an implementation of an algorithm developed by Wager et al. [Wager2014]_ 11 | and previously implemented in R (`here `_). 12 | 13 | To examine and download the source code, visit our `github repo `_. 14 | 15 | .. [Wager2014] S. Wager, T. Hastie, B. Efron. "Confidence Intervals for 16 | Random Forests: The Jackknife and the Infinitesimal Jackknife", Journal 17 | of Machine Learning Research vol. 15, pp. 1625-1651, 2014. 18 | 19 | .. toctree:: 20 | :maxdepth: 2 21 | 22 | installation_guide 23 | api 24 | auto_examples/index 25 | contributing 26 | 27 | .. figure:: _static/eScience_Logo_HR.png 28 | :align: center 29 | :figclass: align-center 30 | :target: http://escience.washington.edu 31 | 32 | Acknowledgements: this work was supported by a grant from the 33 | `Gordon & Betty Moore Foundation `_, and from the 34 | `Alfred P. Sloan Foundation `_ to the 35 | `University of Washington eScience Institute `_ , and through a grant from the `Bill & Melinda Gates Foundation `_. 36 | -------------------------------------------------------------------------------- /docs/installation_guide.rst: -------------------------------------------------------------------------------- 1 | .. _installation_guide: 2 | 3 | Installation Guide 4 | ================== 5 | 6 | Before installing the `forestci` module, you will need `numpy`, `scipy` 7 | and `scikit-learn` 8 | 9 | .. code-block:: bash 10 | 11 | pip install numpy scipy scikit-learn 12 | 13 | Then, to install `forestci`: 14 | 15 | .. code-block:: bash 16 | 17 | pip install forestci 18 | 19 | If you wish to install from the source code (available `here `_ ), change your working directory to the top-level directory of the source code, and issue: 20 | 21 | .. 
code-block:: bash 22 | 23 | python setup.py install 24 | -------------------------------------------------------------------------------- /examples/README.txt: -------------------------------------------------------------------------------- 1 | .. _general_examples: 2 | 3 | Examples 4 | ========= 5 | 6 | The examples use data from standard machine learning libraries to demonstrate 7 | how `forestci` can be used to calculate error bars on 8 | :class:`RandomForestRegressor` and :class:`RandomForestClassifier` objects. The 9 | regression example uses a data-set from the `UC Irvine Machine Learning Repository `_ with features of 10 | different cars and their MPG. The classification example generates synthetic 11 | data to simulate a task like that of a spam filter: classifying items into one 12 | of two categories (e.g., spam/non-spam) based on a number of features. 13 | -------------------------------------------------------------------------------- /examples/plot_mpg.py: -------------------------------------------------------------------------------- 1 | """ 2 | ====================================== 3 | Plotting Regression Forest Error Bars 4 | ====================================== 5 | 6 | This example demonstrates using `forestci` to calculate the error bars of 7 | the predictions of a :class:`sklearn.ensemble.RandomForestRegressor` object. 8 | 9 | The data used here are a classical machine learning data-set, describing 10 | various features of different cars, and their MPG. 
11 | """ 12 | 13 | # Regression Forest Example 14 | import numpy as np 15 | from matplotlib import pyplot as plt 16 | from sklearn.ensemble import RandomForestRegressor 17 | import sklearn.model_selection as xval 18 | from sklearn.datasets import fetch_openml 19 | import forestci as fci 20 | 21 | # retreive mpg data from machine learning library 22 | mpg_data = fetch_openml(data_id=196) 23 | 24 | # separate mpg data into predictors and outcome variable 25 | mpg_X = mpg_data["data"] 26 | mpg_y = mpg_data["target"] 27 | 28 | # remove rows where the data is nan 29 | not_null_sel = np.where(mpg_X.isna().sum(axis=1).values == 0) 30 | mpg_X = mpg_X.values[not_null_sel] 31 | mpg_y = mpg_y.values[not_null_sel] 32 | 33 | # split mpg data into training and test set 34 | mpg_X_train, mpg_X_test, mpg_y_train, mpg_y_test = xval.train_test_split( 35 | mpg_X, 36 | mpg_y, 37 | test_size=0.25, 38 | random_state=42) 39 | 40 | # Create RandomForestRegressor 41 | n_trees = 2000 42 | mpg_forest = RandomForestRegressor(n_estimators=n_trees, random_state=42) 43 | mpg_forest.fit(mpg_X_train, mpg_y_train) 44 | mpg_y_hat = mpg_forest.predict(mpg_X_test) 45 | 46 | # Plot predicted MPG without error bars 47 | plt.scatter(mpg_y_test, mpg_y_hat) 48 | plt.plot([5, 45], [5, 45], 'k--') 49 | plt.xlabel('Reported MPG') 50 | plt.ylabel('Predicted MPG') 51 | plt.show() 52 | 53 | # Calculate the variance 54 | mpg_V_IJ_unbiased = fci.random_forest_error(mpg_forest, mpg_X_train.shape, 55 | mpg_X_test) 56 | 57 | # Plot error bars for predicted MPG using unbiased variance 58 | plt.errorbar(mpg_y_test, mpg_y_hat, yerr=np.sqrt(mpg_V_IJ_unbiased), fmt='o') 59 | plt.plot([5, 45], [5, 45], 'k--') 60 | plt.xlabel('Reported MPG') 61 | plt.ylabel('Predicted MPG') 62 | plt.show() 63 | -------------------------------------------------------------------------------- /examples/plot_mpg_svr.py: -------------------------------------------------------------------------------- 1 | """ 2 | 
====================================== 3 | Plotting Bagging Regression Error Bars 4 | ====================================== 5 | 6 | This example demonstrates using `forestci` to calculate the error bars of 7 | the predictions of a :class:`sklearn.ensemble.BaggingRegressor` object. 8 | 9 | The data used here are a classical machine learning data-set, describing 10 | various features of different cars, and their MPG. 11 | """ 12 | 13 | # Regression Forest Example 14 | import numpy as np 15 | from matplotlib import pyplot as plt 16 | from sklearn.ensemble import BaggingRegressor 17 | from sklearn.svm import SVR 18 | import sklearn.model_selection as xval 19 | from sklearn.datasets import fetch_openml 20 | import forestci as fci 21 | 22 | # retreive mpg data from machine learning library 23 | mpg_data = fetch_openml(data_id=196) 24 | 25 | # separate mpg data into predictors and outcome variable 26 | mpg_X = mpg_data["data"] 27 | mpg_y = mpg_data["target"] 28 | 29 | # remove rows where the data is nan 30 | not_null_sel = np.where(mpg_X.isna().sum(axis=1).values == 0) 31 | mpg_X = mpg_X.values[not_null_sel] 32 | mpg_y = mpg_y.values[not_null_sel] 33 | 34 | # split mpg data into training and test set 35 | mpg_X_train, mpg_X_test, mpg_y_train, mpg_y_test = xval.train_test_split( 36 | mpg_X, mpg_y, test_size=0.25, random_state=42 37 | ) 38 | 39 | # Create RandomForestRegressor 40 | n_estimators = 1000 41 | mpg_bagger = BaggingRegressor( 42 | estimator=SVR(), n_estimators=n_estimators, random_state=42 43 | ) 44 | mpg_bagger.fit(mpg_X_train, mpg_y_train) 45 | mpg_y_hat = mpg_bagger.predict(mpg_X_test) 46 | 47 | # Plot predicted MPG without error bars 48 | plt.scatter(mpg_y_test, mpg_y_hat) 49 | plt.plot([5, 45], [5, 45], "k--") 50 | plt.xlabel("Reported MPG") 51 | plt.ylabel("Predicted MPG") 52 | plt.show() 53 | 54 | # Calculate the variance 55 | mpg_V_IJ_unbiased = fci.random_forest_error(mpg_bagger, mpg_X_train.shape, mpg_X_test) 56 | 57 | # Plot error bars for predicted 
MPG using unbiased variance 58 | plt.errorbar(mpg_y_test, mpg_y_hat, yerr=np.sqrt(mpg_V_IJ_unbiased), fmt="o") 59 | plt.plot([5, 45], [5, 45], "k--") 60 | plt.xlabel("Reported MPG") 61 | plt.ylabel("Predicted MPG") 62 | plt.show() 63 | -------------------------------------------------------------------------------- /examples/plot_spam.py: -------------------------------------------------------------------------------- 1 | """ 2 | ========================================= 3 | Plotting Classification Forest Error Bars 4 | ========================================= 5 | 6 | This example demonstrates the calculation of confidence intervals for 7 | :class:`sklearn.ensemble.RandomForestClassifier` objects. 8 | 9 | The data used here are synthetically generated to simulate a data-set in which 10 | email messages are labeled as spam based on 20 different features (the default 11 | of :func:`sklearn.datasets.make_classification`). 12 | """ 13 | 14 | import numpy as np 15 | from matplotlib import pyplot as plt 16 | from sklearn.model_selection import train_test_split 17 | from sklearn.ensemble import RandomForestClassifier 18 | import forestci as fci 19 | from sklearn.datasets import make_classification 20 | 21 | spam_X, spam_y = make_classification(5000) 22 | 23 | # split the datainto training and test set 24 | spam_X_train, spam_X_test, spam_y_train, spam_y_test = train_test_split( 25 | spam_X, spam_y, 26 | test_size=0.2) 27 | 28 | # create RandomForestClassifier 29 | n_trees = 500 30 | spam_RFC = RandomForestClassifier(max_features=5, n_estimators=n_trees, 31 | random_state=42) 32 | spam_RFC.fit(spam_X_train, spam_y_train) 33 | spam_y_hat = spam_RFC.predict_proba(spam_X_test) 34 | 35 | idx_spam = np.where(spam_y_test == 1)[0] 36 | idx_ham = np.where(spam_y_test == 0)[0] 37 | 38 | # Histogram predictions without error bars: 39 | fig, ax = plt.subplots(1) 40 | ax.hist(spam_y_hat[idx_spam, 1], histtype='step', label='spam') 41 | ax.hist(spam_y_hat[idx_ham, 1], histtype='step', 
label='not spam') 42 | ax.set_xlabel('Prediction (spam probability)') 43 | ax.set_ylabel('Number of observations') 44 | plt.legend() 45 | 46 | # Calculate the variance 47 | spam_V_IJ_unbiased = fci.random_forest_error(spam_RFC, spam_X_train.shape, 48 | spam_X_test) 49 | 50 | # Plot forest prediction for emails and standard deviation for estimates 51 | # Blue points are spam emails; Green points are non-spam emails 52 | fig, ax = plt.subplots(1) 53 | ax.scatter(spam_y_hat[idx_spam, 1], 54 | np.sqrt(spam_V_IJ_unbiased[idx_spam]), 55 | label='spam') 56 | 57 | ax.scatter(spam_y_hat[idx_ham, 1], 58 | np.sqrt(spam_V_IJ_unbiased[idx_ham]), 59 | label='not spam') 60 | 61 | ax.set_xlabel('Prediction (spam probability)') 62 | ax.set_ylabel('Standard deviation') 63 | plt.legend() 64 | plt.show() 65 | -------------------------------------------------------------------------------- /forestci/__init__.py: -------------------------------------------------------------------------------- 1 | from .forestci import (calc_inbag, random_forest_error, 2 | _core_computation, _bias_correction) # noqa 3 | 4 | from .version import __version__ # noqa 5 | 6 | __all__ = ("calc_inbag", "random_forest_error") 7 | -------------------------------------------------------------------------------- /forestci/calibration.py: -------------------------------------------------------------------------------- 1 | """ 2 | Calibration based on empirical Bayes estimation [Efron2014]_. 3 | 4 | This calibration procedure can be useful when the number of trees in the 5 | random forest is small. 
6 | 7 | """ 8 | import warnings 9 | import numpy as np 10 | from scipy.optimize import minimize 11 | from scipy.signal import fftconvolve 12 | from scipy.stats import norm 13 | from .due import _due, _BibTeX 14 | 15 | __all__ = ("gfit", "gbayes", "calibrateEB") 16 | 17 | 18 | _due.cite(_BibTeX(""" 19 | @ARTICLE{Wager2014-wn, 20 | title = "Two modeling strategies for empirical Bayes estimation.", 21 | author = Efron, Bradley 22 | journal = "Stat. Sci.", 23 | volume = 29, 24 | number = 2, 25 | pages = "285--301", 26 | month = feb, 27 | year = 2014,}"""), 28 | description=("Confidence Intervals for Random", 29 | " Forests: The Jackknife and the Infinitesimal", 30 | "Jackknife"), 31 | path='forestci') 32 | 33 | 34 | def gfit(X, sigma, p=2, nbin=1000, unif_fraction=0.1): 35 | """ 36 | Fit empirical Bayes prior in the hierarchical model [Efron2014]_. 37 | 38 | .. math:: 39 | 40 | mu ~ G, X ~ N(mu, sigma^2) 41 | 42 | Parameters 43 | ---------- 44 | X: ndarray 45 | A 1D array of observations. 46 | sigma: float 47 | Noise estimate on X. 48 | p: int 49 | Number of parameters used to fit G. 50 | nbin: int 51 | Number of bins used for discrete approximation. 52 | unif_fraction: float 53 | Fraction of G modeled as "slab". 54 | 55 | Returns 56 | ------- 57 | An array of the posterior density estimate g. 
58 | """ 59 | min_x = max(min(X) - 2 * np.std(X, ddof=1), 0) 60 | max_x = max(max(X) + 2 * np.std(X, ddof=1), 61 | np.std(X, ddof=1)) 62 | xvals = np.linspace(min_x, max_x, nbin) 63 | 64 | noise_kernel = norm(scale=sigma,loc=xvals.mean()).pdf(xvals) 65 | noise_kernel /= noise_kernel.sum() 66 | 67 | mask = xvals > 0 68 | assert sum(mask) > 0 69 | g_eta_slab = mask / sum(mask) 70 | 71 | XX = np.column_stack([ pow(xvals, exp) for exp in range(1, p+1)]) 72 | XX /= np.sum(XX,axis = 0, keepdims=True) # normalize each feature column for better numerical stability 73 | 74 | def neg_loglik(eta): 75 | with np.errstate(over='ignore'): 76 | # if eta > 0 the exponential will likely get overflow. that is fine. 77 | g_eta_raw = np.exp(np.dot(XX, eta)) * mask 78 | 79 | if ((np.sum(g_eta_raw) == np.inf) | 80 | (np.sum(g_eta_raw) <= 81 | 100 * np.finfo(np.double).tiny)): 82 | return (1000 * (len(X) + sum(eta ** 2))) 83 | 84 | assert sum(g_eta_raw) > 0, "Unexpected error" 85 | assert np.isfinite(sum(g_eta_raw)), "Unexpected error" 86 | g_eta_main = g_eta_raw / sum(g_eta_raw) 87 | g_eta = ( 88 | (1 - unif_fraction) * g_eta_main + 89 | unif_fraction * g_eta_slab) 90 | f_eta = fftconvolve(g_eta, noise_kernel, mode='same') 91 | return np.sum(np.interp(X, xvals, 92 | -np.log(np.maximum(f_eta, 0.0000001)))) 93 | 94 | res = minimize( 95 | neg_loglik, 96 | np.full(p, -1, dtype='float'), 97 | tol=5e-5 # adjusted so that the MPG example in the docs passes 98 | ) 99 | if not res.success: 100 | warnings.warn("Fitting the empirical bayes prior failed with message %s." % res.message) 101 | eta_hat = res.x 102 | g_eta_raw = np.exp(np.dot(XX, eta_hat)) * mask 103 | g_eta_main = g_eta_raw / sum(g_eta_raw) 104 | g_eta = ( 105 | (1 - unif_fraction) * g_eta_main + 106 | unif_fraction * g_eta_slab) 107 | 108 | assert np.all(np.isfinite(g_eta)), "Fitting the empirical bayes prior failed." 
109 | return xvals, g_eta 110 | 111 | 112 | def gbayes(x0, g_est, sigma): 113 | """ 114 | Estimate Bayes posterior with Gaussian noise [Efron2014]_. 115 | 116 | Parameters 117 | ---------- 118 | x0: ndarray 119 | an observation 120 | g_est: (ndarray,ndarray) 121 | a prior density, as returned by gfit 122 | g_est[0] is the x-positions 123 | g_est[1] is the densities 124 | sigma: int 125 | noise estimate 126 | 127 | Returns 128 | ------- 129 | An array of the posterior estimate E[mu | x0] 130 | """ 131 | 132 | Kx = norm().pdf((g_est[0] - x0) / sigma) 133 | post = Kx * g_est[1] 134 | post /= sum(post) 135 | return sum(post * g_est[0]) 136 | 137 | 138 | def calibrateEB(variances, sigma2): 139 | """ 140 | Calibrate noisy variance estimates with empirical Bayes. 141 | 142 | Parameters 143 | ---------- 144 | vars: ndarray 145 | List of variance estimates. 146 | sigma2: int 147 | Estimate of the Monte Carlo noise in vars. 148 | 149 | Returns 150 | ------- 151 | An array of the calibrated variance estimates 152 | """ 153 | if (sigma2 <= 0 or min(variances) == max(variances)): 154 | return(np.maximum(variances, 0)) 155 | 156 | sigma = np.sqrt(sigma2) 157 | eb_prior = gfit(variances, sigma) 158 | 159 | if len(variances) >= 200: 160 | # Interpolate to speed up computations: 161 | calib_x = np.percentile(variances, 162 | np.arange(0, 102, 2)) 163 | calib_y = [gbayes(x,g_est=eb_prior,sigma=sigma) for x in calib_x] 164 | calib_all = np.interp(variances, calib_x, calib_y) 165 | else: 166 | calib_all = [gbayes(x,g_est=eb_prior,sigma=sigma) for x in variances] 167 | 168 | return np.asarray(calib_all) 169 | -------------------------------------------------------------------------------- /forestci/due.py: -------------------------------------------------------------------------------- 1 | # emacs: at the end of the file 2 | # ex: set sts=4 ts=4 sw=4 et: 3 | # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### # 4 | """ 5 | 6 | Due-credit 7 | ========== 8 | 9 | 
`duecredit `_ is a framework conceived to address the 10 | problem of inadequate citation of scientific software and methods. It automates 11 | the insertion of citations into code. We use it here to refer to the original 12 | publication introducing the method we have implemented. 13 | 14 | See https://github.com/duecredit/duecredit/blob/master/README.md for examples. 15 | 16 | Origin: Originally a part of the duecredit software package 17 | 18 | Copyright: 2015-2016 DueCredit developers 19 | 20 | License: BSD-2 21 | """ 22 | 23 | __version__ = '0.0.5' 24 | 25 | 26 | class _InactiveDueCreditCollector(object): 27 | """Just a stub at the Collector which would not do anything""" 28 | def _donothing(self, *args, **kwargs): 29 | """Perform no good and no bad""" 30 | pass 31 | 32 | def dcite(self, *args, **kwargs): 33 | """If I could cite I would""" 34 | def nondecorating_decorator(func): 35 | return func 36 | return nondecorating_decorator 37 | 38 | cite = load = add = _donothing 39 | 40 | def __repr__(self): 41 | return self.__class__.__name__ + '()' 42 | 43 | 44 | def _donothing_func(*args, **kwargs): 45 | """Perform no good and no bad""" 46 | pass 47 | 48 | try: 49 | from duecredit import due as _due 50 | from duecredit import BibTeX as _BibTeX 51 | from duecredit import Doi as _Doi 52 | from duecredit import Url as _Url 53 | if '_due' in locals() and not hasattr(_due, 'cite'): 54 | raise RuntimeError( 55 | "Imported due lacks .cite. 
DueCredit is now disabled") 56 | except Exception as e: 57 | if type(e).__name__ != 'ImportError': 58 | import logging 59 | logging.getLogger("duecredit").error( 60 | "Failed to import duecredit due to %s" % str(e)) 61 | # Initiate due stub 62 | _due = _InactiveDueCreditCollector() 63 | _BibTeX = _Doi = _Url = _donothing_func 64 | 65 | # Emacs mode definitions 66 | # Local Variables: 67 | # mode: python 68 | # py-indent-offset: 4 69 | # tab-width: 4 70 | # indent-tabs-mode: nil 71 | # End: 72 | -------------------------------------------------------------------------------- /forestci/forestci.py: -------------------------------------------------------------------------------- 1 | """ 2 | Forest confidence intervals. 3 | 4 | Calculate confidence intervals for scikit-learn RandomForestRegressor and 5 | RandomForestClassifier predictions. 6 | """ 7 | 8 | import numpy as np 9 | import copy 10 | 11 | from sklearn.ensemble._forest import BaseForest 12 | from sklearn.ensemble._forest import (_generate_sample_indices, 13 | _get_n_samples_bootstrap) 14 | from sklearn.ensemble._bagging import BaseBagging 15 | 16 | from .calibration import calibrateEB 17 | from .due import _due, _BibTeX 18 | 19 | __all__ = ("calc_inbag", "random_forest_error", "_bias_correction", 20 | "_core_computation") 21 | 22 | _due.cite( 23 | _BibTeX( 24 | """ 25 | @ARTICLE{Wager2014-wn, 26 | title = "Confidence Intervals for Random Forests: The Jackknife and the Infinitesimal Jackknife", 27 | author = "Wager, Stefan and Hastie, Trevor and Efron, Bradley", 28 | journal = "J. Mach. Learn. Res.", 29 | volume = 15, 30 | number = 1, 31 | pages = "1625--1651", 32 | month = jan, 33 | year = 2014,}""" 34 | ), 35 | description=( 36 | "Confidence Intervals for Random Forests:", 37 | "The Jackknife and the Infinitesimal Jackknife", 38 | ), 39 | path="forestci", 40 | ) 41 | 42 | 43 | def calc_inbag(n_samples, forest): 44 | """ 45 | Derive samples used to create trees in scikit-learn RandomForest objects. 
46 | 47 | Recovers the samples in each tree from the random state of that tree using 48 | :func:`forest._generate_sample_indices`. 49 | 50 | Parameters 51 | ---------- 52 | n_samples : int 53 | The number of samples used to fit the scikit-learn RandomForest object. 54 | 55 | forest : RandomForest 56 | Regressor or Classifier object that is already fit by scikit-learn. 57 | 58 | Returns 59 | ------- 60 | Array that records how many times a data point was placed in a tree. 61 | Columns are individual trees. Rows are the number of times a sample was 62 | used in a tree. 63 | """ 64 | 65 | if not forest.bootstrap: 66 | e_s = "Cannot calculate the inbag from a forest that has bootstrap=False" 67 | raise ValueError(e_s) 68 | 69 | n_trees = forest.n_estimators 70 | inbag = np.zeros((n_samples, n_trees)) 71 | sample_idx = [] 72 | if isinstance(forest, BaseForest): 73 | n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, forest.max_samples) 74 | 75 | for t_idx in range(n_trees): 76 | sample_idx.append( 77 | _generate_sample_indices( 78 | forest.estimators_[t_idx].random_state, 79 | n_samples, 80 | n_samples_bootstrap, 81 | ) 82 | ) 83 | inbag[:, t_idx] = np.bincount(sample_idx[-1], minlength=n_samples) 84 | elif isinstance(forest, BaseBagging): 85 | for t_idx, estimator_sample in enumerate(forest.estimators_samples_): 86 | sample_idx.append(estimator_sample) 87 | inbag[:, t_idx] = np.bincount(sample_idx[-1], minlength=n_samples) 88 | 89 | return inbag 90 | 91 | 92 | def _core_computation( 93 | X_train_shape, 94 | X_test, 95 | inbag, 96 | pred_centered, 97 | n_trees, 98 | memory_constrained=False, 99 | memory_limit=None, 100 | test_mode=False, 101 | ): 102 | """ 103 | Helper function, that performs the core computation 104 | 105 | Parameters 106 | ---------- 107 | X_train_shape : tuple (int, int) 108 | Shape (n_train_sample, n_features). 109 | 110 | X_test : ndarray 111 | An array with shape (n_test_sample, n_features). 
112 | 113 | inbag : ndarray 114 | The inbag matrix that fit the data. If set to `None` (default) it 115 | will be inferred from the forest. However, this only works for trees 116 | for which bootstrapping was set to `True`. That is, if sampling was 117 | done with replacement. Otherwise, users need to provide their own 118 | inbag matrix. 119 | 120 | pred_centered : ndarray 121 | Centered predictions that are an intermediate result in the 122 | computation. 123 | 124 | memory_constrained: boolean (optional) 125 | Whether or not there is a restriction on memory. If False, it is 126 | assumed that a ndarray of shape (n_train_sample,n_test_sample) fits 127 | in main memory. Setting to True can actually provide a speed up if 128 | memory_limit is tuned to the optimal range. 129 | 130 | memory_limit: int (optional) 131 | An upper bound for how much memory the intermediate matrices will take 132 | up in Megabytes. This must be provided if memory_constrained=True. 133 | 134 | 135 | """ 136 | if not memory_constrained: 137 | return np.sum((np.dot(inbag - 1, pred_centered.T) / n_trees) ** 2, 0) 138 | 139 | if not memory_limit: 140 | raise ValueError("If memory_constrained=True, must provide", "memory_limit.") 141 | 142 | # Assumes double precision float 143 | chunk_size = int((memory_limit * 1e6) / (8.0 * X_train_shape[0])) 144 | 145 | if chunk_size == 0: 146 | min_limit = 8.0 * X_train_shape[0] / 1e6 147 | raise ValueError( 148 | "memory_limit provided is too small."
149 | + "For these dimensions, memory_limit must " 150 | + "be greater than or equal to %.3e" % min_limit 151 | ) 152 | 153 | chunk_edges = np.arange(0, X_test.shape[0] + chunk_size, chunk_size) 154 | inds = range(X_test.shape[0]) 155 | chunks = [ 156 | inds[chunk_edges[i] : chunk_edges[i + 1]] for i in range(len(chunk_edges) - 1) 157 | ] 158 | if test_mode: 159 | print("Number of chunks: %d" % (len(chunks),)) 160 | V_IJ = np.concatenate( 161 | [ 162 | np.sum((np.dot(inbag - 1, pred_centered[chunk].T) / n_trees) ** 2, 0) 163 | for chunk in chunks 164 | ] 165 | ) 166 | return V_IJ 167 | 168 | 169 | def _bias_correction(V_IJ, inbag, pred_centered, n_trees): 170 | """ 171 | Helper functions that implements bias correction 172 | 173 | Parameters 174 | ---------- 175 | V_IJ : ndarray 176 | Intermediate result in the computation. 177 | 178 | inbag : ndarray 179 | The inbag matrix that fit the data. If set to `None` (default) it 180 | will be inferred from the forest. However, this only works for trees 181 | for which bootstrapping was set to `True`. That is, if sampling was 182 | done with replacement. Otherwise, users need to provide their own 183 | inbag matrix. 184 | 185 | pred_centered : ndarray 186 | Centered predictions that are an intermediate result in the 187 | computation. 188 | 189 | n_trees : int 190 | The number of trees in the forest object. 191 | """ 192 | n_train_samples = inbag.shape[0] 193 | n_var = np.mean( 194 | np.square(inbag[0:n_trees]).mean(axis=1).T.view() 195 | - np.square(inbag[0:n_trees].mean(axis=1)).T.view() 196 | ) 197 | boot_var = np.square(pred_centered).sum(axis=1) / n_trees 198 | bias_correction = n_train_samples * n_var * boot_var / n_trees 199 | V_IJ_unbiased = V_IJ - bias_correction 200 | return V_IJ_unbiased 201 | 202 | 203 | def _centered_prediction_forest(forest, X_test, y_output=None): 204 | """ 205 | Center the tree predictions by the mean prediction (forest) 206 | 207 | The centering is done for all provided test samples. 
def random_forest_error(
    forest,
    X_train_shape,
    X_test,
    inbag=None,
    calibrate=True,
    memory_constrained=False,
    memory_limit=None,
    y_output=None,
):
    """
    Calculate error bars from scikit-learn RandomForest estimators.

    RandomForest is a regressor or classifier object;
    this variance can be used to plot error bars for RandomForest objects.

    Parameters
    ----------
    forest : RandomForest
        Regressor or Classifier object.

    X_train_shape : tuple (int, int)
        Shape (n_train_sample, n_features) of the design matrix for training data.

    X_test : ndarray
        An array with shape (n_test_sample, n_features). The design matrix
        for testing data

    inbag : ndarray, optional
        The inbag matrix that fit the data. If set to `None` (default) it
        will be inferred from the forest. However, this only works for trees
        for which bootstrapping was set to `True`. That is, if sampling was
        done with replacement. Otherwise, users need to provide their own
        inbag matrix.

    calibrate: boolean, optional
        Whether to apply calibration to mitigate Monte Carlo noise.
        Some variance estimates may be negative due to Monte Carlo effects
        if the number of trees in the forest is too small; calibration
        corrects for this. Default: True.

    memory_constrained: boolean, optional
        Whether or not there is a restriction on memory. If False, it is
        assumed that a ndarray of shape (n_train_sample, n_test_sample) fits
        in main memory. Setting to True can actually provide a speed up if
        memory_limit is tuned to the optimal range.

    memory_limit: int, optional.
        An upper bound for how much memory the intermediate matrices will take
        up in Megabytes. This must be provided if memory_constrained=True.

    y_output: int, mandatory only for MultiOutput regressor.
        In case of MultiOutput regressor, indicate the index of the target to
        analyse. The program will return the IJ variance related to that target
        only.

    Returns
    -------
    An array with the unbiased sampling variance (V_IJ_unbiased)
    for a RandomForest object.

    Raises
    ------
    ValueError
        If the forest is multi-output and `y_output` was not specified.

    See Also
    ----------
    :func:`calc_inbag`

    Notes
    -----
    The calculation of error is based on the infinitesimal jackknife variance,
    as described in [Wager2014]_ and is a Python implementation of the R code
    provided at: https://github.com/swager/randomForestCI

    .. [Wager2014] S. Wager, T. Hastie, B. Efron. "Confidence Intervals for
        Random Forests: The Jackknife and the Infinitesimal Jackknife", Journal
        of Machine Learning Research vol. 15, pp. 1625-1651, 2014.
    """
    # IDIOM FIX: use hasattr instead of `in dir()` and `is None` instead
    # of `== None` (flake8 E711).
    if hasattr(forest, "n_outputs_") and forest.n_outputs_ > 1 and y_output is None:
        e_s = "MultiOutput regressor: specify the index of the target to analyse (y_output)"
        raise ValueError(e_s)

    if inbag is None:
        inbag = calc_inbag(X_train_shape[0], forest)

    pred_centered = _centered_prediction_forest(forest, X_test, y_output)
    n_trees = forest.n_estimators
    V_IJ = _core_computation(
        X_train_shape, X_test, inbag, pred_centered, n_trees, memory_constrained, memory_limit
    )
    V_IJ_unbiased = _bias_correction(V_IJ, inbag, pred_centered, n_trees)

    # Correct for cases where resampling is done without replacement:
    if np.max(inbag) == 1:
        variance_inflation = 1 / (1 - np.mean(inbag)) ** 2
        V_IJ_unbiased *= variance_inflation

    if not calibrate:
        return V_IJ_unbiased

    if V_IJ_unbiased.shape[0] <= 20:
        print("No calibration with n_samples <= 20: ",
              "consider using more n_estimators in your model, ",
              "for more accurate ci and to avoid negative values.")
        return V_IJ_unbiased

    # Calibration is a correction for converging quicker to the case of
    # infinite n_estimators, as presented in Wager (2014)
    # http://jmlr.org/papers/v15/wager14a.html
    # (The redundant `if calibrate:` guard was removed: calibrate is
    # necessarily True past the early return above.)
    calibration_ratio = 2
    n_sample = np.ceil(n_trees / calibration_ratio)
    # Build a half-sized sub-forest by sampling trees without replacement;
    # its variance estimates are used to gauge the Monte Carlo noise scale.
    new_forest = copy.deepcopy(forest)
    random_idx = np.random.permutation(len(new_forest.estimators_))[: int(n_sample)]
    new_forest.estimators_ = list(np.array(new_forest.estimators_)[random_idx])
    if hasattr(new_forest, "_seeds"):
        new_forest._seeds = new_forest._seeds[random_idx]

    new_forest.n_estimators = int(n_sample)

    results_ss = random_forest_error(
        new_forest,
        X_train_shape,
        X_test,
        calibrate=False,
        memory_constrained=memory_constrained,
        memory_limit=memory_limit,
        y_output=y_output,
    )
    # Use this second set of variance estimates
    # to estimate scale of Monte Carlo noise
    sigma2_ss = np.mean((results_ss - V_IJ_unbiased) ** 2)
    delta = n_sample / n_trees
    sigma2 = (delta ** 2 + (1 - delta) ** 2) / (2 * (1 - delta) ** 2) * sigma2_ss

    # Use Monte Carlo noise scale estimate for empirical Bayes calibration
    V_IJ_calibrated = calibrateEB(V_IJ_unbiased, sigma2)

    return V_IJ_calibrated
def test_random_forest_error_multioutput():
    # Two-target regression: forestci must be told which output to analyse.
    X = np.array([[5, 2], [5, 5], [3, 3], [6, 4], [6, 6]])

    y = np.array([[70, 37], [100, 55], [60, 33], [100, 54], [120, 66]])

    train_idx = [2, 3, 4]
    test_idx = [0, 1]

    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    forest = RandomForestRegressor(n_estimators=4)
    forest.fit(X_train, y_train)

    # Asking for target 0 yields one variance estimate per test sample.
    v_target0 = fci.random_forest_error(
        forest, X_train.shape, X_test, calibrate=True, y_output=0
    )
    npt.assert_equal(v_target0.shape[0], y_test.shape[0])

    # With a MultiOutput RandomForestRegressor the user MUST specify a y_output
    npt.assert_raises(
        ValueError,
        fci.random_forest_error,
        forest,
        X_train.shape,
        X_test,
        inbag=None,
        calibrate=True,
        memory_constrained=False,
        memory_limit=None,
        y_output=None,  # This should trigger the ValueError
    )
def test_core_computation():
    """Check _core_computation against reference values from the R code."""
    inbag_ex = np.array(
        [[1.0, 2.0, 0.0, 1.0], [1.0, 0.0, 2.0, 0.0], [1.0, 1.0, 1.0, 2.0]]
    )

    X_train_ex = np.array([[3, 3], [6, 4], [6, 6]])
    X_test_ex = np.vstack([np.array([[5, 2], [5, 5]]) for _ in range(1000)])
    pred_centered_ex = np.vstack(
        [np.array([[-20, -20, 10, 30], [-20, 30, -20, 10]]) for _ in range(1000)]
    )
    n_trees = 4

    our_vij = fci._core_computation(
        X_train_ex.shape, X_test_ex, inbag_ex, pred_centered_ex, n_trees
    )

    # Reference values computed with the original R implementation.
    r_vij = np.concatenate([np.array([112.5, 387.5]) for _ in range(1000)])

    npt.assert_almost_equal(our_vij, r_vij)

    # The memory-constrained and unconstrained paths must agree.
    # BUG FIX: the loop previously ignored `mc` and `ml` and hard-coded
    # memory_constrained=True, memory_limit=0.01, so the unconstrained
    # branch was never exercised by this loop.
    for mc, ml in zip([True, False], [0.01, None]):
        our_vij = fci._core_computation(
            X_train_ex.shape,
            X_test_ex,
            inbag_ex,
            pred_centered_ex,
            n_trees,
            memory_constrained=mc,
            memory_limit=ml,
            test_mode=True,
        )

        npt.assert_almost_equal(our_vij, r_vij)
def test_centered_prediction_forest():
    X = np.array([[5, 2],
                  [5, 5],
                  [3, 3],
                  [6, 4],
                  [6, 6]])

    y = np.array([70, 100, 60, 100, 120])

    train_idx = [2, 3, 4]
    test_idx = [0, 1]

    X_train, y_train = X[train_idx], y[train_idx]
    X_test = X[test_idx]

    n_trees = 8
    forest = RandomForestRegressor(n_estimators=n_trees).fit(X_train, y_train)

    # test different amount of test samples
    for n_test in range(1, len(X_test) + 1):
        batch = X_test[:n_test]
        pred_centered = fci.forestci._centered_prediction_forest(forest, batch)

        # the vectorized solution has to match the single sample predictions
        for row, sample in zip(pred_centered, batch):
            # Equality here assures correctness of single test sample
            # calculations; no additional tests for correct averaging are
            # required since for single test samples dimension 0 (i.e. the
            # number of test sets) disappears.
            single = fci.forestci._centered_prediction_forest(forest, sample)
            assert len(single[0]) == n_trees
            npt.assert_almost_equal(single[0], row)
_README: https://github.com/scikit-learn-contrib/forest-confidence-interval/blob/master/README.md 42 | 43 | .. _documentation: http://contrib.scikit-learn.org/forest-confidence-interval/ 44 | 45 | """ 46 | 47 | NAME = "forestci" 48 | MAINTAINER = "Ariel Rokem" 49 | MAINTAINER_EMAIL = "arokem@uw.edu" 50 | DESCRIPTION = description 51 | LONG_DESCRIPTION = long_description 52 | URL = "http://github.com/scikit-learn-contrib/forest-confidence-interval" 53 | DOWNLOAD_URL = "" 54 | LICENSE = "MIT" 55 | AUTHOR = "Ariel Rokem, Bryna Hazelton, Kivan Polimis" 56 | AUTHOR_EMAIL = "arokem@uw.edu" 57 | PLATFORMS = "OS Independent" 58 | MAJOR = _version_major 59 | MINOR = _version_minor 60 | MICRO = _version_micro 61 | VERSION = __version__ 62 | -------------------------------------------------------------------------------- /paper/paper.bib: -------------------------------------------------------------------------------- 1 | 2 | @article{wager_confidence_2014, 3 | title = {Confidence {Intervals} for {Random} {Forests}: {The} {Jackknife} and the {Infinitesimal} {Jackknife}}, 4 | volume = {15}, 5 | issn = {1532-4435}, 6 | url = {http://dl.acm.org/citation.cfm?id=2627435.2638587}, 7 | number = {1}, 8 | journal = {Journal of Machine Learning Research}, 9 | author = {Wager, Stefan and Hastie, Trevor and Efron, Bradley}, 10 | month = jan, 11 | year = {2014}, 12 | keywords = {bagging, jackknife methods, Monte Carlo noise, variance estimation}, 13 | pages = {1625--1651} 14 | } 15 | 16 | @misc{wager_randomforestci_2016, 17 | title = {{randomForestCI}}, 18 | url = {https://github.com/swager/randomForestCI}, 19 | abstract = {randomForestCI}, 20 | urldate = {2016-09-23}, 21 | author = {Wager, Stefan}, 22 | month = sep, 23 | year = {2016} 24 | } 25 | 26 | @inproceedings{quinlan_combining_1993, 27 | address = {San Francisco, CA, USA}, 28 | series = {{ICML}'93}, 29 | title = {Combining {Instance}-based and {Model}-based {Learning}}, 30 | isbn = {1-55860-307-7}, 31 | url = 
{http://dl.acm.org/citation.cfm?id=3091529.3091560}, 32 | booktitle = {Proceedings of the {Tenth} {International} {Conference} on {International} {Conference} on {Machine} {Learning}}, 33 | publisher = {Morgan Kaufmann Publishers Inc.}, 34 | author = {Quinlan, J. Ross}, 35 | year = {1993}, 36 | pages = {236--243} 37 | } -------------------------------------------------------------------------------- /paper/paper.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Confidence Intervals for Random Forests in Python' 3 | tags: 4 | - Python 5 | - scikit-learn 6 | - random forest 7 | - confidence intervals 8 | authors: 9 | - name: Kivan Polimis 10 | orcid: 0000-0002-3498-0479 11 | affiliation: 1 12 | - name: Ariel Rokem 13 | orcid: 0000-0003-0679-1985 14 | affiliation: 1 15 | - name: Bryna Hazelton 16 | orcid: 0000-0001-7532-645X 17 | affiliation: 1 18 | affiliations: 19 | - name: eScience Institute, University of Washington 20 | index: 1 21 | date: 7 July 2017 22 | bibliography: paper.bib 23 | --- 24 | 25 | # Summary 26 | Random forests are a method for predicting numerous ensemble learning tasks. Prediction variability can illustrate how influential the training set is for producing the observed random forest predictions and provides additional information about prediction accuracy. `forest-confidence-interval` is a Python module for calculating variance and adding confidence intervals to `scikit-learn` random forest regression or classification objects. The core functions calculate an in-bag and error bars for random forest objects. Our software is designed for individuals using `scikit-learn` random forest objects that want to add estimates of uncertainty to random forest predictors. This module is an implementation of an algorithm developed by @wager_confidence_2014 and previously implemented in R [@wager_randomforestci_2016]. 
27 | 28 | # Usage 29 | Our package's `random_forest_error` and `calc_inbag` functions use the random forest object (including training and test data) to create variance estimates that can be plotted (e.g. as confidence intervals or standard deviations). The in-bag matrix that fit the data is set to `None` by default, and the in-bag will be inferred from the forest. However, this only works for trees for which bootstrapping was set to `True`. That is, if sampling was done with replacement. Otherwise, users need to provide their own in-bag matrix. 30 | 31 | # Examples gallery 32 | The regression example uses a slightly modified data-set from the Carnegie Mellon University's StatLib library (available from the [UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Auto+MPG)) with features of different cars and their MPG [@quinlan_combining_1993]. The classification example generates synthetic data to simulate a task like that of a spam filter: classifying items into one of two categories (e.g., spam/non-spam) based on a number of features. This module will work for matrices or `pandas` data frames. Then, `scikit-learn` functions split the example data into training and test data and generate a random forest object (regression or classifier). The examples calculate variance from random forest objects that use the highest mean probability estimate across the trees. The focus on means for estimates and unit comparability between sample mean and dispersion measures is the basis for plotting with the square root of the variance (standard deviation). As the plots with variance estimated show, some predictions have more error than others. For instance, in the regression (MPG) example, predictions of higher mileage MPG are associated with greater variance than lower mileage predictions. 
33 | 34 | ## Regression example 35 | 36 | ### No variance estimated 37 | -![plot-mpg-no-variance](plot_mpg_no_variance.png) 38 | 39 | ### Plot with variance 40 | -![plot-mpg-variance](plot_mpg.png) 41 | 42 | ## Classification example 43 | ### No variance estimated 44 | -![plot-spam-no-variance](plot_spam_no_variance.png) 45 | 46 | ### Plot with variance 47 | -![plot-spam](plot_spam.png) 48 | 49 | 50 | ## Community guidelines 51 | 52 | Contributions are very welcome, but we ask that contributors abide by the [contributor covenant](http://contributor-covenant.org/version/1/4/). 53 | 54 | To report issues with the software, please post to the 55 | [issue log](https://github.com/scikit-learn-contrib/forest-confidence-interval/issues) 56 | Bug reports are also appreciated, please add them to the issue log after 57 | verifying that the issue does not already exist. 58 | Comments on existing issues are also welcome. 59 | 60 | Please submit improvements as pull requests against the repo after verifying 61 | that the existing tests pass and any new code is well covered by unit tests. 62 | Please write code that complies with the Python style guide, 63 | [PEP8](https://www.python.org/dev/peps/pep-0008/) 64 | 65 | Please e-mail [Ariel Rokem](mailto:arokem@gmail.com), [Kivan Polimis](mailto:kivan.polimis@gmail.com), or [Bryna Hazelton](mailto:brynah@phys.washington.edu ) if you have any questions, suggestions or feedback. 
66 | 67 | # References 68 | -------------------------------------------------------------------------------- /paper/plot_mpg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/forest-confidence-interval/5241344a342810b3ea21413bf2ebcf4e90ec6976/paper/plot_mpg.png -------------------------------------------------------------------------------- /paper/plot_mpg_no_variance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/forest-confidence-interval/5241344a342810b3ea21413bf2ebcf4e90ec6976/paper/plot_mpg_no_variance.png -------------------------------------------------------------------------------- /paper/plot_spam.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/forest-confidence-interval/5241344a342810b3ea21413bf2ebcf4e90ec6976/paper/plot_spam.png -------------------------------------------------------------------------------- /paper/plot_spam_no_variance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/forest-confidence-interval/5241344a342810b3ea21413bf2ebcf4e90ec6976/paper/plot_spam_no_variance.png -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | sphinx 3 | pytest==5.2.2 4 | pytest-cov==2.8.1 5 | flake8 6 | sphinx_gallery 7 | sphinx_rtd_theme 8 | numpydoc 9 | sphinx-autoapi 10 | matplotlib 11 | pillow 12 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.20 2 | scikit-learn>=0.23.1 3 | 
-------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys, os 3 | import warnings 4 | from setuptools import setup, find_packages 5 | 6 | with open('requirements.txt') as f: 7 | INSTALL_REQUIRES = [l.strip() for l in f.readlines() if l] 8 | 9 | try: 10 | import numpy 11 | except ImportError: 12 | warnings.warn('numpy is required during installation', ImportWarning) 13 | 14 | try: 15 | import scipy 16 | except ImportError: 17 | warnings.warn('scipy is required during installation', ImportWarning) 18 | 19 | # Get version and release info, which is all stored in forestci/version.py 20 | ver_file = os.path.join('forestci', 'version.py') 21 | with open(ver_file) as f: 22 | exec(f.read()) 23 | 24 | opts = dict(name=NAME, 25 | maintainer=MAINTAINER, 26 | maintainer_email=MAINTAINER_EMAIL, 27 | description=DESCRIPTION, 28 | long_description=LONG_DESCRIPTION, 29 | url=URL, 30 | download_url=DOWNLOAD_URL, 31 | license=LICENSE, 32 | classifiers=CLASSIFIERS, 33 | author=AUTHOR, 34 | author_email=AUTHOR_EMAIL, 35 | platforms=PLATFORMS, 36 | version=VERSION, 37 | packages=find_packages(), 38 | install_requires=INSTALL_REQUIRES) 39 | 40 | if __name__ == '__main__': 41 | setup(**opts) 42 | --------------------------------------------------------------------------------