├── .gitattribute ├── .gitignore ├── .landscape.yaml ├── .travis.yml ├── Makefile ├── README.rst ├── continuous_integration ├── install.sh └── test_script.sh ├── doc ├── Makefile ├── _templates │ ├── class.rst │ ├── class_with_call.rst │ ├── function.rst │ └── layout.html ├── conf.py ├── images │ └── no_image.png ├── index.rst ├── make.bat ├── references.rst └── sphinxext │ ├── LICENSE.txt │ ├── MANIFEST.in │ ├── README.txt │ ├── gen_rst.py │ ├── install_sphinx_bootstrap_theme.sh │ └── numpy_ext │ ├── __init__.py │ ├── docscrape.py │ ├── docscrape_sphinx.py │ └── numpydoc.py ├── examples ├── README.txt ├── plot_randomized_output_decision_tree.py └── plot_variance_preservation.py ├── random_output_trees ├── __init__.py ├── _sklearn_tree.c ├── _sklearn_tree.pxd ├── _sklearn_tree.pyx ├── _sklearn_tree_utils.c ├── _sklearn_tree_utils.pxd ├── _sklearn_tree_utils.pyx ├── _tree.c ├── _tree.pyx ├── _utils.py ├── datasets.py ├── ensemble │ ├── __init__.py │ ├── _sklearn_forest.py │ ├── forest.py │ ├── lazy_bagging.py │ └── tests │ │ ├── test_forest.py │ │ └── test_lazy_bagging.py ├── random_projection.py ├── setup.py ├── tests │ ├── test_datasets.py │ ├── test_random_projection.py │ ├── test_sklearn_ensemble.py │ ├── test_sklearn_tree.py │ ├── test_transformer.py │ ├── test_tree.py │ └── test_validations.py ├── transformer.py └── tree.py ├── setup.cfg └── setup.py /.gitattribute: -------------------------------------------------------------------------------- 1 | /random_output_trees/_tree.c -diff 2 | /random_output_trees/_sklearn_tree.c -diff 3 | /random_output_trees/_sklearn_tree_utils.c -diff 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | eggs/ 15 | lib/ 16 | lib64/ 17 | parts/ 18 | sdist/ 19 | var/ 20 | *.egg-info/ 21 | .installed.cfg 22 | *.egg 23 | 24 | # PyInstaller 25 | # Usually these files are written by a python script from a template 26 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
27 | *.manifest 28 | *.spec 29 | 30 | # Installer logs 31 | pip-log.txt 32 | pip-delete-this-directory.txt 33 | 34 | # Unit test / coverage reports 35 | htmlcov/ 36 | .tox/ 37 | .coverage 38 | .cache 39 | nosetests.xml 40 | coverage.xml 41 | 42 | # Translations 43 | *.mo 44 | *.pot 45 | 46 | # Django stuff: 47 | *.log 48 | 49 | # Sphinx documentation 50 | docs/_build/ 51 | 52 | # PyBuilder 53 | target/ 54 | 55 | # Cython 56 | cython_debug/ 57 | .DS_Store 58 | 59 | 60 | doc/_build/ 61 | doc/auto_examples/ 62 | doc/generated/ 63 | .buildinfo 64 | doc/modules/generated 65 | _sources 66 | -------------------------------------------------------------------------------- /.landscape.yaml: -------------------------------------------------------------------------------- 1 | doc-warnings: yes 2 | test-warnings: yes 3 | strictness: veryhigh 4 | max-line-length: 80 5 | autodetect: yes 6 | ignore-paths: 7 | - doc 8 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | virtualenv: 3 | system_site_packages: true 4 | env: 5 | matrix: 6 | - PYTHON_VERSION="2.7" 7 | COVERAGE="true" NUMPY_VERSION="1.6.2" SCIPY_VERSION="0.11.0" 8 | # This environment tests the oldest supported anaconda env 9 | - PYTHON_VERSION="2.6" 10 | NUMPY_VERSION="1.6.2" SCIPY_VERSION="0.11.0" 11 | # This environment tests the newest supported anaconda env 12 | - PYTHON_VERSION="3.4" 13 | NUMPY_VERSION="1.8.2" SCIPY_VERSION="0.14.0" 14 | install: source continuous_integration/install.sh 15 | script: bash continuous_integration/test_script.sh 16 | after_success: 17 | # Ignore coveralls failures as the coveralls server is not very reliable 18 | # but we don't want travis to report a failure in the github UI just 19 | # because the coverage report failed to be published. 20 | - if [[ "$COVERAGE" == "true" ]]; then coveralls || echo "failed"; fi 21 | cache: apt 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Author: Arnaud Joly 2 | 3 | all: clean inplace test 4 | 5 | clean: 6 | python setup.py clean 7 | 8 | in: inplace 9 | 10 | inplace: 11 | python setup.py build_ext --inplace 12 | 13 | test: 14 | nosetests random_output_trees 15 | 16 | doc: inplace 17 | $(MAKE) -C doc html 18 | 19 | doc-noplot: inplace 20 | $(MAKE) -C doc html-noplot 21 | 22 | view-doc: doc 23 | open doc/_build/html/index.html 24 | 25 | gh-pages: 26 | git checkout master 27 | make doc 28 | rm -rf ../random-output-trees-doc 29 | cp -a doc/_build/html ../random-output-trees-doc 30 | git checkout gh-pages 31 | cp -a ../random-output-trees-doc/* . 32 | echo 'Add new file to git' 33 | git add `ls ../random-output-trees-doc` 34 | git commit -m "Generated gh-pages for `git log master -1 --pretty=short --abbrev-commit`" 35 | git push origin gh-pages 36 | git checkout master 37 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Random output trees 2 | =================== 3 | 4 | .. image:: https://travis-ci.org/arjoly/random-output-trees.svg?branch=master 5 | :target: https://travis-ci.org/arjoly/random-output-trees 6 | :alt: Build status 7 | 8 | .. 
image:: https://coveralls.io/repos/arjoly/random-output-trees/badge.png?branch=master 9 | :target: https://coveralls.io/r/arjoly/random-output-trees?branch=master 10 | 11 | .. image:: https://landscape.io/github/arjoly/random-output-trees/master/landscape.svg 12 | :target: https://landscape.io/github/arjoly/random-output-trees/master 13 | :alt: Code Health 14 | 15 | 16 | Random output trees is a Python package to grow decision tree ensembles on 17 | a randomized output space. The core tree implementation is based on scikit-learn 18 | 0.15.2. All provided estimators and transformers are scikit-learn compatible. 19 | 20 | If you use this package, please cite 21 | 22 | Joly, A., Geurts, P., & Wehenkel, L. (2014). Random forests with random 23 | projections of the output space for high dimensional multi-label 24 | classification. 25 | 26 | ECML-PKDD 2014, Nancy, France 27 | 28 | 29 | The paper is available at http://orbi.ulg.ac.be/handle/2268/172146. 30 | 31 | Documentation 32 | ------------- 33 | 34 | The documentation is available at http://arjoly.github.io/random-output-trees/ 35 | 36 | 37 | Dependencies 38 | ------------ 39 | 40 | The required dependencies to build the software are Python >= 2.7, 41 | NumPy >= 1.6.2, SciPy >= 0.9, scikit-learn>=0.15.2 and a working C/C++ 42 | compiler. 43 | 44 | Matplotlib >= 1.1.1 is required to run the examples and nose >= 1.1.2 to run 45 | the tests. 46 | 47 | For building the documentation, Sphinx==1.2.2 and sphinx-bootstrap-theme==0.4.0 48 | are needed. 49 | 50 | 51 | Install 52 | ------- 53 | 54 | This package uses distutils, which is the default way of installing 55 | python modules. To install in your home directory, use:: 56 | 57 | python setup.py install --user 58 | 59 | To install for all users on Unix/Linux:: 60 | 61 | python setup.py build 62 | sudo python setup.py install 63 | 64 | 65 | Development 66 | ----------- 67 | 68 | You can check out the latest sources with the command:: 69 | 70 | git clone https://github.com/arjoly/random-output-trees 71 | 72 | or if you have write privileges:: 73 | 74 | git clone git@github.com:arjoly/random-output-trees.git 75 | 76 | After installation, you can launch the test suite from outside the 77 | source directory (you will need to have the ``nose`` package installed):: 78 | 79 | $ nosetests -v random_output_trees 80 | 81 |
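Since the estimators follow the scikit-learn API, a forest can be fitted on a
multi-label problem like any other scikit-learn estimator. The snippet below is
an illustrative sketch only: the class name comes from the reference
documentation, the data is synthetic, and the ``n_estimators`` and
``random_state`` arguments are assumed to follow the usual scikit-learn
conventions::

    import numpy as np
    from random_output_trees.ensemble import RandomForestClassifier

    # Toy multi-label problem: 100 samples, 10 input features, 50 binary labels
    rng = np.random.RandomState(0)
    X = rng.rand(100, 10)
    Y = (rng.rand(100, 50) > 0.9).astype(int)

    # Scikit-learn compatible forest provided by this package
    forest = RandomForestClassifier(n_estimators=10, random_state=0)
    forest.fit(X, Y)
    print(forest.predict(X[:5]).shape)  # one row per sample, one column per label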
82 | Licenses 83 | -------- 84 | 85 | Copyright (c) 2014, Arnaud Joly. All rights reserved. 86 | 87 | Redistribution and use in source and binary forms, with or without 88 | modification, are permitted provided that the following conditions are met: 89 | 90 | 1. Redistributions of source code must retain the above copyright notice, 91 | this list of conditions and the following disclaimer. 92 | 93 | 2. Redistributions in binary form must reproduce the above copyright 94 | notice, this list of conditions and the following disclaimer in the 95 | documentation and/or other materials provided with the distribution. 96 | 97 | 3. Neither the name of the copyright holder nor the names of its 98 | contributors may be used to endorse or promote products derived from 99 | this software without specific prior written permission. 100 | 101 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 102 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 103 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 104 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 105 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 106 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 107 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 108 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 109 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 110 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 111 | POSSIBILITY OF SUCH DAMAGE. 112 | -------------------------------------------------------------------------------- /continuous_integration/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script is meant to be called by the "install" step defined in 3 | # .travis.yml. See http://docs.travis-ci.com/ for more details. 4 | # The behavior of the script is controlled by environment variables defined 5 | # in the .travis.yml in the top level folder of the project. 6 | 7 | # License: 3-clause BSD 8 | 9 | # This file is originally from the scikit-learn project 10 | 11 | set -e 12 | 13 | # Fix the compilers to work around the Python 3.4 build unexpectedly 14 | # looking up g++44. 15 | export CC=gcc 16 | export CXX=g++ 17 | 18 | sudo apt-get update -qq 19 | 20 | # Deactivate the travis-provided virtual environment and set up a 21 | # conda-based environment instead 22 | deactivate 23 | 24 | # Use the miniconda installer for faster download / install of conda 25 | # itself 26 | wget http://repo.continuum.io/miniconda/Miniconda-3.6.0-Linux-x86_64.sh \ 27 | -O miniconda.sh 28 | chmod +x miniconda.sh && ./miniconda.sh -b 29 | export PATH=/home/travis/miniconda/bin:$PATH 30 | conda update --yes conda 31 | 32 | # Configure the conda environment and put it in the path using the 33 | # provided versions 34 | conda create -n testenv --yes python=$PYTHON_VERSION pip nose \ 35 | numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION 36 | source activate testenv 37 | 38 | python --version 39 | python -c "import numpy; print('numpy %s' % numpy.__version__)" 40 | python -c "import scipy; print('scipy %s' % scipy.__version__)" 41 | 42 | pip install scikit-learn 43 | 44 | python -c "import sklearn; print('sklearn %s' % sklearn.__version__)" 45 | python setup.py build_ext --inplace 46 | -------------------------------------------------------------------------------- /continuous_integration/test_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script is meant to be called by the "script" step defined in 3 | # .travis.yml. See http://docs.travis-ci.com/ for more details. 4 | # The behavior of the script is controlled by environment variables defined 5 | # in the .travis.yml in the top level folder of the project. 6 | 7 | # License: 3-clause BSD 8 | 9 | # This file is originally from the scikit-learn project 10 | 11 | set -e 12 | 13 | python --version 14 | python -c "import numpy; print('numpy %s' % numpy.__version__)" 15 | python -c "import scipy; print('scipy %s' % scipy.__version__)" 16 | python -c "import sklearn; print('sklearn %s' % sklearn.__version__)" 17 | 18 | # Do not use "make test" or "make test-coverage" as they enable verbose mode 19 | # which renders travis output too slow to display in a browser.
20 | if [[ "$COVERAGE" == "true" ]]; then 21 | nosetests -s --with-coverage random_output_trees 22 | else 23 | nosetests -s random_output_trees 24 | fi 25 | 26 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 
66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/random_output_trees.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/random_output_trees.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/random_output_trees" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/random_output_trees" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 
152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 178 | -------------------------------------------------------------------------------- /doc/_templates/class.rst: -------------------------------------------------------------------------------- 1 | {{ fullname }} 2 | {{ underline }} 3 | 4 | .. currentmodule:: {{ module }} 5 | 6 | .. autoclass:: {{ objname }} 7 | 8 | {% block methods %} 9 | .. automethod:: __init__ 10 | {% endblock %} 11 | 12 | 13 | -------------------------------------------------------------------------------- /doc/_templates/class_with_call.rst: -------------------------------------------------------------------------------- 1 | {{ fullname }} 2 | {{ underline }} 3 | 4 | .. currentmodule:: {{ module }} 5 | 6 | .. autoclass:: {{ objname }} 7 | 8 | {% block methods %} 9 | .. automethod:: __init__ 10 | .. automethod:: __call__ 11 | {% endblock %} 12 | 13 | 14 | -------------------------------------------------------------------------------- /doc/_templates/function.rst: -------------------------------------------------------------------------------- 1 | {{ fullname }} 2 | {{ underline }} 3 | 4 | .. currentmodule:: {{ module }} 5 | 6 | .. autofunction:: {{ objname }} 7 | 8 | 9 | -------------------------------------------------------------------------------- /doc/_templates/layout.html: -------------------------------------------------------------------------------- 1 | {# Import the theme's layout. #} 2 | {% extends "!layout.html" %} 3 | 4 | {# remove site and page menus #} 5 | {%- block sidebartoc %} 6 | {% endblock %} 7 | {%- block sidebarrel %} 8 | {% endblock %} 9 | 10 | {%- block navbartoc %} 11 | {% endblock %} 12 | 13 | {# Include our new CSS file into existing ones. #} 14 | {% set css_files = css_files + ['_static/bootstrap.min.css']%} 15 | 16 | {%- block content %} 17 | {{ navBar() }} 18 |
19 | {% block body %}{% endblock %} 20 |
21 | 22 | {%- endblock %} 23 | 24 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # random_output_trees documentation build configuration file, created by 4 | # sphinx-quickstart on Wed Aug 20 10:22:49 2014. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | 18 | # If extensions (or modules to document with autodoc) are in another directory, 19 | # add these directories to sys.path here. If the directory is relative to the 20 | # documentation root, use os.path.abspath to make it absolute, like shown here. 21 | # sys.path.insert(0, os.path.abspath('.')) 22 | sys.path.append(os.path.abspath('sphinxext')) 23 | 24 | import sphinx_bootstrap_theme 25 | 26 | # Try to override the matplotlib configuration as early as possible 27 | try: 28 | import gen_rst 29 | except: 30 | pass 31 | 32 | # -- General configuration ------------------------------------------------ 33 | 34 | # If your documentation needs a minimal Sphinx version, state it here. 35 | #needs_sphinx = '1.0' 36 | 37 | # Add any Sphinx extension module names here, as strings. They can be 38 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 39 | # ones. 40 | extensions = [ 41 | 'gen_rst', 42 | 'sphinx.ext.autodoc', 43 | 'sphinx.ext.autosummary', 44 | 'sphinx.ext.doctest', 45 | 'sphinx.ext.mathjax', 46 | # 'sphinx.ext.viewcode', 47 | 'numpy_ext.numpydoc', 48 | ] 49 | 50 | # Generate autosummary even if no references 51 | autosummary_generate = True 52 | autodoc_default_flags = ['members', 'inherited-members'] 53 | 54 | # Add any paths that contain templates here, relative to this directory. 55 | templates_path = ['_templates'] 56 | 57 | # The suffix of source filenames. 58 | source_suffix = '.rst' 59 | 60 | # The encoding of source files. 61 | #source_encoding = 'utf-8-sig' 62 | 63 | 64 | # Generate the plots for the gallery 65 | plot_gallery = True 66 | 67 | # The master toctree document. 68 | master_doc = 'index' 69 | 70 | # General information about the project. 71 | project = u'random_output_trees' 72 | copyright = u'2014, Arnaud Joly' 73 | 74 | # The version info for the project you're documenting, acts as replacement for 75 | # |version| and |release|, also used in various other places throughout the 76 | # built documents. 77 | # 78 | # The short X.Y version. 79 | version = 'dev' 80 | # The full version, including alpha/beta/rc tags. 81 | release = 'dev' 82 | 83 | # The language for content autogenerated by Sphinx. Refer to documentation 84 | # for a list of supported languages. 85 | #language = None 86 | 87 | # There are two options for replacing |today|: either, you set today to some 88 | # non-false value, then it is used: 89 | #today = '' 90 | # Else, today_fmt is used as the format for a strftime call. 91 | #today_fmt = '%B %d, %Y' 92 | 93 | # List of patterns, relative to source directory, that match files and 94 | # directories to ignore when looking for source files. 95 | exclude_patterns = ['_build'] 96 | 97 | # The reST default role (used for this markup: `text`) to use for all 98 | # documents. 
99 | #default_role = None 100 | 101 | # If true, '()' will be appended to :func: etc. cross-reference text. 102 | #add_function_parentheses = True 103 | 104 | # If true, the current module name will be prepended to all description 105 | # unit titles (such as .. function::). 106 | #add_module_names = True 107 | 108 | # If true, sectionauthor and moduleauthor directives will be shown in the 109 | # output. They are ignored by default. 110 | #show_authors = False 111 | 112 | # The name of the Pygments (syntax highlighting) style to use. 113 | pygments_style = 'sphinx' 114 | 115 | # A list of ignored prefixes for module index sorting. 116 | #modindex_common_prefix = [] 117 | 118 | # If true, keep warnings as "system message" paragraphs in the built documents. 119 | #keep_warnings = False 120 | 121 | 122 | # -- Options for HTML output ---------------------------------------------- 123 | 124 | # The theme to use for HTML and HTML Help pages. See the documentation for 125 | # a list of builtin themes. 126 | html_theme = 'bootstrap' 127 | 128 | # Theme options are theme-specific and customize the look and feel of a theme 129 | # further. For a list of options available for each theme, see the 130 | # documentation. 131 | html_theme_path = sphinx_bootstrap_theme.get_html_theme_path() 132 | 133 | # Add any paths that contain custom themes here, relative to this directory. 134 | #html_theme_path = [] 135 | 136 | # The name for this set of Sphinx documents. If None, it defaults to 137 | # " v documentation". 138 | #html_title = None 139 | 140 | # A shorter title for the navigation bar. Default is the same as html_title. 141 | #html_short_title = None 142 | 143 | # The name of an image file (relative to this directory) to place at the top 144 | # of the sidebar. 145 | #html_logo = None 146 | 147 | # The name of an image file (within the static path) to use as favicon of the 148 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 149 | # pixels large. 150 | #html_favicon = None 151 | 152 | # Add any paths that contain custom static files (such as style sheets) here, 153 | # relative to this directory. They are copied after the builtin static files, 154 | # so a file named "default.css" will overwrite the builtin "default.css". 155 | html_static_path = ['_static'] 156 | 157 | # Add any extra paths that contain custom files (such as robots.txt or 158 | # .htaccess) here, relative to this directory. These files are copied 159 | # directly to the root of the documentation. 160 | #html_extra_path = [] 161 | 162 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 163 | # using the given strftime format. 164 | #html_last_updated_fmt = '%b %d, %Y' 165 | 166 | # If true, SmartyPants will be used to convert quotes and dashes to 167 | # typographically correct entities. 168 | #html_use_smartypants = True 169 | 170 | # Custom sidebar templates, maps document names to template names. 171 | #html_sidebars = {} 172 | 173 | # Additional templates that should be rendered to pages, maps page names to 174 | # template names. 175 | #html_additional_pages = {} 176 | 177 | # If false, no module index is generated. 178 | #html_domain_indices = True 179 | 180 | # If false, no index is generated. 181 | #html_use_index = True 182 | 183 | # If true, the index is split into individual pages for each letter. 184 | #html_split_index = False 185 | 186 | # If true, links to the reST sources are added to the pages. 
187 | #html_show_sourcelink = True 188 | 189 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 190 | html_show_sphinx = False 191 | 192 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 193 | #html_show_copyright = True 194 | 195 | # If true, an OpenSearch description file will be output, and all pages will 196 | # contain a <link> tag referring to it. The value of this option must be the 197 | # base URL from which the finished HTML is served. 198 | #html_use_opensearch = '' 199 | 200 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 201 | #html_file_suffix = None 202 | 203 | # Output file base name for HTML help builder. 204 | htmlhelp_basename = 'random_output_treesdoc' 205 | 206 | # Theme options are theme-specific and customize the look and feel of a 207 | # theme further. 208 | html_theme_options = { 209 | # Navigation bar title. (Default: ``project`` value) 210 | 'navbar_title': "Randomized output forest", 211 | 212 | # Tab name for entire site. (Default: "Site") 213 | # 'navbar_site_name': "Site", 214 | 215 | # A list of tuples containing pages or urls to link to. 216 | # Valid tuples should be in the following forms: 217 | # (name, page) # a link to a page 218 | # (name, "/aa/bb", 1) # a link to an arbitrary relative url 219 | # (name, "http://example.com", True) # arbitrary absolute url 220 | # Note the "1" or "True" value above as the third argument to indicate 221 | # an arbitrary url. 222 | 'navbar_links': [ 223 | ("References", "references"), 224 | ("Examples", "auto_examples/index"), 225 | ], 226 | 227 | # Render the next and previous page links in navbar. (Default: true) 228 | 'navbar_sidebarrel': False, 229 | 230 | # Render the current pages TOC in the navbar. (Default: true) 231 | 'navbar_pagenav': False, 232 | 233 | # Global TOC depth for "site" navbar tab. (Default: 1) 234 | # Switching to -1 shows all levels. 235 | 'globaltoc_depth': 0, 236 | 237 | # Include hidden TOCs in Site navbar? 238 | # 239 | # Note: If this is "false", you cannot have mixed ``:hidden:`` and 240 | # non-hidden ``toctree`` directives in the same page, or else the build 241 | # will break. 242 | # 243 | # Values: "true" (default) or "false" 244 | 'globaltoc_includehidden': "false", 245 | 246 | # HTML navbar class (Default: "navbar") to attach to <div>
element. 247 | # For black navbar, do "navbar navbar-inverse" 248 | 'navbar_class': "navbar", 249 | 250 | # Fix navigation bar to top of page? 251 | # Values: "true" (default) or "false" 252 | 'navbar_fixed_top': "true", 253 | 254 | # Location of link to source. 255 | # Options are "nav" (default), "footer" or anything else to exclude. 256 | 'source_link_position': "None", 257 | 258 | # Bootswatch (http://bootswatch.com/) theme. 259 | # 260 | # Options are nothing with "" (default) or the name of a valid theme 261 | # such as "amelia" or "cosmo". 262 | 'bootswatch_theme': "lumen", 263 | 264 | # Choose Bootstrap version. 265 | # Values: "3" (default) or "2" (in quotes) 266 | 'bootstrap_version': "3", 267 | } 268 | 269 | 270 | # -- Options for LaTeX output --------------------------------------------- 271 | 272 | latex_elements = { 273 | # The paper size ('letterpaper' or 'a4paper'). 274 | #'papersize': 'letterpaper', 275 | 276 | # The font size ('10pt', '11pt' or '12pt'). 277 | #'pointsize': '10pt', 278 | 279 | # Additional stuff for the LaTeX preamble. 280 | #'preamble': '', 281 | } 282 | 283 | # Grouping the document tree into LaTeX files. List of tuples 284 | # (source start file, target name, title, 285 | # author, documentclass [howto, manual, or own class]). 286 | latex_documents = [ 287 | ('index', 'random_output_trees.tex', u'randomized\\_output\\_forest Documentation', 288 | u'Arnaud Joly', 'manual'), 289 | ] 290 | 291 | # The name of an image file (relative to this directory) to place at the top of 292 | # the title page. 293 | #latex_logo = None 294 | 295 | # For "manual" documents, if this is true, then toplevel headings are parts, 296 | # not chapters. 297 | #latex_use_parts = False 298 | 299 | # If true, show page references after internal links. 300 | #latex_show_pagerefs = False 301 | 302 | # If true, show URL addresses after external links. 303 | #latex_show_urls = False 304 | 305 | # Documents to append as an appendix to all manuals. 306 | #latex_appendices = [] 307 | 308 | # If false, no module index is generated. 309 | #latex_domain_indices = True 310 | 311 | 312 | # -- Options for manual page output --------------------------------------- 313 | 314 | # One entry per manual page. List of tuples 315 | # (source start file, name, description, authors, manual section). 316 | man_pages = [ 317 | ('index', 'random_output_trees', u'random_output_trees Documentation', 318 | [u'Arnaud Joly'], 1) 319 | ] 320 | 321 | # If true, show URL addresses after external links. 322 | #man_show_urls = False 323 | 324 | 325 | # -- Options for Texinfo output ------------------------------------------- 326 | 327 | # Grouping the document tree into Texinfo files. List of tuples 328 | # (source start file, target name, title, author, 329 | # dir menu entry, description, category) 330 | texinfo_documents = [ 331 | ('index', 'random_output_trees', u'random_output_trees Documentation', 332 | u'Arnaud Joly', 'random_output_trees', 'One line description of project.', 333 | 'Miscellaneous'), 334 | ] 335 | 336 | # Documents to append as an appendix to all manuals. 337 | #texinfo_appendices = [] 338 | 339 | # If false, no module index is generated. 340 | #texinfo_domain_indices = True 341 | 342 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 343 | #texinfo_show_urls = 'footnote' 344 | 345 | # If true, do not generate a @detailmenu in the "Top" node's menu. 
346 | #texinfo_no_detailmenu = False 347 | -------------------------------------------------------------------------------- /doc/images/no_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjoly/random-output-trees/4251a3ab99cf7b893b7dcb47b62be94ed74c1ab9/doc/images/no_image.png -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../README.rst 2 | 3 | .. toctree:: 4 | :hidden: 5 | 6 | auto_examples/index.rst 7 | references.rst 8 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | goto end 41 | ) 42 | 43 | if "%1" == "clean" ( 44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 45 | del /q /s %BUILDDIR%\* 46 | goto end 47 | ) 48 | 49 | 50 | %SPHINXBUILD% 2> nul 51 | if errorlevel 9009 ( 52 | echo. 53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 54 | echo.installed, then set the SPHINXBUILD environment variable to point 55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 56 | echo.may add the Sphinx directory to PATH. 57 | echo. 58 | echo.If you don't have Sphinx installed, grab it from 59 | echo.http://sphinx-doc.org/ 60 | exit /b 1 61 | ) 62 | 63 | if "%1" == "html" ( 64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 68 | goto end 69 | ) 70 | 71 | if "%1" == "dirhtml" ( 72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 
76 | goto end 77 | ) 78 | 79 | if "%1" == "singlehtml" ( 80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 84 | goto end 85 | ) 86 | 87 | if "%1" == "pickle" ( 88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can process the pickle files. 92 | goto end 93 | ) 94 | 95 | if "%1" == "json" ( 96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 97 | if errorlevel 1 exit /b 1 98 | echo. 99 | echo.Build finished; now you can process the JSON files. 100 | goto end 101 | ) 102 | 103 | if "%1" == "htmlhelp" ( 104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 105 | if errorlevel 1 exit /b 1 106 | echo. 107 | echo.Build finished; now you can run HTML Help Workshop with the ^ 108 | .hhp project file in %BUILDDIR%/htmlhelp. 109 | goto end 110 | ) 111 | 112 | if "%1" == "qthelp" ( 113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 114 | if errorlevel 1 exit /b 1 115 | echo. 116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 117 | .qhcp project file in %BUILDDIR%/qthelp, like this: 118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\random_output_trees.qhcp 119 | echo.To view the help file: 120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\random_output_trees.ghc 121 | goto end 122 | ) 123 | 124 | if "%1" == "devhelp" ( 125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished. 129 | goto end 130 | ) 131 | 132 | if "%1" == "epub" ( 133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 137 | goto end 138 | ) 139 | 140 | if "%1" == "latex" ( 141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 145 | goto end 146 | ) 147 | 148 | if "%1" == "latexpdf" ( 149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 150 | cd %BUILDDIR%/latex 151 | make all-pdf 152 | cd %BUILDDIR%/.. 153 | echo. 154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 155 | goto end 156 | ) 157 | 158 | if "%1" == "latexpdfja" ( 159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 160 | cd %BUILDDIR%/latex 161 | make all-pdf-ja 162 | cd %BUILDDIR%/.. 163 | echo. 164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 165 | goto end 166 | ) 167 | 168 | if "%1" == "text" ( 169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 170 | if errorlevel 1 exit /b 1 171 | echo. 172 | echo.Build finished. The text files are in %BUILDDIR%/text. 173 | goto end 174 | ) 175 | 176 | if "%1" == "man" ( 177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 178 | if errorlevel 1 exit /b 1 179 | echo. 180 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 181 | goto end 182 | ) 183 | 184 | if "%1" == "texinfo" ( 185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 186 | if errorlevel 1 exit /b 1 187 | echo. 188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 189 | goto end 190 | ) 191 | 192 | if "%1" == "gettext" ( 193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 194 | if errorlevel 1 exit /b 1 195 | echo. 196 | echo.Build finished. 
The message catalogs are in %BUILDDIR%/locale. 197 | goto end 198 | ) 199 | 200 | if "%1" == "changes" ( 201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 202 | if errorlevel 1 exit /b 1 203 | echo. 204 | echo.The overview file is in %BUILDDIR%/changes. 205 | goto end 206 | ) 207 | 208 | if "%1" == "linkcheck" ( 209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 210 | if errorlevel 1 exit /b 1 211 | echo. 212 | echo.Link check complete; look for any errors in the above output ^ 213 | or in %BUILDDIR%/linkcheck/output.txt. 214 | goto end 215 | ) 216 | 217 | if "%1" == "doctest" ( 218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 219 | if errorlevel 1 exit /b 1 220 | echo. 221 | echo.Testing of doctests in the sources finished, look at the ^ 222 | results in %BUILDDIR%/doctest/output.txt. 223 | goto end 224 | ) 225 | 226 | if "%1" == "xml" ( 227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 228 | if errorlevel 1 exit /b 1 229 | echo. 230 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 231 | goto end 232 | ) 233 | 234 | if "%1" == "pseudoxml" ( 235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 236 | if errorlevel 1 exit /b 1 237 | echo. 238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 239 | goto end 240 | ) 241 | 242 | :end 243 | -------------------------------------------------------------------------------- /doc/references.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | References 3 | ========== 4 | 5 | This is the class and function reference of the package. 6 | 7 | 8 | :mod:`random_output_trees.ensemble`: Ensemble 9 | --------------------------------------------- 10 | 11 | .. automodule:: random_output_trees.ensemble 12 | :no-members: 13 | :no-inherited-members: 14 | 15 | .. currentmodule:: random_output_trees 16 | 17 | .. autosummary:: 18 | :toctree: generated/ 19 | :template: class.rst 20 | 21 | ensemble.ExtraTreesClassifier 22 | ensemble.ExtraTreesRegressor 23 | ensemble.LazyBaggingClassifier 24 | ensemble.LazyBaggingRegressor 25 | ensemble.RandomForestClassifier 26 | ensemble.RandomForestRegressor 27 | 28 | :mod:`random_output_trees.datasets`: Datasets 29 | --------------------------------------------- 30 | 31 | .. automodule:: random_output_trees.datasets 32 | :no-members: 33 | :no-inherited-members: 34 | 35 | .. currentmodule:: random_output_trees 36 | 37 | .. autosummary:: 38 | :toctree: generated/ 39 | :template: function.rst 40 | 41 | datasets.fetch_drug_interaction 42 | datasets.fetch_protein_interaction 43 | 44 | 45 | :mod:`random_output_trees.random_projection`: Random projection 46 | --------------------------------------------------------------- 47 | 48 | .. automodule:: random_output_trees.random_projection 49 | :no-members: 50 | :no-inherited-members: 51 | 52 | .. currentmodule:: random_output_trees 53 | 54 | .. autosummary:: 55 | :toctree: generated/ 56 | :template: class.rst 57 | 58 | random_projection.RademacherRandomProjection 59 | random_projection.AchlioptasRandomProjection 60 | random_projection.SampledHadamardProjection 61 | random_projection.SampledIdentityProjection 62 | 63 | 64 | :mod:`random_output_trees.transformer`: Transformer 65 | --------------------------------------------------- 66 | 67 | .. automodule:: random_output_trees.transformer 68 | :no-members: 69 | :no-inherited-members: 70 | 71 | .. currentmodule:: random_output_trees 72 | 73 | .. 
autosummary:: 74 | :toctree: generated/ 75 | :template: class.rst 76 | 77 | transformer.FixedStateTransformer 78 | 79 | 80 | :mod:`random_output_trees.tree`: Tree 81 | ------------------------------------- 82 | 83 | 84 | .. automodule:: random_output_trees.tree 85 | :no-members: 86 | :no-inherited-members: 87 | 88 | .. currentmodule:: random_output_trees 89 | 90 | .. autosummary:: 91 | :toctree: generated/ 92 | :template: class.rst 93 | 94 | tree.DecisionTreeClassifier 95 | tree.DecisionTreeRegressor 96 | -------------------------------------------------------------------------------- /doc/sphinxext/LICENSE.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------- 2 | The files 3 | - numpydoc.py 4 | - autosummary.py 5 | - autosummary_generate.py 6 | - docscrape.py 7 | - docscrape_sphinx.py 8 | - phantom_import.py 9 | have the following license: 10 | 11 | Copyright (C) 2008 Stefan van der Walt , Pauli Virtanen 12 | 13 | Redistribution and use in source and binary forms, with or without 14 | modification, are permitted provided that the following conditions are 15 | met: 16 | 17 | 1. Redistributions of source code must retain the above copyright 18 | notice, this list of conditions and the following disclaimer. 19 | 2. Redistributions in binary form must reproduce the above copyright 20 | notice, this list of conditions and the following disclaimer in 21 | the documentation and/or other materials provided with the 22 | distribution. 23 | 24 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 25 | IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 26 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 27 | DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, 28 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 29 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 30 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 32 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 33 | IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 34 | POSSIBILITY OF SUCH DAMAGE. 35 | 36 | ------------------------------------------------------------------------------- 37 | The files 38 | - compiler_unparse.py 39 | - comment_eater.py 40 | - traitsdoc.py 41 | have the following license: 42 | 43 | This software is OSI Certified Open Source Software. 44 | OSI Certified is a certification mark of the Open Source Initiative. 45 | 46 | Copyright (c) 2006, Enthought, Inc. 47 | All rights reserved. 48 | 49 | Redistribution and use in source and binary forms, with or without 50 | modification, are permitted provided that the following conditions are met: 51 | 52 | * Redistributions of source code must retain the above copyright notice, this 53 | list of conditions and the following disclaimer. 54 | * Redistributions in binary form must reproduce the above copyright notice, 55 | this list of conditions and the following disclaimer in the documentation 56 | and/or other materials provided with the distribution. 57 | * Neither the name of Enthought, Inc. nor the names of its contributors may 58 | be used to endorse or promote products derived from this software without 59 | specific prior written permission. 
60 | 61 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 62 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 63 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 64 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 65 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 66 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 67 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 68 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 69 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 70 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 71 | 72 | 73 | ------------------------------------------------------------------------------- 74 | The files 75 | - only_directives.py 76 | - plot_directive.py 77 | originate from Matplotlib (http://matplotlib.sf.net/) which has 78 | the following license: 79 | 80 | Copyright (c) 2002-2008 John D. Hunter; All Rights Reserved. 81 | 82 | 1. This LICENSE AGREEMENT is between John D. Hunter (“JDH”), and the Individual or Organization (“Licensee”) accessing and otherwise using matplotlib software in source or binary form and its associated documentation. 83 | 84 | 2. Subject to the terms and conditions of this License Agreement, JDH hereby grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, analyze, test, perform and/or display publicly, prepare derivative works, distribute, and otherwise use matplotlib 0.98.3 alone or in any derivative version, provided, however, that JDH’s License Agreement and JDH’s notice of copyright, i.e., “Copyright (c) 2002-2008 John D. Hunter; All Rights Reserved” are retained in matplotlib 0.98.3 alone or in any derivative version prepared by Licensee. 85 | 86 | 3. In the event Licensee prepares a derivative work that is based on or incorporates matplotlib 0.98.3 or any part thereof, and wants to make the derivative work available to others as provided herein, then Licensee hereby agrees to include in any such work a brief summary of the changes made to matplotlib 0.98.3. 87 | 88 | 4. JDH is making matplotlib 0.98.3 available to Licensee on an “AS IS” basis. JDH MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, JDH MAKES NO AND DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF MATPLOTLIB 0.98.3 WILL NOT INFRINGE ANY THIRD PARTY RIGHTS. 89 | 90 | 5. JDH SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF MATPLOTLIB 0.98.3 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING MATPLOTLIB 0.98.3, OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. 91 | 92 | 6. This License Agreement will automatically terminate upon a material breach of its terms and conditions. 93 | 94 | 7. Nothing in this License Agreement shall be deemed to create any relationship of agency, partnership, or joint venture between JDH and Licensee. This License Agreement does not grant permission to use JDH trademarks or trade name in a trademark sense to endorse or promote products or services of Licensee, or any third party. 95 | 96 | 8. By copying, installing or otherwise using matplotlib 0.98.3, Licensee agrees to be bound by the terms and conditions of this License Agreement. 
97 | 98 | -------------------------------------------------------------------------------- /doc/sphinxext/MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include tests *.py 2 | include *.txt 3 | -------------------------------------------------------------------------------- /doc/sphinxext/README.txt: -------------------------------------------------------------------------------- 1 | ===================================== 2 | numpydoc -- Numpy's Sphinx extensions 3 | ===================================== 4 | 5 | Numpy's documentation uses several custom extensions to Sphinx. These 6 | are shipped in this ``numpydoc`` package, in case you want to make use 7 | of them in third-party projects. 8 | 9 | The following extensions are available: 10 | 11 | - ``numpydoc``: support for the Numpy docstring format in Sphinx, and add 12 | the code description directives ``np-function``, ``np-cfunction``, etc. 13 | that support the Numpy docstring syntax. 14 | 15 | - ``numpydoc.traitsdoc``: For gathering documentation about Traits attributes. 16 | 17 | - ``numpydoc.plot_directives``: Adaptation of Matplotlib's ``plot::`` 18 | directive. Note that this implementation may still undergo severe 19 | changes or eventually be deprecated. 20 | 21 | - ``numpydoc.only_directives``: (DEPRECATED) 22 | 23 | - ``numpydoc.autosummary``: (DEPRECATED) An ``autosummary::`` directive. 24 | Available in Sphinx 0.6.2 and (to-be) 1.0 as ``sphinx.ext.autosummary``, 25 | and it the Sphinx 1.0 version is recommended over that included in 26 | Numpydoc. 27 | 28 | 29 | numpydoc 30 | ======== 31 | 32 | Numpydoc inserts a hook into Sphinx's autodoc that converts docstrings 33 | following the Numpy/Scipy format to a form palatable to Sphinx. 34 | 35 | Options 36 | ------- 37 | 38 | The following options can be set in conf.py: 39 | 40 | - numpydoc_use_plots: bool 41 | 42 | Whether to produce ``plot::`` directives for Examples sections that 43 | contain ``import matplotlib``. 44 | 45 | - numpydoc_show_class_members: bool 46 | 47 | Whether to show all members of a class in the Methods and Attributes 48 | sections automatically. 49 | 50 | - numpydoc_edit_link: bool (DEPRECATED -- edit your HTML template instead) 51 | 52 | Whether to insert an edit link after docstrings. 53 | -------------------------------------------------------------------------------- /doc/sphinxext/install_sphinx_bootstrap_theme.sh: -------------------------------------------------------------------------------- 1 | # remove prior version if any 2 | rm -rf sphinx_bootstrap_theme 3 | 4 | # Download and untar 5 | wget https://pypi.python.org/packages/source/s/sphinx-bootstrap-theme/sphinx-bootstrap-theme-0.4.0.tar.gz 6 | tar -zxf sphinx-bootstrap-theme-0.4.0.tar.gz 7 | rm sphinx-bootstrap-theme-0.4.0.tar.gz 8 | 9 | # Move everything to sphinx_bootstrap_theme 10 | mv sphinx-bootstrap-theme-0.4.0/sphinx_bootstrap_theme . 11 | mv sphinx-bootstrap-theme-0.4.0/*.txt sphinx_bootstrap_theme 12 | mv sphinx-bootstrap-theme-0.4.0/*.in sphinx_bootstrap_theme 13 | 14 | # Clean theme that we don't want 15 | # rm -rf -ignore myfile.txt * 16 | rm -rf sphinx_bootstrap_theme/bootstrap/static/bootstrap-2.* 17 | rm -rf sphinx_bootstrap_theme/bootstrap/static/bootswatch-2.* 18 | 19 | # remove all bootstwatch theme except one 20 | mv sphinx_bootstrap_theme/bootstrap/static/bootswatch-3.1.0/lumen . 
21 | rm -rf sphinx_bootstrap_theme/bootstrap/static/bootswatch-3.1.0/ 22 | mkdir sphinx_bootstrap_theme/bootstrap/static/bootswatch-3.1.0 23 | mv lumen sphinx_bootstrap_theme/bootstrap/static/bootswatch-3.1.0/ 24 | 25 | 26 | 27 | # Clean remaining files 28 | rm -rf sphinx-bootstrap-theme-0.4.0 29 | -------------------------------------------------------------------------------- /doc/sphinxext/numpy_ext/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjoly/random-output-trees/4251a3ab99cf7b893b7dcb47b62be94ed74c1ab9/doc/sphinxext/numpy_ext/__init__.py -------------------------------------------------------------------------------- /doc/sphinxext/numpy_ext/docscrape.py: -------------------------------------------------------------------------------- 1 | """Extract reference documentation from the NumPy source tree. 2 | 3 | """ 4 | 5 | import inspect 6 | import textwrap 7 | import re 8 | import pydoc 9 | from warnings import warn 10 | # Try Python 2 first, otherwise load from Python 3 11 | try: 12 | from StringIO import StringIO 13 | except: 14 | from io import StringIO 15 | 16 | 17 | class Reader(object): 18 | """A line-based string reader. 19 | 20 | """ 21 | def __init__(self, data): 22 | """ 23 | Parameters 24 | ---------- 25 | data : str 26 | String with lines separated by '\n'. 27 | 28 | """ 29 | if isinstance(data, list): 30 | self._str = data 31 | else: 32 | self._str = data.split('\n') # store string as list of lines 33 | 34 | self.reset() 35 | 36 | def __getitem__(self, n): 37 | return self._str[n] 38 | 39 | def reset(self): 40 | self._l = 0 # current line nr 41 | 42 | def read(self): 43 | if not self.eof(): 44 | out = self[self._l] 45 | self._l += 1 46 | return out 47 | else: 48 | return '' 49 | 50 | def seek_next_non_empty_line(self): 51 | for l in self[self._l:]: 52 | if l.strip(): 53 | break 54 | else: 55 | self._l += 1 56 | 57 | def eof(self): 58 | return self._l >= len(self._str) 59 | 60 | def read_to_condition(self, condition_func): 61 | start = self._l 62 | for line in self[start:]: 63 | if condition_func(line): 64 | return self[start:self._l] 65 | self._l += 1 66 | if self.eof(): 67 | return self[start:self._l + 1] 68 | return [] 69 | 70 | def read_to_next_empty_line(self): 71 | self.seek_next_non_empty_line() 72 | 73 | def is_empty(line): 74 | return not line.strip() 75 | return self.read_to_condition(is_empty) 76 | 77 | def read_to_next_unindented_line(self): 78 | def is_unindented(line): 79 | return (line.strip() and (len(line.lstrip()) == len(line))) 80 | return self.read_to_condition(is_unindented) 81 | 82 | def peek(self, n=0): 83 | if self._l + n < len(self._str): 84 | return self[self._l + n] 85 | else: 86 | return '' 87 | 88 | def is_empty(self): 89 | return not ''.join(self._str).strip() 90 | 91 | 92 | class NumpyDocString(object): 93 | def __init__(self, docstring, config={}): 94 | docstring = textwrap.dedent(docstring).split('\n') 95 | 96 | self._doc = Reader(docstring) 97 | self._parsed_data = { 98 | 'Signature': '', 99 | 'Summary': [''], 100 | 'Extended Summary': [], 101 | 'Parameters': [], 102 | 'Returns': [], 103 | 'Raises': [], 104 | 'Warns': [], 105 | 'Other Parameters': [], 106 | 'Attributes': [], 107 | 'Methods': [], 108 | 'See Also': [], 109 | 'Notes': [], 110 | 'Warnings': [], 111 | 'References': '', 112 | 'Examples': '', 113 | 'index': {} 114 | } 115 | 116 | self._parse() 117 | 118 | def __getitem__(self, key): 119 | return self._parsed_data[key] 120 | 121 | def 
__setitem__(self, key, val): 122 | if key not in self._parsed_data: 123 | warn("Unknown section %s" % key) 124 | else: 125 | self._parsed_data[key] = val 126 | 127 | def _is_at_section(self): 128 | self._doc.seek_next_non_empty_line() 129 | 130 | if self._doc.eof(): 131 | return False 132 | 133 | l1 = self._doc.peek().strip() # e.g. Parameters 134 | 135 | if l1.startswith('.. index::'): 136 | return True 137 | 138 | l2 = self._doc.peek(1).strip() # ---------- or ========== 139 | return l2.startswith('-' * len(l1)) or l2.startswith('=' * len(l1)) 140 | 141 | def _strip(self, doc): 142 | i = 0 143 | j = 0 144 | for i, line in enumerate(doc): 145 | if line.strip(): 146 | break 147 | 148 | for j, line in enumerate(doc[::-1]): 149 | if line.strip(): 150 | break 151 | 152 | return doc[i:len(doc) - j] 153 | 154 | def _read_to_next_section(self): 155 | section = self._doc.read_to_next_empty_line() 156 | 157 | while not self._is_at_section() and not self._doc.eof(): 158 | if not self._doc.peek(-1).strip(): # previous line was empty 159 | section += [''] 160 | 161 | section += self._doc.read_to_next_empty_line() 162 | 163 | return section 164 | 165 | def _read_sections(self): 166 | while not self._doc.eof(): 167 | data = self._read_to_next_section() 168 | name = data[0].strip() 169 | 170 | if name.startswith('..'): # index section 171 | yield name, data[1:] 172 | elif len(data) < 2: 173 | yield StopIteration 174 | else: 175 | yield name, self._strip(data[2:]) 176 | 177 | def _parse_param_list(self, content): 178 | r = Reader(content) 179 | params = [] 180 | while not r.eof(): 181 | header = r.read().strip() 182 | if ' : ' in header: 183 | arg_name, arg_type = header.split(' : ')[:2] 184 | else: 185 | arg_name, arg_type = header, '' 186 | 187 | desc = r.read_to_next_unindented_line() 188 | desc = dedent_lines(desc) 189 | 190 | params.append((arg_name, arg_type, desc)) 191 | 192 | return params 193 | 194 | _name_rgx = re.compile(r"^\s*(:(?P\w+):`(?P[a-zA-Z0-9_.-]+)`|" 195 | r" (?P[a-zA-Z0-9_.-]+))\s*", re.X) 196 | 197 | def _parse_see_also(self, content): 198 | """ 199 | func_name : Descriptive text 200 | continued text 201 | another_func_name : Descriptive text 202 | func_name1, func_name2, :meth:`func_name`, func_name3 203 | 204 | """ 205 | items = [] 206 | 207 | def parse_item_name(text): 208 | """Match ':role:`name`' or 'name'""" 209 | m = self._name_rgx.match(text) 210 | if m: 211 | g = m.groups() 212 | if g[1] is None: 213 | return g[3], None 214 | else: 215 | return g[2], g[1] 216 | raise ValueError("%s is not a item name" % text) 217 | 218 | def push_item(name, rest): 219 | if not name: 220 | return 221 | name, role = parse_item_name(name) 222 | items.append((name, list(rest), role)) 223 | del rest[:] 224 | 225 | current_func = None 226 | rest = [] 227 | 228 | for line in content: 229 | if not line.strip(): 230 | continue 231 | 232 | m = self._name_rgx.match(line) 233 | if m and line[m.end():].strip().startswith(':'): 234 | push_item(current_func, rest) 235 | current_func, line = line[:m.end()], line[m.end():] 236 | rest = [line.split(':', 1)[1].strip()] 237 | if not rest[0]: 238 | rest = [] 239 | elif not line.startswith(' '): 240 | push_item(current_func, rest) 241 | current_func = None 242 | if ',' in line: 243 | for func in line.split(','): 244 | push_item(func, []) 245 | elif line.strip(): 246 | current_func = line 247 | elif current_func is not None: 248 | rest.append(line.strip()) 249 | push_item(current_func, rest) 250 | return items 251 | 252 | def _parse_index(self, section, 
content): 253 | """ 254 | .. index: default 255 | :refguide: something, else, and more 256 | 257 | """ 258 | def strip_each_in(lst): 259 | return [s.strip() for s in lst] 260 | 261 | out = {} 262 | section = section.split('::') 263 | if len(section) > 1: 264 | out['default'] = strip_each_in(section[1].split(','))[0] 265 | for line in content: 266 | line = line.split(':') 267 | if len(line) > 2: 268 | out[line[1]] = strip_each_in(line[2].split(',')) 269 | return out 270 | 271 | def _parse_summary(self): 272 | """Grab signature (if given) and summary""" 273 | if self._is_at_section(): 274 | return 275 | 276 | summary = self._doc.read_to_next_empty_line() 277 | summary_str = " ".join([s.strip() for s in summary]).strip() 278 | if re.compile('^([\w., ]+=)?\s*[\w\.]+\(.*\)$').match(summary_str): 279 | self['Signature'] = summary_str 280 | if not self._is_at_section(): 281 | self['Summary'] = self._doc.read_to_next_empty_line() 282 | else: 283 | self['Summary'] = summary 284 | 285 | if not self._is_at_section(): 286 | self['Extended Summary'] = self._read_to_next_section() 287 | 288 | def _parse(self): 289 | self._doc.reset() 290 | self._parse_summary() 291 | 292 | for (section, content) in self._read_sections(): 293 | if not section.startswith('..'): 294 | section = ' '.join([s.capitalize() 295 | for s in section.split(' ')]) 296 | if section in ('Parameters', 'Attributes', 'Methods', 297 | 'Returns', 'Raises', 'Warns'): 298 | self[section] = self._parse_param_list(content) 299 | elif section.startswith('.. index::'): 300 | self['index'] = self._parse_index(section, content) 301 | elif section == 'See Also': 302 | self['See Also'] = self._parse_see_also(content) 303 | else: 304 | self[section] = content 305 | 306 | # string conversion routines 307 | 308 | def _str_header(self, name, symbol='-'): 309 | return [name, len(name) * symbol] 310 | 311 | def _str_indent(self, doc, indent=4): 312 | out = [] 313 | for line in doc: 314 | out += [' ' * indent + line] 315 | return out 316 | 317 | def _str_signature(self): 318 | if self['Signature']: 319 | return [self['Signature'].replace('*', '\*')] + [''] 320 | else: 321 | return [''] 322 | 323 | def _str_summary(self): 324 | if self['Summary']: 325 | return self['Summary'] + [''] 326 | else: 327 | return [] 328 | 329 | def _str_extended_summary(self): 330 | if self['Extended Summary']: 331 | return self['Extended Summary'] + [''] 332 | else: 333 | return [] 334 | 335 | def _str_param_list(self, name): 336 | out = [] 337 | if self[name]: 338 | out += self._str_header(name) 339 | for param, param_type, desc in self[name]: 340 | out += ['%s : %s' % (param, param_type)] 341 | out += self._str_indent(desc) 342 | out += [''] 343 | return out 344 | 345 | def _str_section(self, name): 346 | out = [] 347 | if self[name]: 348 | out += self._str_header(name) 349 | out += self[name] 350 | out += [''] 351 | return out 352 | 353 | def _str_see_also(self, func_role): 354 | if not self['See Also']: 355 | return [] 356 | out = [] 357 | out += self._str_header("See Also") 358 | last_had_desc = True 359 | for func, desc, role in self['See Also']: 360 | if role: 361 | link = ':%s:`%s`' % (role, func) 362 | elif func_role: 363 | link = ':%s:`%s`' % (func_role, func) 364 | else: 365 | link = "`%s`_" % func 366 | if desc or last_had_desc: 367 | out += [''] 368 | out += [link] 369 | else: 370 | out[-1] += ", %s" % link 371 | if desc: 372 | out += self._str_indent([' '.join(desc)]) 373 | last_had_desc = True 374 | else: 375 | last_had_desc = False 376 | out += [''] 377 | return 
out 378 | 379 | def _str_index(self): 380 | idx = self['index'] 381 | out = [] 382 | out += ['.. index:: %s' % idx.get('default', '')] 383 | for section, references in idx.iteritems(): 384 | if section == 'default': 385 | continue 386 | out += [' :%s: %s' % (section, ', '.join(references))] 387 | return out 388 | 389 | def __str__(self, func_role=''): 390 | out = [] 391 | out += self._str_signature() 392 | out += self._str_summary() 393 | out += self._str_extended_summary() 394 | for param_list in ('Parameters', 'Returns', 'Raises'): 395 | out += self._str_param_list(param_list) 396 | out += self._str_section('Warnings') 397 | out += self._str_see_also(func_role) 398 | for s in ('Notes', 'References', 'Examples'): 399 | out += self._str_section(s) 400 | for param_list in ('Attributes', 'Methods'): 401 | out += self._str_param_list(param_list) 402 | out += self._str_index() 403 | return '\n'.join(out) 404 | 405 | 406 | def indent(str, indent=4): 407 | indent_str = ' ' * indent 408 | if str is None: 409 | return indent_str 410 | lines = str.split('\n') 411 | return '\n'.join(indent_str + l for l in lines) 412 | 413 | 414 | def dedent_lines(lines): 415 | """Deindent a list of lines maximally""" 416 | return textwrap.dedent("\n".join(lines)).split("\n") 417 | 418 | 419 | def header(text, style='-'): 420 | return text + '\n' + style * len(text) + '\n' 421 | 422 | 423 | class FunctionDoc(NumpyDocString): 424 | def __init__(self, func, role='func', doc=None, config={}): 425 | self._f = func 426 | self._role = role # e.g. "func" or "meth" 427 | 428 | if doc is None: 429 | if func is None: 430 | raise ValueError("No function or docstring given") 431 | doc = inspect.getdoc(func) or '' 432 | NumpyDocString.__init__(self, doc) 433 | 434 | if not self['Signature'] and func is not None: 435 | func, func_name = self.get_func() 436 | try: 437 | # try to read signature 438 | argspec = inspect.getargspec(func) 439 | argspec = inspect.formatargspec(*argspec) 440 | argspec = argspec.replace('*', '\*') 441 | signature = '%s%s' % (func_name, argspec) 442 | except TypeError as e: 443 | signature = '%s()' % func_name 444 | self['Signature'] = signature 445 | 446 | def get_func(self): 447 | func_name = getattr(self._f, '__name__', self.__class__.__name__) 448 | if inspect.isclass(self._f): 449 | func = getattr(self._f, '__call__', self._f.__init__) 450 | else: 451 | func = self._f 452 | return func, func_name 453 | 454 | def __str__(self): 455 | out = '' 456 | 457 | func, func_name = self.get_func() 458 | signature = self['Signature'].replace('*', '\*') 459 | 460 | roles = {'func': 'function', 461 | 'meth': 'method'} 462 | 463 | if self._role: 464 | if not roles.has_key(self._role): 465 | print("Warning: invalid role %s" % self._role) 466 | out += '.. %s:: %s\n \n\n' % (roles.get(self._role, ''), 467 | func_name) 468 | 469 | out += super(FunctionDoc, self).__str__(func_role=self._role) 470 | return out 471 | 472 | 473 | class ClassDoc(NumpyDocString): 474 | def __init__(self, cls, doc=None, modulename='', func_doc=FunctionDoc, 475 | config=None): 476 | if not inspect.isclass(cls) and cls is not None: 477 | raise ValueError("Expected a class or None, but got %r" % cls) 478 | self._cls = cls 479 | 480 | if modulename and not modulename.endswith('.'): 481 | modulename += '.' 
482 | self._mod = modulename 483 | 484 | if doc is None: 485 | if cls is None: 486 | raise ValueError("No class or documentation string given") 487 | doc = pydoc.getdoc(cls) 488 | 489 | NumpyDocString.__init__(self, doc) 490 | 491 | if config is not None and config.get('show_class_members', True): 492 | if not self['Methods']: 493 | self['Methods'] = [(name, '', '') 494 | for name in sorted(self.methods)] 495 | if not self['Attributes']: 496 | self['Attributes'] = [(name, '', '') 497 | for name in sorted(self.properties)] 498 | 499 | @property 500 | def methods(self): 501 | if self._cls is None: 502 | return [] 503 | return [name for name, func in inspect.getmembers(self._cls) 504 | if not name.startswith('_') and callable(func)] 505 | 506 | @property 507 | def properties(self): 508 | if self._cls is None: 509 | return [] 510 | return [name for name, func in inspect.getmembers(self._cls) 511 | if not name.startswith('_') and func is None] 512 | -------------------------------------------------------------------------------- /doc/sphinxext/numpy_ext/docscrape_sphinx.py: -------------------------------------------------------------------------------- 1 | import re 2 | import inspect 3 | import textwrap 4 | import pydoc 5 | from .docscrape import NumpyDocString 6 | from .docscrape import FunctionDoc 7 | from .docscrape import ClassDoc 8 | 9 | 10 | class SphinxDocString(NumpyDocString): 11 | def __init__(self, docstring, config=None): 12 | config = {} if config is None else config 13 | self.use_plots = config.get('use_plots', False) 14 | NumpyDocString.__init__(self, docstring, config=config) 15 | 16 | # string conversion routines 17 | def _str_header(self, name, symbol='`'): 18 | return ['.. rubric:: ' + name, ''] 19 | 20 | def _str_field_list(self, name): 21 | return [':' + name + ':'] 22 | 23 | def _str_indent(self, doc, indent=4): 24 | out = [] 25 | for line in doc: 26 | out += [' ' * indent + line] 27 | return out 28 | 29 | def _str_signature(self): 30 | return [''] 31 | if self['Signature']: 32 | return ['``%s``' % self['Signature']] + [''] 33 | else: 34 | return [''] 35 | 36 | def _str_summary(self): 37 | return self['Summary'] + [''] 38 | 39 | def _str_extended_summary(self): 40 | return self['Extended Summary'] + [''] 41 | 42 | def _str_param_list(self, name): 43 | out = [] 44 | if self[name]: 45 | out += self._str_field_list(name) 46 | out += [''] 47 | for param, param_type, desc in self[name]: 48 | out += self._str_indent(['**%s** : %s' % (param.strip(), 49 | param_type)]) 50 | out += [''] 51 | out += self._str_indent(desc, 8) 52 | out += [''] 53 | return out 54 | 55 | @property 56 | def _obj(self): 57 | if hasattr(self, '_cls'): 58 | return self._cls 59 | elif hasattr(self, '_f'): 60 | return self._f 61 | return None 62 | 63 | def _str_member_list(self, name): 64 | """ 65 | Generate a member listing, autosummary:: table where possible, 66 | and a table where not. 67 | 68 | """ 69 | out = [] 70 | if self[name]: 71 | out += ['.. rubric:: %s' % name, ''] 72 | prefix = getattr(self, '_name', '') 73 | 74 | if prefix: 75 | prefix = '~%s.' % prefix 76 | 77 | autosum = [] 78 | others = [] 79 | for param, param_type, desc in self[name]: 80 | param = param.strip() 81 | if not self._obj or hasattr(self._obj, param): 82 | autosum += [" %s%s" % (prefix, param)] 83 | else: 84 | others.append((param, param_type, desc)) 85 | 86 | if autosum: 87 | # GAEL: Toctree commented out below because it creates 88 | # hundreds of sphinx warnings 89 | # out += ['.. 
autosummary::', ' :toctree:', ''] 90 | out += ['.. autosummary::', ''] 91 | out += autosum 92 | 93 | if others: 94 | maxlen_0 = max([len(x[0]) for x in others]) 95 | maxlen_1 = max([len(x[1]) for x in others]) 96 | hdr = "=" * maxlen_0 + " " + "=" * maxlen_1 + " " + "=" * 10 97 | fmt = '%%%ds %%%ds ' % (maxlen_0, maxlen_1) 98 | n_indent = maxlen_0 + maxlen_1 + 4 99 | out += [hdr] 100 | for param, param_type, desc in others: 101 | out += [fmt % (param.strip(), param_type)] 102 | out += self._str_indent(desc, n_indent) 103 | out += [hdr] 104 | out += [''] 105 | return out 106 | 107 | def _str_section(self, name): 108 | out = [] 109 | if self[name]: 110 | out += self._str_header(name) 111 | out += [''] 112 | content = textwrap.dedent("\n".join(self[name])).split("\n") 113 | out += content 114 | out += [''] 115 | return out 116 | 117 | def _str_see_also(self, func_role): 118 | out = [] 119 | if self['See Also']: 120 | see_also = super(SphinxDocString, self)._str_see_also(func_role) 121 | out = ['.. seealso::', ''] 122 | out += self._str_indent(see_also[2:]) 123 | return out 124 | 125 | def _str_warnings(self): 126 | out = [] 127 | if self['Warnings']: 128 | out = ['.. warning::', ''] 129 | out += self._str_indent(self['Warnings']) 130 | return out 131 | 132 | def _str_index(self): 133 | idx = self['index'] 134 | out = [] 135 | if len(idx) == 0: 136 | return out 137 | 138 | out += ['.. index:: %s' % idx.get('default', '')] 139 | for section, references in idx.iteritems(): 140 | if section == 'default': 141 | continue 142 | elif section == 'refguide': 143 | out += [' single: %s' % (', '.join(references))] 144 | else: 145 | out += [' %s: %s' % (section, ','.join(references))] 146 | return out 147 | 148 | def _str_references(self): 149 | out = [] 150 | if self['References']: 151 | out += self._str_header('References') 152 | if isinstance(self['References'], str): 153 | self['References'] = [self['References']] 154 | out.extend(self['References']) 155 | out += [''] 156 | # Latex collects all references to a separate bibliography, 157 | # so we need to insert links to it 158 | import sphinx # local import to avoid test dependency 159 | if sphinx.__version__ >= "0.6": 160 | out += ['.. only:: latex', ''] 161 | else: 162 | out += ['.. latexonly::', ''] 163 | items = [] 164 | for line in self['References']: 165 | m = re.match(r'.. \[([a-z0-9._-]+)\]', line, re.I) 166 | if m: 167 | items.append(m.group(1)) 168 | out += [' ' + ", ".join(["[%s]_" % item for item in items]), ''] 169 | return out 170 | 171 | def _str_examples(self): 172 | examples_str = "\n".join(self['Examples']) 173 | 174 | if (self.use_plots and 'import matplotlib' in examples_str 175 | and 'plot::' not in examples_str): 176 | out = [] 177 | out += self._str_header('Examples') 178 | out += ['.. 
plot::', ''] 179 | out += self._str_indent(self['Examples']) 180 | out += [''] 181 | return out 182 | else: 183 | return self._str_section('Examples') 184 | 185 | def __str__(self, indent=0, func_role="obj"): 186 | out = [] 187 | out += self._str_signature() 188 | out += self._str_index() + [''] 189 | out += self._str_summary() 190 | out += self._str_extended_summary() 191 | for param_list in ('Parameters', 'Returns', 'Raises'): 192 | out += self._str_param_list(param_list) 193 | out += self._str_warnings() 194 | out += self._str_see_also(func_role) 195 | out += self._str_section('Notes') 196 | out += self._str_references() 197 | out += self._str_examples() 198 | for param_list in ('Attributes', 'Methods'): 199 | out += self._str_member_list(param_list) 200 | out = self._str_indent(out, indent) 201 | return '\n'.join(out) 202 | 203 | 204 | class SphinxFunctionDoc(SphinxDocString, FunctionDoc): 205 | def __init__(self, obj, doc=None, config={}): 206 | self.use_plots = config.get('use_plots', False) 207 | FunctionDoc.__init__(self, obj, doc=doc, config=config) 208 | 209 | 210 | class SphinxClassDoc(SphinxDocString, ClassDoc): 211 | def __init__(self, obj, doc=None, func_doc=None, config={}): 212 | self.use_plots = config.get('use_plots', False) 213 | ClassDoc.__init__(self, obj, doc=doc, func_doc=None, config=config) 214 | 215 | 216 | class SphinxObjDoc(SphinxDocString): 217 | def __init__(self, obj, doc=None, config=None): 218 | self._f = obj 219 | SphinxDocString.__init__(self, doc, config=config) 220 | 221 | 222 | def get_doc_object(obj, what=None, doc=None, config={}): 223 | if what is None: 224 | if inspect.isclass(obj): 225 | what = 'class' 226 | elif inspect.ismodule(obj): 227 | what = 'module' 228 | elif callable(obj): 229 | what = 'function' 230 | else: 231 | what = 'object' 232 | if what == 'class': 233 | return SphinxClassDoc(obj, func_doc=SphinxFunctionDoc, doc=doc, 234 | config=config) 235 | elif what in ('function', 'method'): 236 | return SphinxFunctionDoc(obj, doc=doc, config=config) 237 | else: 238 | if doc is None: 239 | doc = pydoc.getdoc(obj) 240 | return SphinxObjDoc(obj, doc, config=config) 241 | -------------------------------------------------------------------------------- /doc/sphinxext/numpy_ext/numpydoc.py: -------------------------------------------------------------------------------- 1 | """ 2 | ======== 3 | numpydoc 4 | ======== 5 | 6 | Sphinx extension that handles docstrings in the Numpy standard format. [1] 7 | 8 | It will: 9 | 10 | - Convert Parameters etc. sections to field lists. 11 | - Convert See Also section to a See also entry. 12 | - Renumber references. 13 | - Extract the signature from the docstring, if it can't be determined 14 | otherwise. 15 | 16 | .. 
[1] http://projects.scipy.org/numpy/wiki/CodingStyleGuidelines#docstring-standard 17 | 18 | """ 19 | 20 | from __future__ import unicode_literals 21 | 22 | import sys # Only needed to check Python version 23 | import os 24 | import re 25 | import pydoc 26 | from .docscrape_sphinx import get_doc_object 27 | from .docscrape_sphinx import SphinxDocString 28 | import inspect 29 | 30 | 31 | def mangle_docstrings(app, what, name, obj, options, lines, 32 | reference_offset=[0]): 33 | 34 | cfg = dict(use_plots=app.config.numpydoc_use_plots, 35 | show_class_members=app.config.numpydoc_show_class_members) 36 | 37 | if what == 'module': 38 | # Strip top title 39 | title_re = re.compile(r'^\s*[#*=]{4,}\n[a-z0-9 -]+\n[#*=]{4,}\s*', 40 | re.I | re.S) 41 | lines[:] = title_re.sub('', "\n".join(lines)).split("\n") 42 | else: 43 | doc = get_doc_object(obj, what, "\n".join(lines), config=cfg) 44 | if sys.version_info[0] < 3: 45 | lines[:] = unicode(doc).splitlines() 46 | else: 47 | lines[:] = str(doc).splitlines() 48 | 49 | if app.config.numpydoc_edit_link and hasattr(obj, '__name__') and \ 50 | obj.__name__: 51 | if hasattr(obj, '__module__'): 52 | v = dict(full_name="%s.%s" % (obj.__module__, obj.__name__)) 53 | else: 54 | v = dict(full_name=obj.__name__) 55 | lines += [u'', u'.. htmlonly::', ''] 56 | lines += [u' %s' % x for x in 57 | (app.config.numpydoc_edit_link % v).split("\n")] 58 | 59 | # replace reference numbers so that there are no duplicates 60 | references = [] 61 | for line in lines: 62 | line = line.strip() 63 | m = re.match(r'^.. \[([a-z0-9_.-])\]', line, re.I) 64 | if m: 65 | references.append(m.group(1)) 66 | 67 | # start renaming from the longest string, to avoid overwriting parts 68 | references.sort(key=lambda x: -len(x)) 69 | if references: 70 | for i, line in enumerate(lines): 71 | for r in references: 72 | if re.match(r'^\d+$', r): 73 | new_r = "R%d" % (reference_offset[0] + int(r)) 74 | else: 75 | new_r = u"%s%d" % (r, reference_offset[0]) 76 | lines[i] = lines[i].replace(u'[%s]_' % r, 77 | u'[%s]_' % new_r) 78 | lines[i] = lines[i].replace(u'.. [%s]' % r, 79 | u'.. 
[%s]' % new_r) 80 | 81 | reference_offset[0] += len(references) 82 | 83 | 84 | def mangle_signature(app, what, name, obj, 85 | options, sig, retann): 86 | # Do not try to inspect classes that don't define `__init__` 87 | if (inspect.isclass(obj) and 88 | (not hasattr(obj, '__init__') or 89 | 'initializes x; see ' in pydoc.getdoc(obj.__init__))): 90 | return '', '' 91 | 92 | if not (callable(obj) or hasattr(obj, '__argspec_is_invalid_')): 93 | return 94 | if not hasattr(obj, '__doc__'): 95 | return 96 | 97 | doc = SphinxDocString(pydoc.getdoc(obj)) 98 | if doc['Signature']: 99 | sig = re.sub("^[^(]*", "", doc['Signature']) 100 | return sig, '' 101 | 102 | 103 | def setup(app, get_doc_object_=get_doc_object): 104 | global get_doc_object 105 | get_doc_object = get_doc_object_ 106 | 107 | if sys.version_info[0] < 3: 108 | app.connect(b'autodoc-process-docstring', mangle_docstrings) 109 | app.connect(b'autodoc-process-signature', mangle_signature) 110 | else: 111 | app.connect('autodoc-process-docstring', mangle_docstrings) 112 | app.connect('autodoc-process-signature', mangle_signature) 113 | app.add_config_value('numpydoc_edit_link', None, False) 114 | app.add_config_value('numpydoc_use_plots', None, False) 115 | app.add_config_value('numpydoc_show_class_members', True, True) 116 | 117 | # Extra mangling domains 118 | app.add_domain(NumpyPythonDomain) 119 | app.add_domain(NumpyCDomain) 120 | 121 | #----------------------------------------------------------------------------- 122 | # Docstring-mangling domains 123 | #----------------------------------------------------------------------------- 124 | 125 | try: 126 | import sphinx # lazy to avoid test dependency 127 | except ImportError: 128 | CDomain = PythonDomain = object 129 | else: 130 | from sphinx.domains.c import CDomain 131 | from sphinx.domains.python import PythonDomain 132 | 133 | 134 | class ManglingDomainBase(object): 135 | directive_mangling_map = {} 136 | 137 | def __init__(self, *a, **kw): 138 | super(ManglingDomainBase, self).__init__(*a, **kw) 139 | self.wrap_mangling_directives() 140 | 141 | def wrap_mangling_directives(self): 142 | for name, objtype in self.directive_mangling_map.items(): 143 | self.directives[name] = wrap_mangling_directive( 144 | self.directives[name], objtype) 145 | 146 | 147 | class NumpyPythonDomain(ManglingDomainBase, PythonDomain): 148 | name = 'np' 149 | directive_mangling_map = { 150 | 'function': 'function', 151 | 'class': 'class', 152 | 'exception': 'class', 153 | 'method': 'function', 154 | 'classmethod': 'function', 155 | 'staticmethod': 'function', 156 | 'attribute': 'attribute', 157 | } 158 | 159 | 160 | class NumpyCDomain(ManglingDomainBase, CDomain): 161 | name = 'np-c' 162 | directive_mangling_map = { 163 | 'function': 'function', 164 | 'member': 'attribute', 165 | 'macro': 'function', 166 | 'type': 'class', 167 | 'var': 'object', 168 | } 169 | 170 | 171 | def wrap_mangling_directive(base_directive, objtype): 172 | class directive(base_directive): 173 | def run(self): 174 | env = self.state.document.settings.env 175 | 176 | name = None 177 | if self.arguments: 178 | m = re.match(r'^(.*\s+)?(.*?)(\(.*)?', self.arguments[0]) 179 | name = m.group(2).strip() 180 | 181 | if not name: 182 | name = self.arguments[0] 183 | 184 | lines = list(self.content) 185 | mangle_docstrings(env.app, objtype, name, None, None, lines) 186 | # local import to avoid testing dependency 187 | from docutils.statemachine import ViewList 188 | self.content = ViewList(lines, self.content.parent) 189 | 190 | return 
base_directive.run(self) 191 | 192 | return directive 193 | -------------------------------------------------------------------------------- /examples/README.txt: -------------------------------------------------------------------------------- 1 | .. _general_examples: 2 | 3 | 4 | General-purpose and introductory examples. 5 | -------------------------------------------------------------------------------- /examples/plot_randomized_output_decision_tree.py: -------------------------------------------------------------------------------- 1 | """ 2 | ========================================= 3 | Growing tree on a randomized output space 4 | ========================================= 5 | 6 | The bottleneck of random forest on multi-label and multi-output regression 7 | tasks with many outputs is the computation of the impurity measure at 8 | each tree node for each possible split. 9 | 10 | Growing a tree on lower dimensional random output subspace allow to decrease 11 | computing time while having the same or improved performance with a sufficient 12 | number of projections. 13 | 14 | """ 15 | from __future__ import division 16 | from time import time 17 | 18 | import numpy as np 19 | import matplotlib.pyplot as plt 20 | 21 | from sklearn.base import clone 22 | from sklearn.cross_validation import train_test_split 23 | from sklearn.random_projection import SparseRandomProjection 24 | from sklearn.metrics import label_ranking_average_precision_score as lrap_score 25 | 26 | from random_output_trees.datasets import fetch_drug_interaction 27 | from random_output_trees.ensemble import RandomForestClassifier 28 | 29 | random_state = np.random.RandomState(0) 30 | 31 | # Let's load a multilabel dataset 32 | dataset = fetch_drug_interaction() 33 | X = dataset.data 34 | y = dataset.target # y.shape = (1862, 1554) 35 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, 36 | random_state=0) 37 | n_outputs = y.shape[1] 38 | 39 | 40 | def benchmark(base_estimator, random_state=None, n_iter=3): 41 | scores = [] 42 | times = [] 43 | for iter_ in range(n_iter): 44 | estimator = clone(base_estimator) 45 | estimator.set_params(random_state=random_state) 46 | 47 | time_start = time() 48 | estimator.fit(X_train, y_train) 49 | times.append(time() - time_start) 50 | 51 | y_proba_pred = estimator.predict_proba(X_test) 52 | y_scores = 1 - np.vstack([p[:, 0] for p in y_proba_pred]).T 53 | scores.append(lrap_score(y_test, y_scores)) 54 | 55 | return scores, times 56 | 57 | 58 | # NB: Increase the number of estimators to improve performance 59 | n_estimators = 20 60 | 61 | # Let's learn a random forest model 62 | rf = RandomForestClassifier(n_estimators=n_estimators, 63 | random_state=0) 64 | rf_score, rf_times = benchmark(rf, random_state) 65 | 66 | rf_score_mean = np.mean(rf_score) 67 | rf_score_std = np.std(rf_score) 68 | 69 | rf_times_mean = np.mean(rf_times) 70 | rf_times_std = np.std(rf_times) 71 | 72 | # Let's learn random forest on a Gaussian subspace 73 | all_n_components = np.ceil(np.array([1, 5, 10, 50, 100])) 74 | all_n_components = all_n_components.astype(int) 75 | scores_mean = [] 76 | scores_std = [] 77 | times_mean = [] 78 | times_std = [] 79 | 80 | for i, n_components in enumerate(all_n_components): 81 | # First instatiate a transformer to modify the output space 82 | output_transformer = SparseRandomProjection(n_components=n_components, 83 | random_state=0) 84 | 85 | # To fix the random output space for each estimator 86 | # Uncomment the following lines 87 | # from 
random_output_trees.transformer import FixedStateTransformer 88 | # output_transformer = FixedStateTransformer(output_transformer, 89 | # random_seed=0) 90 | 91 | # Let's learn random forest on randomized subspace 92 | gaussian_rf = RandomForestClassifier(n_estimators=n_estimators, 93 | output_transformer=output_transformer, 94 | random_state=0) 95 | 96 | scores, times = benchmark(gaussian_rf, random_state) 97 | scores_mean.append(np.mean(scores)) 98 | scores_std.append(np.std(scores)) 99 | times_mean.append(np.mean(times)) 100 | times_std.append(np.std(times)) 101 | 102 | scores_mean = np.array(scores_mean) 103 | scores_std = np.array(scores_std) 104 | times_mean = np.array(times_mean) 105 | times_std = np.array(times_std) 106 | 107 | # Let's plot the outcome of the experiments 108 | fraction_outputs = all_n_components / n_outputs 109 | 110 | plt.figure() 111 | plt.plot(fraction_outputs, rf_score_mean * np.ones_like(fraction_outputs), 112 | "-o", color='r', label="Original output space") 113 | plt.fill_between(fraction_outputs, 114 | rf_score_mean - rf_score_std, 115 | rf_score_mean + rf_score_std, alpha=0.25, color="r") 116 | plt.plot(fraction_outputs, scores_mean, "-o", color='g', 117 | label="Sparse rademacher output subspace") 118 | plt.fill_between(fraction_outputs, 119 | scores_mean - scores_std, 120 | scores_mean + scores_std, alpha=0.25, color="g") 121 | plt.legend(loc="best") 122 | plt.xlabel("n_components / n_outputs") 123 | plt.ylabel("Label ranking average precision") 124 | plt.show() 125 | 126 | 127 | plt.figure() 128 | plt.plot(fraction_outputs, rf_times_mean * np.ones_like(fraction_outputs), 129 | "-o", color='r', label="Original output space") 130 | plt.fill_between(fraction_outputs, 131 | rf_times_mean - rf_times_std, 132 | rf_times_mean + rf_times_std, alpha=0.25, color="r") 133 | plt.plot(fraction_outputs, times_mean, "-o", color='g', 134 | label="Sparse rademacher output subspace") 135 | plt.fill_between(fraction_outputs, 136 | times_mean - times_std, 137 | times_mean + times_std, alpha=0.25, color="g") 138 | plt.legend(loc="best") 139 | plt.ylim((0., max(np.max(times_mean + times_std), 140 | rf_times_mean + rf_times_std) * 1.1)) 141 | plt.xlabel("n_components / n_outputs") 142 | plt.ylabel("Time [s]") 143 | plt.show() 144 | -------------------------------------------------------------------------------- /examples/plot_variance_preservation.py: -------------------------------------------------------------------------------- 1 | """ 2 | ============================================== 3 | Variance is preserved under random projections 4 | ============================================== 5 | 6 | If a random projection matrix satisfies the Johson-Lindenstrauss, then 7 | the variance computed on the randomly projected output space is equal to 8 | the variance on the original output space up to an epsilon factor. 9 | 10 | This is an illustration of Theorem 1 from the paper 11 | 12 | Joly, A., Geurts, P., & Wehenkel, L. (2014). Random forests with 13 | random projections of the output space for high dimensional multi-label 14 | classification. 
ECML-PKDD 2014, Nancy, France 15 | 16 | """ 17 | from __future__ import division 18 | import numpy as np 19 | 20 | import matplotlib.pyplot as plt 21 | from sklearn.random_projection import GaussianRandomProjection 22 | 23 | random_state = np.random.RandomState(0) 24 | 25 | # Let's first generate a set of samples 26 | n_samples = 2000 27 | n_outputs = 500 28 | X = 3 + 5 * random_state.normal(size=(n_samples, n_outputs)) 29 | 30 | # Let's compute the sum of the variance in the orignal output space 31 | var_origin = np.var(X, axis=0).sum() 32 | 33 | # Let's compute the variance on a random subspace 34 | all_n_components = np.array([1, 50, 100, 200, 400, 500]) 35 | n_repetitions = 10 36 | distortion = np.empty((len(all_n_components), n_repetitions)) 37 | 38 | for i, n_components in enumerate(all_n_components): 39 | for j in range(n_repetitions): 40 | transformer = GaussianRandomProjection(n_components=n_components, 41 | random_state=random_state) 42 | X_subspace = transformer.fit_transform(X) 43 | distortion[i, j] = np.var(X_subspace, axis=0).sum() / var_origin 44 | 45 | # Let's plot the distortion as a function of the compression ratio 46 | distortion_mean = distortion.mean(axis=1) 47 | distortion_std = distortion.std(axis=1) 48 | 49 | plt.figure() 50 | plt.plot(all_n_components / n_outputs, distortion_mean, "o-", color="g") 51 | plt.plot(all_n_components / n_outputs, np.ones_like(distortion_mean), 52 | "--", color="r") 53 | plt.fill_between(all_n_components / n_outputs, 54 | distortion_mean - distortion_std, 55 | distortion_mean + distortion_std, alpha=0.25, color="g") 56 | plt.xlabel("n_components / n_outputs") 57 | plt.ylabel('Distortion of the variance on a Gaussian subspace') 58 | plt.show() 59 | -------------------------------------------------------------------------------- /random_output_trees/__init__.py: -------------------------------------------------------------------------------- 1 | # Author : Arnaud Joly 2 | # 3 | # License: BSD 3 clause 4 | 5 | __version__ = "dev" 6 | 7 | -------------------------------------------------------------------------------- /random_output_trees/_sklearn_tree.pxd: -------------------------------------------------------------------------------- 1 | # Authors: Gilles Louppe 2 | # Peter Prettenhofer 3 | # Brian Holt 4 | # Joel Nothman 5 | # Arnaud Joly 6 | # 7 | # Licence: BSD 3 clause 8 | 9 | # See _tree.pyx for details. 10 | 11 | import numpy as np 12 | cimport numpy as np 13 | 14 | ctypedef np.npy_float32 DTYPE_t # Type of X 15 | ctypedef np.npy_float64 DOUBLE_t # Type of y, sample_weight 16 | ctypedef np.npy_intp SIZE_t # Type for indices and counters 17 | ctypedef np.npy_int32 INT32_t # Signed 32 bit integer 18 | ctypedef np.npy_uint32 UINT32_t # Unsigned 32 bit integer 19 | 20 | 21 | # ============================================================================= 22 | # Criterion 23 | # ============================================================================= 24 | 25 | cdef class Criterion: 26 | # The criterion computes the impurity of a node and the reduction of 27 | # impurity of a split on that node. It also computes the output statistics 28 | # such as the mean in regression and class probabilities in classification. 
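    # Rough sketch (illustrative, not part of the original declarations): a
    # candidate split of the node samples S into S_left and S_right is scored
    # by the weighted impurity improvement
    #
    #     improvement = (n_S / n_total) * (impurity(S)
    #                                      - (n_left / n_S) * impurity(S_left)
    #                                      - (n_right / n_S) * impurity(S_right))
    #
    # where the n_* are weighted sample counts. node_impurity(),
    # children_impurity() and impurity_improvement() below supply these
    # pieces, updated incrementally as `pos` sweeps over samples[start:end].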
29 | 30 | # Internal structures 31 | cdef DOUBLE_t* y # Values of y 32 | cdef SIZE_t y_stride # Stride in y (since n_outputs >= 1) 33 | cdef DOUBLE_t* sample_weight # Sample weights 34 | 35 | cdef SIZE_t* samples # Sample indices in X, y 36 | cdef SIZE_t start # samples[start:pos] are the samples in the left node 37 | cdef SIZE_t pos # samples[pos:end] are the samples in the right node 38 | cdef SIZE_t end 39 | 40 | cdef SIZE_t n_outputs # Number of outputs 41 | cdef SIZE_t n_node_samples # Number of samples in the node (end-start) 42 | cdef double weighted_n_samples # Weighted number of samples (in total) 43 | cdef double weighted_n_node_samples # Weighted number of samples in the node 44 | cdef double weighted_n_left # Weighted number of samples in the left node 45 | cdef double weighted_n_right # Weighted number of samples in the right node 46 | 47 | # The criterion object is maintained such that left and right collected 48 | # statistics correspond to samples[start:pos] and samples[pos:end]. 49 | 50 | # Methods 51 | cdef void init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight, 52 | double weighted_n_samples, SIZE_t* samples, SIZE_t start, 53 | SIZE_t end) nogil 54 | cdef void reset(self) nogil 55 | cdef void update(self, SIZE_t new_pos) nogil 56 | cdef double node_impurity(self) nogil 57 | cdef void children_impurity(self, double* impurity_left, 58 | double* impurity_right) nogil 59 | cdef void node_value(self, double* dest) nogil 60 | cdef double impurity_improvement(self, double impurity) nogil 61 | 62 | 63 | # ============================================================================= 64 | # Splitter 65 | # ============================================================================= 66 | 67 | cdef struct SplitRecord: 68 | # Data to track sample split 69 | SIZE_t feature # Which feature to split on. 70 | SIZE_t pos # Split samples array at the given position, 71 | # i.e. count of samples below threshold for feature. 72 | # pos is >= end if the node is a leaf. 73 | double threshold # Threshold to split at. 74 | double improvement # Impurity improvement given parent node. 75 | double impurity_left # Impurity of the left split. 76 | double impurity_right # Impurity of the right split. 77 | 78 | 79 | cdef class Splitter: 80 | # The splitter searches in the input space for a feature and a threshold 81 | # to split the samples samples[start:end]. 82 | # 83 | # The impurity computations are delegated to a criterion object. 84 | 85 | # Internal structures 86 | cdef public Criterion criterion # Impurity criterion 87 | cdef public SIZE_t max_features # Number of features to test 88 | cdef public SIZE_t min_samples_leaf # Min samples in a leaf 89 | cdef public double min_weight_leaf # Minimum weight in a leaf 90 | 91 | cdef object random_state # Random state 92 | cdef UINT32_t rand_r_state # sklearn_rand_r random number state 93 | 94 | cdef SIZE_t* samples # Sample indices in X, y 95 | cdef SIZE_t n_samples # X.shape[0] 96 | cdef double weighted_n_samples # Weighted number of samples 97 | cdef SIZE_t* features # Feature indices in X 98 | cdef SIZE_t* constant_features # Constant features indices 99 | cdef SIZE_t n_features # X.shape[1] 100 | cdef DTYPE_t* feature_values # temp. 
array holding feature values 101 | cdef SIZE_t start # Start position for the current node 102 | cdef SIZE_t end # End position for the current node 103 | 104 | cdef DTYPE_t* X 105 | cdef SIZE_t X_sample_stride 106 | cdef SIZE_t X_fx_stride 107 | cdef DOUBLE_t* y 108 | cdef SIZE_t y_stride 109 | cdef DOUBLE_t* sample_weight 110 | 111 | # The samples vector `samples` is maintained by the Splitter object such 112 | # that the samples contained in a node are contiguous. With this setting, 113 | # `node_split` reorganizes the node samples `samples[start:end]` in two 114 | # subsets `samples[start:pos]` and `samples[pos:end]`. 115 | 116 | # The 1-d `features` array of size n_features contains the features 117 | # indices and allows fast sampling without replacement of features. 118 | 119 | # The 1-d `constant_features` array of size n_features holds in 120 | # `constant_features[:n_constant_features]` the feature ids with 121 | # constant values for all the samples that reached a specific node. 122 | # The value `n_constant_features` is given by the the parent node to its 123 | # child nodes. The content of the range `[n_constant_features:]` is left 124 | # undefined, but preallocated for performance reasons 125 | # This allows optimization with depth-based tree building. 126 | 127 | # Methods 128 | cdef void init(self, np.ndarray X, np.ndarray y, DOUBLE_t* sample_weight) 129 | 130 | cdef void node_reset(self, SIZE_t start, SIZE_t end, 131 | double* weighted_n_node_samples) nogil 132 | 133 | cdef void node_split(self, 134 | double impurity, # Impurity of the node 135 | SplitRecord* split, 136 | SIZE_t* n_constant_features) nogil 137 | 138 | cdef void node_value(self, double* dest) nogil 139 | 140 | cdef double node_impurity(self) nogil 141 | 142 | 143 | # ============================================================================= 144 | # Tree 145 | # ============================================================================= 146 | 147 | cdef struct Node: 148 | # Base storage structure for the nodes in a Tree object 149 | 150 | SIZE_t left_child # id of the left child of the node 151 | SIZE_t right_child # id of the right child of the node 152 | SIZE_t feature # Feature used for splitting the node 153 | DOUBLE_t threshold # Threshold value at the node 154 | DOUBLE_t impurity # Impurity of the node (i.e., the value of the criterion) 155 | SIZE_t n_node_samples # Number of samples at the node 156 | DOUBLE_t weighted_n_node_samples # Weighted number of samples at the node 157 | 158 | cdef class Tree: 159 | # The Tree object is a binary tree structure constructed by the 160 | # TreeBuilder. The tree structure is used for predictions and 161 | # feature importances. 162 | 163 | # Input/Output layout 164 | cdef public SIZE_t n_features # Number of features in X 165 | cdef SIZE_t* n_classes # Number of classes in y[:, k] 166 | cdef public SIZE_t n_outputs # Number of outputs in y 167 | cdef public SIZE_t max_n_classes # max(n_classes) 168 | 169 | # Inner structures: values are stored separately from node structure, 170 | # since size is determined at runtime. 
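    # With this layout, `value` is a flat C array of logical shape
    # (capacity, n_outputs, max_n_classes): the statistics for output k and
    # class c of node i live at value[i * value_stride + k * max_n_classes + c],
    # with value_stride = n_outputs * max_n_classes (see the fields below).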
171 | cdef public SIZE_t max_depth # Max depth of the tree 172 | cdef public SIZE_t node_count # Counter for node IDs 173 | cdef public SIZE_t capacity # Capacity of tree, in terms of nodes 174 | cdef Node* nodes # Array of nodes 175 | cdef double* value # (capacity, n_outputs, max_n_classes) array of values 176 | cdef SIZE_t value_stride # = n_outputs * max_n_classes 177 | 178 | # Methods 179 | cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf, 180 | SIZE_t feature, double threshold, double impurity, 181 | SIZE_t n_node_samples, 182 | double weighted_n_samples) nogil 183 | cdef void _resize(self, SIZE_t capacity) 184 | cdef int _resize_c(self, SIZE_t capacity=*) nogil 185 | 186 | cdef np.ndarray _get_value_ndarray(self) 187 | cdef np.ndarray _get_node_ndarray(self) 188 | 189 | cpdef np.ndarray predict(self, np.ndarray[DTYPE_t, ndim=2] X) 190 | cpdef np.ndarray apply(self, np.ndarray[DTYPE_t, ndim=2] X) 191 | cpdef compute_feature_importances(self, normalize=*) 192 | 193 | 194 | # ============================================================================= 195 | # Tree builder 196 | # ============================================================================= 197 | 198 | cdef class TreeBuilder: 199 | # The TreeBuilder recursively builds a Tree object from training samples, 200 | # using a Splitter object for splitting internal nodes and assigning 201 | # values to leaves. 202 | # 203 | # This class controls the various stopping criteria and the node splitting 204 | # evaluation order, e.g. depth-first or best-first. 205 | 206 | cdef Splitter splitter # Splitting algorithm 207 | 208 | cdef SIZE_t min_samples_split # Minimum number of samples in an internal node 209 | cdef SIZE_t min_samples_leaf # Minimum number of samples in a leaf 210 | cdef double min_weight_leaf # Minimum weight in a leaf 211 | cdef SIZE_t max_depth # Maximal tree depth 212 | 213 | cpdef build(self, Tree tree, np.ndarray X, np.ndarray y, 214 | np.ndarray sample_weight=*) 215 | -------------------------------------------------------------------------------- /random_output_trees/_sklearn_tree_utils.pxd: -------------------------------------------------------------------------------- 1 | # Authors: Gilles Louppe 2 | # Peter Prettenhofer 3 | # Arnaud Joly 4 | # 5 | # Licence: BSD 3 clause 6 | 7 | # See _utils.pyx for details. 
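# This header declares the Stack (depth-first growing) and PriorityHeap
# (best-first growing) helpers used by the tree builders.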
8 | 9 | import numpy as np 10 | cimport numpy as np 11 | 12 | ctypedef np.npy_intp SIZE_t # Type for indices and counters 13 | 14 | 15 | # ============================================================================= 16 | # Stack data structure 17 | # ============================================================================= 18 | 19 | # A record on the stack for depth-first tree growing 20 | cdef struct StackRecord: 21 | SIZE_t start 22 | SIZE_t end 23 | SIZE_t depth 24 | SIZE_t parent 25 | bint is_left 26 | double impurity 27 | SIZE_t n_constant_features 28 | 29 | cdef class Stack: 30 | cdef SIZE_t capacity 31 | cdef SIZE_t top 32 | cdef StackRecord* stack_ 33 | 34 | cdef bint is_empty(self) nogil 35 | cdef int push(self, SIZE_t start, SIZE_t end, SIZE_t depth, SIZE_t parent, 36 | bint is_left, double impurity, 37 | SIZE_t n_constant_features) nogil 38 | cdef int pop(self, StackRecord* res) nogil 39 | 40 | 41 | # ============================================================================= 42 | # PriorityHeap data structure 43 | # ============================================================================= 44 | 45 | # A record on the frontier for best-first tree growing 46 | cdef struct PriorityHeapRecord: 47 | SIZE_t node_id 48 | SIZE_t start 49 | SIZE_t end 50 | SIZE_t pos 51 | SIZE_t depth 52 | bint is_leaf 53 | double impurity 54 | double impurity_left 55 | double impurity_right 56 | double improvement 57 | 58 | cdef class PriorityHeap: 59 | cdef SIZE_t capacity 60 | cdef SIZE_t heap_ptr 61 | cdef PriorityHeapRecord* heap_ 62 | 63 | cdef bint is_empty(self) nogil 64 | cdef int push(self, SIZE_t node_id, SIZE_t start, SIZE_t end, SIZE_t pos, 65 | SIZE_t depth, bint is_leaf, double improvement, 66 | double impurity, double impurity_left, 67 | double impurity_right) nogil 68 | cdef int pop(self, PriorityHeapRecord* res) nogil 69 | -------------------------------------------------------------------------------- /random_output_trees/_sklearn_tree_utils.pyx: -------------------------------------------------------------------------------- 1 | # cython: cdivision=True 2 | # cython: boundscheck=False 3 | # cython: wraparound=False 4 | 5 | # Authors: Gilles Louppe 6 | # Peter Prettenhofer 7 | # Arnaud Joly 8 | # 9 | # Licence: BSD 3 clause 10 | 11 | from libc.stdlib cimport free, malloc, realloc 12 | 13 | 14 | # This file is taken from scikit-learn to allow easy installation 15 | # and not to rely on a specific version of scikit-learn decision tree 16 | # implementation 17 | 18 | # ============================================================================= 19 | # Stack data structure 20 | # ============================================================================= 21 | 22 | cdef class Stack: 23 | """A LIFO data structure. 24 | 25 | Attributes 26 | ---------- 27 | capacity : SIZE_t 28 | The elements the stack can hold; if more added then ``self.stack_`` 29 | needs to be resized. 30 | 31 | top : SIZE_t 32 | The number of elements currently on the stack. 33 | 34 | stack : StackRecord pointer 35 | The stack of records (upward in the stack corresponds to the right). 
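        Storage is doubled (via realloc) whenever a push would exceed the
        current capacity.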
36 | """ 37 | 38 | def __cinit__(self, SIZE_t capacity): 39 | self.capacity = capacity 40 | self.top = 0 41 | self.stack_ = malloc(capacity * sizeof(StackRecord)) 42 | if self.stack_ == NULL: 43 | raise MemoryError() 44 | 45 | def __dealloc__(self): 46 | free(self.stack_) 47 | 48 | cdef bint is_empty(self) nogil: 49 | return self.top <= 0 50 | 51 | cdef int push(self, SIZE_t start, SIZE_t end, SIZE_t depth, SIZE_t parent, 52 | bint is_left, double impurity, 53 | SIZE_t n_constant_features) nogil: 54 | """Push a new element onto the stack. 55 | 56 | Returns 0 if successful; -1 on out of memory error. 57 | """ 58 | cdef SIZE_t top = self.top 59 | cdef StackRecord* stack = NULL 60 | 61 | # Resize if capacity not sufficient 62 | if top >= self.capacity: 63 | self.capacity *= 2 64 | stack = realloc(self.stack_, 65 | self.capacity * sizeof(StackRecord)) 66 | if stack == NULL: 67 | # no free; __dealloc__ handles that 68 | return -1 69 | self.stack_ = stack 70 | 71 | stack = self.stack_ 72 | stack[top].start = start 73 | stack[top].end = end 74 | stack[top].depth = depth 75 | stack[top].parent = parent 76 | stack[top].is_left = is_left 77 | stack[top].impurity = impurity 78 | stack[top].n_constant_features = n_constant_features 79 | 80 | # Increment stack pointer 81 | self.top = top + 1 82 | return 0 83 | 84 | cdef int pop(self, StackRecord* res) nogil: 85 | """Remove the top element from the stack and copy to ``res``. 86 | 87 | Returns 0 if pop was successful (and ``res`` is set); -1 88 | otherwise. 89 | """ 90 | cdef SIZE_t top = self.top 91 | cdef StackRecord* stack = self.stack_ 92 | 93 | if top <= 0: 94 | return -1 95 | 96 | res[0] = stack[top - 1] 97 | self.top = top - 1 98 | 99 | return 0 100 | 101 | 102 | # ============================================================================= 103 | # PriorityHeap data structure 104 | # ============================================================================= 105 | 106 | cdef void heapify_up(PriorityHeapRecord* heap, SIZE_t pos) nogil: 107 | """Restore heap invariant parent.improvement > child.improvement from 108 | ``pos`` upwards. """ 109 | if pos == 0: 110 | return 111 | 112 | cdef SIZE_t parent_pos = (pos - 1) / 2 113 | 114 | if heap[parent_pos].improvement < heap[pos].improvement: 115 | heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos] 116 | heapify_up(heap, parent_pos) 117 | 118 | 119 | cdef void heapify_down(PriorityHeapRecord* heap, SIZE_t pos, 120 | SIZE_t heap_length) nogil: 121 | """Restore heap invariant parent.improvement > children.improvement from 122 | ``pos`` downwards. """ 123 | cdef SIZE_t left_pos = 2 * (pos + 1) - 1 124 | cdef SIZE_t right_pos = 2 * (pos + 1) 125 | cdef SIZE_t largest = pos 126 | 127 | if (left_pos < heap_length and 128 | heap[left_pos].improvement > heap[largest].improvement): 129 | largest = left_pos 130 | 131 | if (right_pos < heap_length and 132 | heap[right_pos].improvement > heap[largest].improvement): 133 | largest = right_pos 134 | 135 | if largest != pos: 136 | heap[pos], heap[largest] = heap[largest], heap[pos] 137 | heapify_down(heap, largest, heap_length) 138 | 139 | 140 | cdef class PriorityHeap: 141 | """A priority queue implemented as a binary heap. 142 | 143 | The heap invariant is that the impurity improvement of the parent record 144 | is larger then the impurity improvement of the children. 
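    In other words, this is a max-heap keyed on ``improvement``: ``pop``
    always returns the pending split with the largest impurity improvement.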
145 | 146 | Attributes 147 | ---------- 148 | capacity : SIZE_t 149 | The capacity of the heap 150 | 151 | heap_ptr : SIZE_t 152 | The water mark of the heap; the heap grows from left to right in the 153 | array ``heap_``. The following invariant holds ``heap_ptr < capacity``. 154 | 155 | heap_ : PriorityHeapRecord* 156 | The array of heap records. The maximum element is on the left; 157 | the heap grows from left to right 158 | """ 159 | 160 | def __cinit__(self, SIZE_t capacity): 161 | self.capacity = capacity 162 | self.heap_ptr = 0 163 | self.heap_ = malloc(capacity * sizeof(PriorityHeapRecord)) 164 | if self.heap_ == NULL: 165 | raise MemoryError() 166 | 167 | def __dealloc__(self): 168 | free(self.heap_) 169 | 170 | cdef bint is_empty(self) nogil: 171 | return self.heap_ptr <= 0 172 | 173 | cdef int push(self, SIZE_t node_id, SIZE_t start, SIZE_t end, SIZE_t pos, 174 | SIZE_t depth, bint is_leaf, double improvement, 175 | double impurity, double impurity_left, 176 | double impurity_right) nogil: 177 | """Push record on the priority heap. 178 | 179 | Returns 0 if successful; -1 on out of memory error. 180 | """ 181 | cdef SIZE_t heap_ptr = self.heap_ptr 182 | cdef PriorityHeapRecord* heap = NULL 183 | 184 | # Resize if capacity not sufficient 185 | if heap_ptr >= self.capacity: 186 | self.capacity *= 2 187 | heap = realloc(self.heap_, 188 | self.capacity * 189 | sizeof(PriorityHeapRecord)) 190 | if heap == NULL: 191 | # no free; __dealloc__ handles that 192 | return -1 193 | self.heap_ = heap 194 | 195 | # Put element as last element of heap 196 | heap = self.heap_ 197 | heap[heap_ptr].node_id = node_id 198 | heap[heap_ptr].start = start 199 | heap[heap_ptr].end = end 200 | heap[heap_ptr].pos = pos 201 | heap[heap_ptr].depth = depth 202 | heap[heap_ptr].is_leaf = is_leaf 203 | heap[heap_ptr].impurity = impurity 204 | heap[heap_ptr].impurity_left = impurity_left 205 | heap[heap_ptr].impurity_right = impurity_right 206 | heap[heap_ptr].improvement = improvement 207 | 208 | # Heapify up 209 | heapify_up(heap, heap_ptr) 210 | 211 | # Increase element count 212 | self.heap_ptr = heap_ptr + 1 213 | return 0 214 | 215 | cdef int pop(self, PriorityHeapRecord* res) nogil: 216 | """Remove max element from the heap. 
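
        Returns 0 if the maximum element was popped (and copied to ``res``);
        -1 if the heap is empty.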
""" 217 | cdef SIZE_t heap_ptr = self.heap_ptr 218 | cdef PriorityHeapRecord* heap = self.heap_ 219 | 220 | if heap_ptr <= 0: 221 | return -1 222 | 223 | # Take first element 224 | res[0] = heap[0] 225 | 226 | # Put last element to the front 227 | heap[0], heap[heap_ptr - 1] = heap[heap_ptr - 1], heap[0] 228 | 229 | # Restore heap invariant 230 | if heap_ptr > 1: 231 | heapify_down(heap, 0, heap_ptr - 1) 232 | 233 | self.heap_ptr = heap_ptr - 1 234 | 235 | return 0 236 | -------------------------------------------------------------------------------- /random_output_trees/_tree.pyx: -------------------------------------------------------------------------------- 1 | # cython: cdivision=True 2 | # cython: boundscheck=False 3 | # cython: wraparound=False 4 | 5 | # Authors: Arnaud Joly 6 | # 7 | # Licence: BSD 3 clause 8 | 9 | from libc.stdlib cimport calloc, free, malloc, realloc 10 | from libc.string cimport memcpy, memset 11 | from libc.math cimport log as ln 12 | from libc.math cimport floor 13 | 14 | 15 | 16 | import numpy as np 17 | cimport numpy as np 18 | np.import_array() 19 | 20 | 21 | # ============================================================================= 22 | # Types and constants 23 | # ============================================================================= 24 | 25 | from numpy import float32 as DTYPE 26 | from numpy import float64 as DOUBLE 27 | 28 | 29 | # ============================================================================= 30 | # Scikit-learn import 31 | # ============================================================================= 32 | 33 | # Criterion 34 | from ._sklearn_tree import Criterion 35 | from ._sklearn_tree cimport Criterion 36 | from ._sklearn_tree import MSE 37 | 38 | from ._sklearn_tree import Splitter 39 | from ._sklearn_tree cimport Splitter 40 | from ._sklearn_tree cimport SplitRecord 41 | 42 | from ._sklearn_tree cimport SIZE_t 43 | from ._sklearn_tree cimport DOUBLE_t 44 | from ._sklearn_tree cimport DTYPE_t 45 | from ._sklearn_tree cimport UINT32_t 46 | 47 | 48 | cdef inline np.ndarray sizet_ptr_to_ndarray(SIZE_t* data, SIZE_t size): 49 | """Encapsulate data into a 1D numpy array of intp's.""" 50 | cdef np.npy_intp shape[1] 51 | shape[0] = size 52 | return np.PyArray_SimpleNewFromData(1, shape, np.NPY_INTP, data) 53 | 54 | 55 | # ============================================================================= 56 | # Custom Criterion 57 | # ============================================================================= 58 | 59 | cdef class VarianceCriterion(Criterion): 60 | """Abstract criterion for regression. 61 | 62 | Computes variance of the target values left and right of the split point. 
63 | Computation is linear in `n_samples` by using :: 64 | 65 | var = \sum_i^n (y_i - y_bar) ** 2 66 | = (\sum_i^n y_i ** 2) - n_samples y_bar ** 2 67 | """ 68 | cdef double* mean_left 69 | cdef double* mean_right 70 | cdef double* mean_total 71 | cdef double* sq_sum_left 72 | cdef double* sq_sum_right 73 | cdef double* sq_sum_total 74 | cdef double* var_left 75 | cdef double* var_right 76 | 77 | cdef double* sum_left 78 | cdef double* sum_right 79 | cdef double* sum_total 80 | 81 | cdef SIZE_t* n_classes 82 | cdef SIZE_t label_count_stride 83 | 84 | def __cinit__(self, SIZE_t n_outputs, np.ndarray[SIZE_t, ndim=1] n_classes): 85 | # Default values 86 | self.y = NULL 87 | self.y_stride = 0 88 | self.sample_weight = NULL 89 | 90 | self.samples = NULL 91 | self.start = 0 92 | self.pos = 0 93 | self.end = 0 94 | 95 | self.n_outputs = n_outputs 96 | self.n_node_samples = 0 97 | self.weighted_n_node_samples = 0.0 98 | self.weighted_n_left = 0.0 99 | self.weighted_n_right = 0.0 100 | 101 | # Allocate accumulators 102 | self.mean_left = calloc(n_outputs, sizeof(double)) 103 | self.mean_right = calloc(n_outputs, sizeof(double)) 104 | self.mean_total = calloc(n_outputs, sizeof(double)) 105 | self.sq_sum_left = calloc(n_outputs, sizeof(double)) 106 | self.sq_sum_right = calloc(n_outputs, sizeof(double)) 107 | self.sq_sum_total = calloc(n_outputs, sizeof(double)) 108 | self.var_left = calloc(n_outputs, sizeof(double)) 109 | self.var_right = calloc(n_outputs, sizeof(double)) 110 | 111 | self.sum_left = calloc(n_outputs, sizeof(double)) 112 | self.sum_right = calloc(n_outputs, sizeof(double)) 113 | self.sum_total = calloc(n_outputs, sizeof(double)) 114 | 115 | # Check for allocation errors 116 | if (self.mean_left == NULL or 117 | self.mean_right == NULL or 118 | self.mean_total == NULL or 119 | self.sq_sum_left == NULL or 120 | self.sq_sum_right == NULL or 121 | self.sq_sum_total == NULL or 122 | self.var_left == NULL or 123 | self.var_right == NULL or 124 | self.sum_left == NULL or 125 | self.sum_right == NULL or 126 | self.sum_total == NULL): 127 | raise MemoryError() 128 | 129 | 130 | # Count labels for each output 131 | self.n_classes = malloc(n_outputs * sizeof(SIZE_t)) 132 | if self.n_classes == NULL: 133 | raise MemoryError() 134 | 135 | cdef SIZE_t k = 0 136 | cdef SIZE_t label_count_stride = 0 137 | 138 | for k in range(n_outputs): 139 | self.n_classes[k] = n_classes[k] 140 | 141 | if n_classes[k] > label_count_stride: 142 | label_count_stride = n_classes[k] 143 | 144 | if n_classes[k] > 2: 145 | raise ValueError("Implementation limited to binary " 146 | "classification") 147 | 148 | self.label_count_stride = label_count_stride 149 | 150 | 151 | def __dealloc__(self): 152 | """Destructor.""" 153 | free(self.mean_left) 154 | free(self.mean_right) 155 | free(self.mean_total) 156 | free(self.sq_sum_left) 157 | free(self.sq_sum_right) 158 | free(self.sq_sum_total) 159 | free(self.var_left) 160 | free(self.var_right) 161 | 162 | free(self.sum_left) 163 | free(self.sum_right) 164 | free(self.sum_total) 165 | 166 | free(self.n_classes) 167 | 168 | def __reduce__(self): 169 | return (VarianceCriterion, 170 | (self.n_outputs, 171 | sizet_ptr_to_ndarray(self.n_classes, self.n_outputs)), 172 | self.__getstate__()) 173 | 174 | 175 | def __getstate__(self): 176 | return {} 177 | 178 | def __setstate__(self, d): 179 | pass 180 | 181 | cdef void init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight, 182 | double weighted_n_samples, SIZE_t* samples, SIZE_t start, 183 | SIZE_t end) nogil: 184 | 
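        # Illustrative aside (not part of the original code): the identity in
        # the class docstring can be checked with plain NumPy, e.g.
        #
        #     import numpy as np
        #     y = np.random.RandomState(0).rand(100)
        #     ssd = ((y - y.mean()) ** 2).sum()
        #     np.allclose(ssd, (y ** 2).sum() - y.size * y.mean() ** 2)  # True
        #
        # init() accumulates sum_total and sq_sum_total per output so that
        # the per-output variance can be formed in a single pass over y.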
"""Initialize the criterion at node samples[start:end] and 185 | children samples[start:start] and samples[start:end].""" 186 | self.y = y 187 | self.y_stride = y_stride 188 | self.sample_weight = sample_weight 189 | self.samples = samples 190 | self.start = start 191 | self.end = end 192 | self.n_node_samples = end - start 193 | self.weighted_n_samples = weighted_n_samples 194 | cdef double weighted_n_node_samples = 0. 195 | 196 | # Initialize accumulators 197 | cdef SIZE_t n_outputs = self.n_outputs 198 | cdef double* mean_left = self.mean_left 199 | cdef double* mean_right = self.mean_right 200 | cdef double* mean_total = self.mean_total 201 | cdef double* sq_sum_left = self.sq_sum_left 202 | cdef double* sq_sum_right = self.sq_sum_right 203 | cdef double* sq_sum_total = self.sq_sum_total 204 | cdef double* var_left = self.var_left 205 | cdef double* var_right = self.var_right 206 | cdef double* sum_left = self.sum_left 207 | cdef double* sum_right = self.sum_right 208 | cdef double* sum_total = self.sum_total 209 | 210 | cdef SIZE_t i = 0 211 | cdef SIZE_t p = 0 212 | cdef SIZE_t k = 0 213 | cdef DOUBLE_t y_ik = 0.0 214 | cdef DOUBLE_t w_y_ik = 0.0 215 | cdef DOUBLE_t w = 1.0 216 | 217 | cdef SIZE_t n_bytes = n_outputs * sizeof(double) 218 | memset(mean_left, 0, n_bytes) 219 | memset(mean_right, 0, n_bytes) 220 | memset(mean_total, 0, n_bytes) 221 | memset(sq_sum_left, 0, n_bytes) 222 | memset(sq_sum_right, 0, n_bytes) 223 | memset(sq_sum_total, 0, n_bytes) 224 | memset(var_left, 0, n_bytes) 225 | memset(var_right, 0, n_bytes) 226 | memset(sum_left, 0, n_bytes) 227 | memset(sum_right, 0, n_bytes) 228 | memset(sum_total, 0, n_bytes) 229 | 230 | for p in range(start, end): 231 | i = samples[p] 232 | 233 | if sample_weight != NULL: 234 | w = sample_weight[i] 235 | 236 | for k in range(n_outputs): 237 | y_ik = y[i * y_stride + k] 238 | w_y_ik = w * y_ik 239 | sum_total[k] += w_y_ik 240 | sq_sum_total[k] += w_y_ik * y_ik 241 | 242 | weighted_n_node_samples += w 243 | 244 | self.weighted_n_node_samples = weighted_n_node_samples 245 | 246 | for k in range(n_outputs): 247 | mean_total[k] = sum_total[k] / weighted_n_node_samples 248 | 249 | # Reset to pos=start 250 | self.reset() 251 | 252 | cdef void reset(self) nogil: 253 | """Reset the criterion at pos=start.""" 254 | self.pos = self.start 255 | 256 | self.weighted_n_left = 0.0 257 | self.weighted_n_right = self.weighted_n_node_samples 258 | cdef double weighted_n_right = self.weighted_n_right 259 | 260 | 261 | cdef SIZE_t n_outputs = self.n_outputs 262 | cdef double* mean_left = self.mean_left 263 | cdef double* mean_right = self.mean_right 264 | cdef double* mean_total = self.mean_total 265 | cdef double* sq_sum_left = self.sq_sum_left 266 | cdef double* sq_sum_right = self.sq_sum_right 267 | cdef double* sq_sum_total = self.sq_sum_total 268 | cdef double* var_left = self.var_left 269 | cdef double* var_right = self.var_right 270 | cdef double* sum_left = self.sum_left 271 | cdef double* sum_right = self.sum_right 272 | cdef double* sum_total = self.sum_total 273 | 274 | cdef SIZE_t k = 0 275 | 276 | for k in range(n_outputs): 277 | mean_right[k] = mean_total[k] 278 | mean_left[k] = 0.0 279 | sq_sum_right[k] = sq_sum_total[k] 280 | sq_sum_left[k] = 0.0 281 | var_right[k] = (sq_sum_right[k] / weighted_n_right - 282 | mean_right[k] * mean_right[k]) 283 | var_left[k] = 0.0 284 | sum_right[k] = sum_total[k] 285 | sum_left[k] = 0.0 286 | 287 | 288 | cdef void update(self, SIZE_t new_pos) nogil: 289 | """Update the collected statistics by moving 
samples[pos:new_pos] from 290 | the right child to the left child.""" 291 | cdef DOUBLE_t* y = self.y 292 | cdef SIZE_t y_stride = self.y_stride 293 | cdef DOUBLE_t* sample_weight = self.sample_weight 294 | 295 | cdef SIZE_t* samples = self.samples 296 | cdef SIZE_t pos = self.pos 297 | 298 | cdef SIZE_t n_outputs = self.n_outputs 299 | cdef double* mean_left = self.mean_left 300 | cdef double* mean_right = self.mean_right 301 | cdef double* sq_sum_left = self.sq_sum_left 302 | cdef double* sq_sum_right = self.sq_sum_right 303 | cdef double* var_left = self.var_left 304 | cdef double* var_right = self.var_right 305 | cdef double* sum_left = self.sum_left 306 | cdef double* sum_right = self.sum_right 307 | 308 | cdef double weighted_n_left = self.weighted_n_left 309 | cdef double weighted_n_right = self.weighted_n_right 310 | 311 | cdef SIZE_t i 312 | cdef SIZE_t p 313 | cdef SIZE_t k 314 | cdef DOUBLE_t w = 1.0 315 | cdef DOUBLE_t y_ik, w_y_ik 316 | 317 | # Note: We assume start <= pos < new_pos <= end 318 | for p in range(pos, new_pos): 319 | i = samples[p] 320 | 321 | if sample_weight != NULL: 322 | w = sample_weight[i] 323 | 324 | for k in range(n_outputs): 325 | y_ik = y[i * y_stride + k] 326 | w_y_ik = w * y_ik 327 | 328 | sum_left[k] += w_y_ik 329 | sum_right[k] -= w_y_ik 330 | 331 | sq_sum_left[k] += w_y_ik * y_ik 332 | sq_sum_right[k] -= w_y_ik * y_ik 333 | 334 | weighted_n_left += w 335 | weighted_n_right -= w 336 | 337 | for k in range(n_outputs): 338 | mean_left[k] = sum_left[k] / weighted_n_left 339 | mean_right[k] = sum_right[k] / weighted_n_right 340 | var_left[k] = (sq_sum_left[k] / weighted_n_left - 341 | mean_left[k] * mean_left[k]) 342 | var_right[k] = (sq_sum_right[k] / weighted_n_right - 343 | mean_right[k] * mean_right[k]) 344 | 345 | self.weighted_n_left = weighted_n_left 346 | self.weighted_n_right = weighted_n_right 347 | 348 | self.pos = new_pos 349 | 350 | cdef double node_impurity(self) nogil: 351 | """Evaluate the impurity of the current node, i.e. the impurity of 352 | samples[start:end].""" 353 | cdef SIZE_t n_outputs = self.n_outputs 354 | cdef double* sq_sum_total = self.sq_sum_total 355 | cdef double* mean_total = self.mean_total 356 | cdef double weighted_n_node_samples = self.weighted_n_node_samples 357 | cdef double total = 0.0 358 | cdef SIZE_t k 359 | 360 | for k in range(n_outputs): 361 | total += (sq_sum_total[k] / weighted_n_node_samples - 362 | mean_total[k] * mean_total[k]) 363 | 364 | return total / n_outputs 365 | 366 | cdef void children_impurity(self, double* impurity_left, double* impurity_right) nogil: 367 | """Evaluate the impurity in children nodes, i.e. 
the impurity of the 368 | left child (samples[start:pos]) and the impurity the right child 369 | (samples[pos:end]).""" 370 | cdef SIZE_t n_outputs = self.n_outputs 371 | cdef double* var_left = self.var_left 372 | cdef double* var_right = self.var_right 373 | cdef double total_left = 0.0 374 | cdef double total_right = 0.0 375 | cdef SIZE_t k 376 | 377 | for k in range(n_outputs): 378 | total_left += var_left[k] 379 | total_right += var_right[k] 380 | 381 | impurity_left[0] = total_left / n_outputs 382 | impurity_right[0] = total_right / n_outputs 383 | 384 | cdef void node_value(self, double* dest) nogil: 385 | """Compute the node value of samples[start:end] into dest.""" 386 | cdef SIZE_t n_outputs = self.n_outputs 387 | cdef SIZE_t* n_classes = self.n_classes 388 | cdef SIZE_t label_count_stride = self.label_count_stride 389 | 390 | cdef DOUBLE_t* y = self.y 391 | cdef SIZE_t y_stride = self.y_stride 392 | cdef DOUBLE_t* sample_weight = self.sample_weight 393 | cdef SIZE_t* samples = self.samples 394 | cdef SIZE_t start = self.start 395 | cdef SIZE_t end = self.end 396 | 397 | cdef SIZE_t c 398 | cdef DOUBLE_t w = 1. 399 | 400 | cdef SIZE_t i = 0 401 | cdef SIZE_t p = 0 402 | cdef SIZE_t k = 0 403 | cdef SIZE_t offset = 0 404 | 405 | for k in range(n_outputs): 406 | memset(dest + offset, 0, n_classes[k] * sizeof(double)) 407 | offset += label_count_stride 408 | 409 | for p in range(start, end): 410 | i = samples[p] 411 | 412 | if sample_weight != NULL: 413 | w = sample_weight[i] 414 | 415 | for k in range(n_outputs): 416 | c = y[i * y_stride + k] 417 | dest[k * label_count_stride + c] += w 418 | 419 | 420 | 421 | # ============================================================================= 422 | # Custom splitter 423 | # ============================================================================= 424 | 425 | cdef class SplitterTransformer(Splitter): 426 | """Base splitter for working on a transformed space""" 427 | 428 | cdef Splitter splitter # Splitter used for the data 429 | 430 | cdef np.ndarray y_transformed 431 | cdef DOUBLE_t* y_transformed_data # Storage of transformed output 432 | cdef SIZE_t y_transformed_stride # Stride of transformed output 433 | 434 | 435 | def __getstate__(self): 436 | return {"splitter": self.splitter, 437 | "y_transformed": self.y_transformed} 438 | 439 | def __setstate__(self, d): 440 | self.set_output_space(d["splitter"], d["y_transformed"]) 441 | 442 | def __reduce__(self): 443 | return (SplitterTransformer, (self.criterion, 444 | self.max_features, 445 | self.min_samples_leaf, 446 | self.min_weight_leaf, 447 | self.random_state), self.__getstate__()) 448 | 449 | 450 | def set_output_space(self, 451 | Splitter splitter, 452 | np.ndarray[DOUBLE_t, ndim=2, mode="c"] y): 453 | 454 | # Set transformed output space 455 | self.y_transformed = y 456 | self.y_transformed_data = y.data 457 | self.y_transformed_stride = ( y.strides[0] / 458 | y.itemsize) 459 | 460 | # Set sub-splitter and its criterion 461 | self.splitter = splitter 462 | 463 | 464 | cdef void init(self, np.ndarray[DTYPE_t, ndim=2] X, 465 | np.ndarray[DOUBLE_t, ndim=2, mode="c"] y, 466 | DOUBLE_t* sample_weight) except *: 467 | """Initialize the splitter.""" 468 | 469 | if not self.splitter: 470 | raise ValueError('Unspecify base splitter') 471 | 472 | if self.y_transformed_data == NULL: 473 | raise ValueError("Unspectify subspace use set_output_space") 474 | 475 | # Initialize the splitter 476 | self.splitter.criterion = MSE(self.y_transformed.shape[1]) 477 | self.splitter.init(X, 
self.y_transformed, sample_weight) 478 | self.n_samples = self.splitter.n_samples 479 | 480 | 481 | # State of the splitter 482 | self.y = y.data 483 | self.y_stride = ( y.strides[0] / y.itemsize) 484 | 485 | 486 | cdef void node_reset(self, SIZE_t start, SIZE_t end, 487 | double* weighted_n_node_samples) nogil: 488 | """Reset splitter on node samples[start:end].""" 489 | # Reset the base splitter 490 | self.start = start 491 | self.end = end 492 | self.splitter.node_reset(start, end, weighted_n_node_samples) 493 | 494 | cdef void node_split(self, double impurity, 495 | SplitRecord* split, 496 | SIZE_t* n_constant_features) nogil: 497 | """Find a split on node samples[start:end].""" 498 | self.splitter.node_split(impurity, split, n_constant_features) 499 | 500 | cdef void node_value(self, double* dest) nogil: 501 | """Copy the value of node samples[start:end] into dest.""" 502 | self.criterion.init(self.y, 503 | self.y_stride, 504 | self.splitter.sample_weight, 505 | self.splitter.weighted_n_samples, 506 | self.splitter.samples, 507 | self.splitter.start, 508 | self.splitter.end) 509 | self.criterion.node_value(dest) 510 | 511 | cdef double node_impurity(self) nogil: 512 | """Impurity at the node""" 513 | return self.splitter.node_impurity() 514 | -------------------------------------------------------------------------------- /random_output_trees/_utils.py: -------------------------------------------------------------------------------- 1 | """Utilities""" 2 | 3 | # Originally from sklearn.utils.validation 4 | # Authors: Olivier Grisel 5 | # Gael Varoquaux 6 | # Andreas Mueller 7 | # Lars Buitinck 8 | # Alexandre Gramfort 9 | # Nicolas Tresegnie 10 | # License: BSD 3 clause 11 | 12 | import warnings 13 | from inspect import getargspec 14 | 15 | import numpy as np 16 | import scipy.sparse as sp 17 | 18 | 19 | class DataConversionWarning(UserWarning): 20 | "A warning on implicit data conversions happening in the code" 21 | pass 22 | 23 | warnings.simplefilter("always", DataConversionWarning) 24 | 25 | 26 | def _assert_all_finite(X): 27 | """Like assert_all_finite, but only for ndarray.""" 28 | X = np.asanyarray(X) 29 | # First try an O(n) time, O(1) space solution for the common case that 30 | # everything is finite; fall back to O(n) space np.isfinite to prevent 31 | # false positives from overflow in sum method. 32 | if (X.dtype.char in np.typecodes['AllFloat'] and not np.isfinite(X.sum()) 33 | and not np.isfinite(X).all()): 34 | raise ValueError("Input contains NaN, infinity" 35 | " or a value too large for %r." % X.dtype) 36 | 37 | def _num_samples(x): 38 | """Return number of samples in array-like x.""" 39 | if not hasattr(x, '__len__') and not hasattr(x, 'shape'): 40 | if hasattr(x, '__array__'): 41 | x = np.asarray(x) 42 | else: 43 | raise TypeError("Expected sequence or array-like, got %r" % x) 44 | return x.shape[0] if hasattr(x, 'shape') else len(x) 45 | 46 | 47 | def check_consistent_length(*arrays): 48 | """Check that all arrays have consistent first dimensions. 49 | 50 | Checks whether all objects in arrays have the same shape or length. 51 | 52 | Parameters 53 | ---------- 54 | arrays : list or tuple of input objects. 55 | Objects that will be checked for consistent length. 
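    Examples
    --------
    A minimal sketch of the intended behaviour (``None`` entries are ignored;
    the exact error text is illustrative):

    >>> import numpy as np
    >>> check_consistent_length(np.zeros(3), [1, 2, 3], None)
    >>> check_consistent_length(np.zeros(3), [1, 2])
    Traceback (most recent call last):
        ...
    ValueError: Found arrays with inconsistent numbers of samples: [2 3]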
56 | """ 57 | 58 | uniques = np.unique([_num_samples(X) for X in arrays if X is not None]) 59 | if len(uniques) > 1: 60 | raise ValueError("Found arrays with inconsistent numbers of samples: %s" 61 | % str(uniques)) 62 | 63 | 64 | def _ensure_sparse_format(spmatrix, accept_sparse, dtype, order, copy, 65 | force_all_finite): 66 | """Convert a sparse matrix to a given format. 67 | 68 | Checks the sparse format of spmatrix and converts if necessary. 69 | 70 | Parameters 71 | ---------- 72 | spmatrix : scipy sparse matrix 73 | Input to validate and convert. 74 | 75 | accept_sparse : string, list of string or None (default=None) 76 | String[s] representing allowed sparse matrix formats ('csc', 77 | 'csr', 'coo', 'dok', 'bsr', 'lil', 'dia'). None means that sparse 78 | matrix input will raise an error. If the input is sparse but not in 79 | the allowed format, it will be converted to the first listed format. 80 | 81 | dtype : string, type or None (default=none) 82 | Data type of result. If None, the dtype of the input is preserved. 83 | 84 | order : 'F', 'C' or None (default=None) 85 | Whether an array will be forced to be fortran or c-style. 86 | 87 | copy : boolean (default=False) 88 | Whether a forced copy will be triggered. If copy=False, a copy might 89 | be triggered by a conversion. 90 | 91 | force_all_finite : boolean (default=True) 92 | Whether to raise an error on np.inf and np.nan in X. 93 | 94 | Returns 95 | ------- 96 | spmatrix_converted : scipy sparse matrix. 97 | Matrix that is ensured to have an allowed type. 98 | """ 99 | if accept_sparse is None: 100 | raise TypeError('A sparse matrix was passed, but dense ' 101 | 'data is required. Use X.toarray() to ' 102 | 'convert to a dense numpy array.') 103 | sparse_type = spmatrix.format 104 | if dtype is None: 105 | dtype = spmatrix.dtype 106 | if sparse_type in accept_sparse: 107 | # correct type 108 | if dtype == spmatrix.dtype: 109 | # correct dtype 110 | if copy: 111 | spmatrix = spmatrix.copy() 112 | else: 113 | # convert dtype 114 | spmatrix = spmatrix.astype(dtype) 115 | else: 116 | # create new 117 | spmatrix = spmatrix.asformat(accept_sparse[0]).astype(dtype) 118 | if force_all_finite: 119 | if not hasattr(spmatrix, "data"): 120 | warnings.warn("Can't check %s sparse matrix for nan or inf." 121 | % spmatrix.format) 122 | else: 123 | _assert_all_finite(spmatrix.data) 124 | if hasattr(spmatrix, "data"): 125 | spmatrix.data = np.array(spmatrix.data, copy=False, order=order) 126 | return spmatrix 127 | 128 | 129 | def check_array(array, accept_sparse=None, dtype=None, order=None, copy=False, 130 | force_all_finite=True, ensure_2d=True, allow_nd=False): 131 | """Input validation on an array, list, sparse matrix or similar. 132 | 133 | By default, the input is converted to an at least 2nd numpy array. 134 | 135 | Parameters 136 | ---------- 137 | array : object 138 | Input object to check / convert. 139 | 140 | accept_sparse : string, list of string or None (default=None) 141 | String[s] representing allowed sparse matrix formats, such as 'csc', 142 | 'csr', etc. None means that sparse matrix input will raise an error. 143 | If the input is sparse but not in the allowed format, it will be 144 | converted to the first listed format. 145 | 146 | dtype : string, type or None (default=none) 147 | Data type of result. If None, the dtype of the input is preserved. 148 | 149 | order : 'F', 'C' or None (default=None) 150 | Whether an array will be forced to be fortran or c-style. 
151 | 152 | copy : boolean (default=False) 153 | Whether a forced copy will be triggered. If copy=False, a copy might 154 | be triggered by a conversion. 155 | 156 | force_all_finite : boolean (default=True) 157 | Whether to raise an error on np.inf and np.nan in X. 158 | 159 | ensure_2d : boolean (default=True) 160 | Whether to make X at least 2d. 161 | 162 | allow_nd : boolean (default=False) 163 | Whether to allow X.ndim > 2. 164 | 165 | Returns 166 | ------- 167 | X_converted : object 168 | The converted and validated X. 169 | """ 170 | if isinstance(accept_sparse, str): 171 | accept_sparse = [accept_sparse] 172 | 173 | if sp.issparse(array): 174 | array = _ensure_sparse_format(array, accept_sparse, dtype, order, 175 | copy, force_all_finite) 176 | else: 177 | if ensure_2d: 178 | array = np.atleast_2d(array) 179 | array = np.array(array, dtype=dtype, order=order, copy=copy) 180 | if not allow_nd and array.ndim >= 3: 181 | raise ValueError("Found array with dim %d. Expected <= 2" % 182 | array.ndim) 183 | if force_all_finite: 184 | _assert_all_finite(array) 185 | 186 | return array 187 | 188 | 189 | def check_X_y(X, y, accept_sparse=None, dtype=None, order=None, copy=False, 190 | force_all_finite=True, ensure_2d=True, allow_nd=False, 191 | multi_output=False): 192 | """Input validation for standard estimators. 193 | 194 | Checks X and y for consistent length, enforces X 2d and y 1d. 195 | Standard input checks are only applied to y. For multi-label y, 196 | set multi_ouput=True to allow 2d and sparse y. 197 | 198 | Parameters 199 | ---------- 200 | X : nd-array, list or sparse matrix 201 | Input data. 202 | 203 | y : nd-array, list or sparse matrix 204 | Labels. 205 | 206 | accept_sparse : string, list of string or None (default=None) 207 | String[s] representing allowed sparse matrix formats, such as 'csc', 208 | 'csr', etc. None means that sparse matrix input will raise an error. 209 | If the input is sparse but not in the allowed format, it will be 210 | converted to the first listed format. 211 | 212 | dtype : string, type or None (default=none) 213 | Data type of result. If None, the dtype of the input is preserved. 214 | 215 | order : 'F', 'C' or None (default=None) 216 | Whether an array will be forced to be fortran or c-style. 217 | 218 | copy : boolean (default=False) 219 | Whether a forced copy will be triggered. If copy=False, a copy might 220 | be triggered by a conversion. 221 | 222 | force_all_finite : boolean (default=True) 223 | Whether to raise an error on np.inf and np.nan in X. 224 | 225 | ensure_2d : boolean (default=True) 226 | Whether to make X at least 2d. 227 | 228 | allow_nd : boolean (default=False) 229 | Whether to allow X.ndim > 2. 230 | 231 | multi_output : boolean (default=False) 232 | Whether to allow 2-d y (array or sparse matrix). If false, y will be 233 | validated as a vector. 234 | 235 | Returns 236 | ------- 237 | X_converted : object 238 | The converted and validated X. 
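    y_converted : object
        The converted and validated y.

    Examples
    --------
    A minimal sketch of the intended behaviour (shapes are illustrative):

    >>> import numpy as np
    >>> X = [[0, 1], [2, 3], [4, 5]]
    >>> y = [0, 1, 0]
    >>> X_checked, y_checked = check_X_y(X, y)
    >>> X_checked.shape, y_checked.shape
    ((3, 2), (3,))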
239 | """ 240 | X = check_array(X, accept_sparse, dtype, order, copy, force_all_finite, 241 | ensure_2d, allow_nd) 242 | if multi_output: 243 | y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False) 244 | else: 245 | y = column_or_1d(y, warn=True) 246 | _assert_all_finite(y) 247 | 248 | check_consistent_length(X, y) 249 | 250 | return X, y 251 | 252 | 253 | def column_or_1d(y, warn=False): 254 | """ Ravel column or 1d numpy array, else raises an error 255 | 256 | Parameters 257 | ---------- 258 | y : array-like 259 | 260 | Returns 261 | ------- 262 | y : array 263 | 264 | """ 265 | shape = np.shape(y) 266 | if len(shape) == 1: 267 | return np.ravel(y) 268 | if len(shape) == 2 and shape[1] == 1: 269 | if warn: 270 | warnings.warn("A column-vector y was passed when a 1d array was" 271 | " expected. Please change the shape of y to " 272 | "(n_samples, ), for example using ravel().", 273 | DataConversionWarning, stacklevel=2) 274 | return np.ravel(y) 275 | 276 | raise ValueError("bad input shape {0}".format(shape)) 277 | 278 | 279 | def has_fit_parameter(estimator, parameter): 280 | """ Checks whether the estimator's fit method supports the given parameter. 281 | Example 282 | ------- 283 | >>> from sklearn.svm import SVC 284 | >>> has_fit_parameter(SVC(), "sample_weight") 285 | True 286 | """ 287 | return parameter in getargspec(estimator.fit)[0] 288 | 289 | 290 | def skipped(func): 291 | from nose.plugins.skip import SkipTest 292 | 293 | def _func(): 294 | raise SkipTest("Test %s is skipped" % func.__name__) 295 | _func.__name__ = func.__name__ 296 | return _func 297 | -------------------------------------------------------------------------------- /random_output_trees/datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Module for datasets loading and fetchers. 4 | 5 | """ 6 | 7 | from __future__ import division, print_function, absolute_import 8 | 9 | import os 10 | from functools import partial 11 | 12 | import shutil 13 | import tarfile 14 | 15 | try: 16 | # Python 2 17 | from urllib2 import HTTPError 18 | from urllib2 import quote 19 | from urllib2 import urlopen 20 | except ImportError: 21 | # Python 3+ 22 | from urllib.error import HTTPError 23 | from urllib.parse import quote 24 | from urllib.request import urlopen 25 | 26 | 27 | import numpy as np 28 | 29 | from sklearn.datasets import get_data_home 30 | from sklearn.datasets.base import Bunch 31 | 32 | 33 | __all__ = [ 34 | "fetch_drug_interaction", 35 | "fetch_protein_interaction", 36 | ] 37 | 38 | 39 | def _fetch_drug_protein(data_home=None): 40 | """Fetch drug-protein dataset from the server""" 41 | 42 | base_url = "http://cbio.ensmp.fr/~yyamanishi/substr-domain/" 43 | 44 | # check if this data set has been already downloaded 45 | data_home = get_data_home(data_home) 46 | data_home = os.path.join(data_home, 'drug-protein') 47 | if not os.path.exists(data_home): 48 | os.makedirs(data_home) 49 | 50 | for base_name in ["drug_repmat.txt", "target_repmat.txt", 51 | "inter_admat.txt"]: 52 | filename = os.path.join(data_home, base_name) 53 | 54 | if not os.path.exists(filename): 55 | urlname = base_url + base_name 56 | 57 | print("Download data at {}".format(urlname)) 58 | 59 | try: 60 | url = urlopen(urlname) 61 | except HTTPError as e: 62 | if e.code == 404: 63 | e.msg = "Dataset drug-protein '%s' not found." 
% base_name 64 | raise 65 | 66 | try: 67 | with open(filename, 'w+b') as fhandle: 68 | shutil.copyfileobj(url, fhandle) 69 | except: 70 | os.remove(filename) 71 | raise 72 | 73 | url.close() 74 | 75 | return data_home 76 | 77 | 78 | def fetch_drug_interaction(data_home=None): 79 | """Fetch the drug-interaction dataset 80 | 81 | Constant features were removed. 82 | 83 | =========================== =================================== 84 | Domain drug-protein interaction network 85 | Features Biological (see [1]) 86 | output interaction network 87 | Drug matrix (sample, features) = (1862, 660) 88 | Newtork interaction matrix (samples, labels) = (1862, 1554) 89 | =========================== =================================== 90 | 91 | 92 | Parameters 93 | ---------- 94 | data_home: optional, default: None 95 | Specify another download and cache folder for the data sets. By default 96 | all scikit learn data is stored in '~/scikit_learn_data' subfolders. 97 | 98 | Returns 99 | ------- 100 | data : Bunch 101 | Dictionary-like object, the interesting attributes are: 102 | 'data', the data to learn, 'target', the classification labels, 103 | 'target_names', the original names of the target columns and 104 | 'feature_names', the original names of the dataset columns. 105 | 106 | References 107 | ---------- 108 | .. [1] Yamanishi, Y., Pauwels, E., Saigo, H., & Stoven, V. (2011). 109 | Extracting sets of chemical substructures and protein domains 110 | governing drug-target interactions. Journal of chemical information 111 | and modeling, 51(5), 1183-1194. 112 | 113 | """ 114 | data_home = _fetch_drug_protein(data_home=data_home) 115 | 116 | drug_fname = os.path.join(data_home, "drug_repmat.txt") 117 | data = np.loadtxt(drug_fname, dtype=float, skiprows=1) 118 | data = data[:, 1:] # skip id column 119 | mask_constant = np.var(data, axis=0) != 0. 120 | data = data[:, mask_constant] # remove constant columns 121 | 122 | with open(drug_fname, 'r') as fhandle: 123 | feature_names = fhandle.readline().split("\t") 124 | feature_names = np.array(feature_names)[mask_constant].tolist() 125 | 126 | interaction_fname = os.path.join(data_home, "inter_admat.txt") 127 | target = np.loadtxt(interaction_fname, dtype=float, skiprows=1) 128 | target = target[:, 1:] # skip id column 129 | with open(interaction_fname, 'r') as fhandle: 130 | target_names = fhandle.readline().split("\t") 131 | 132 | return Bunch(data=data, target=target, feature_names=feature_names, 133 | target_names=target_names) 134 | 135 | 136 | def fetch_protein_interaction(data_home=None): 137 | """Fetch the protein-interaction dataset 138 | 139 | Constant features were removed 140 | 141 | =========================== =================================== 142 | Domain drug-protein interaction network 143 | Features Biological (see [1]) 144 | output interaction network 145 | Drug matrix (sample, features) = (1554, 876) 146 | Newtork interaction matrix (samples, labels) = (1554, 1862) 147 | =========================== =================================== 148 | 149 | Parameters 150 | ---------- 151 | data_home: optional, default: None 152 | Specify another download and cache folder for the data sets. By default 153 | all scikit learn data is stored in '~/scikit_learn_data' subfolders. 154 | 155 | Returns 156 | ------- 157 | data : Bunch 158 | Dictionary-like object, the interesting attributes are: 159 | 'data', the data to learn, 'target', the classification labels and 160 | 'feature_names', the original names of the dataset columns. 
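    Notes
    -----
    The underlying text files are downloaded from the original server on first
    use and cached under ``data_home`` (by default the scikit-learn data home),
    so subsequent calls can work offline.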
161 | 162 | References 163 | ---------- 164 | .. [1] Yamanishi, Y., Pauwels, E., Saigo, H., & Stoven, V. (2011). 165 | Extracting sets of chemical substructures and protein domains 166 | governing drug-target interactions. Journal of chemical information 167 | and modeling, 51(5), 1183-1194. 168 | 169 | """ 170 | data_home = _fetch_drug_protein(data_home=data_home) 171 | 172 | protein_fname = os.path.join(data_home, "target_repmat.txt") 173 | data = np.loadtxt(protein_fname, dtype=float, skiprows=1, 174 | usecols=range(1, 877)) # skip id column 175 | 176 | mask_constant = np.var(data, axis=0) != 0. 177 | data = data[:, mask_constant] # remove constant columns 178 | 179 | with open(protein_fname, 'r') as fhandle: 180 | feature_names = fhandle.readline().split("\t") 181 | feature_names = np.array(feature_names)[mask_constant].tolist() 182 | 183 | interaction_fname = os.path.join(data_home, "inter_admat.txt") 184 | target = np.loadtxt(interaction_fname, dtype=float, skiprows=1) 185 | target = target[:, 1:] # skip id column 186 | target = target.T 187 | 188 | return Bunch(data=data, target=target, feature_names=feature_names) 189 | -------------------------------------------------------------------------------- /random_output_trees/ensemble/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This module provides ensemble estimators which work transformed output-space. 3 | 4 | ''' 5 | 6 | from .forest import RandomForestClassifier 7 | from .forest import RandomForestRegressor 8 | from .forest import ExtraTreesClassifier 9 | from .forest import ExtraTreesRegressor 10 | from .lazy_bagging import LazyBaggingClassifier 11 | from .lazy_bagging import LazyBaggingRegressor 12 | 13 | __all__ = [ 14 | "RandomForestClassifier", 15 | "RandomForestRegressor", 16 | "ExtraTreesClassifier", 17 | "ExtraTreesRegressor", 18 | ] 19 | -------------------------------------------------------------------------------- /random_output_trees/ensemble/tests/test_forest.py: -------------------------------------------------------------------------------- 1 | """ 2 | Testing for the forest module (sklearn.ensemble.forest). 
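The package-specific tests below exercise the ``output_transformer`` parameter
of the forest estimators.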
3 | """ 4 | 5 | # Most tests comes from scikit-learn and ensure that everything is working 6 | # as expected 7 | 8 | # Authors: Gilles Louppe, 9 | # Brian Holt, 10 | # Andreas Mueller, 11 | # Arnaud Joly 12 | # License: BSD 3 clause 13 | 14 | 15 | 16 | from sklearn.utils.testing import assert_almost_equal 17 | from sklearn.utils.testing import assert_equal 18 | 19 | from sklearn import datasets 20 | from sklearn.utils.validation import check_random_state 21 | 22 | from sklearn.cross_validation import train_test_split 23 | from sklearn.random_projection import GaussianRandomProjection 24 | from sklearn.base import BaseEstimator, TransformerMixin 25 | 26 | from random_output_trees.transformer import FixedStateTransformer 27 | from random_output_trees.ensemble import ExtraTreesClassifier 28 | from random_output_trees.ensemble import ExtraTreesRegressor 29 | from random_output_trees.ensemble import RandomForestClassifier 30 | from random_output_trees.ensemble import RandomForestRegressor 31 | 32 | 33 | # toy sample 34 | X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] 35 | y = [-1, -1, -1, 1, 1, 1] 36 | T = [[-1, -1], [2, 2], [3, 2]] 37 | true_result = [-1, 1, 1] 38 | 39 | # also load the iris dataset 40 | # and randomly permute it 41 | iris = datasets.load_iris() 42 | rng = check_random_state(0) 43 | perm = rng.permutation(iris.target.size) 44 | iris.data = iris.data[perm] 45 | iris.target = iris.target[perm] 46 | 47 | # also load the boston dataset 48 | # and randomly permute it 49 | boston = datasets.load_boston() 50 | perm = rng.permutation(boston.target.size) 51 | boston.data = boston.data[perm] 52 | boston.target = boston.target[perm] 53 | 54 | FOREST_CLASSIFIERS = { 55 | "ExtraTreesClassifier": ExtraTreesClassifier, 56 | "RandomForestClassifier": RandomForestClassifier, 57 | } 58 | 59 | FOREST_REGRESSORS = { 60 | "ExtraTreesRegressor": ExtraTreesRegressor, 61 | "RandomForestRegressor": RandomForestRegressor, 62 | } 63 | 64 | FOREST_TRANSFORMERS = {} 65 | 66 | FOREST_ESTIMATORS = dict() 67 | FOREST_ESTIMATORS.update(FOREST_CLASSIFIERS) 68 | FOREST_ESTIMATORS.update(FOREST_REGRESSORS) 69 | FOREST_ESTIMATORS.update(FOREST_TRANSFORMERS) 70 | 71 | 72 | class IdentityProjections(BaseEstimator, TransformerMixin): 73 | """ Project the input data on the identity matrix (noop operation)""" 74 | def __init__(self): 75 | pass 76 | 77 | def fit(self, X, y=None): 78 | return self 79 | 80 | def transform(selft, X): 81 | return X 82 | 83 | 84 | def test_output_transformer(): 85 | X, y = datasets.make_multilabel_classification(return_indicator=True) 86 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) 87 | 88 | # Check that random_state are different 89 | transformer = GaussianRandomProjection(n_components=5, random_state=None) 90 | for name, ForestEstimator in FOREST_ESTIMATORS.items(): 91 | est = ForestEstimator(random_state=5, output_transformer=transformer) 92 | est.fit(X_train, y_train) 93 | y_pred = est.predict(X_test) 94 | assert_equal(y_pred.shape, y_test.shape) 95 | 96 | random_state = [sub.output_transformer_.random_state 97 | for sub in est.estimators_] 98 | 99 | assert_equal(len(set(random_state)), est.n_estimators) 100 | 101 | 102 | # Check that random_state are equals 103 | transformer = FixedStateTransformer(GaussianRandomProjection( 104 | n_components=5), random_seed=0) 105 | for name, ForestEstimator in FOREST_ESTIMATORS.items(): 106 | est = ForestEstimator(random_state=5, output_transformer=transformer) 107 | est.fit(X_train, y_train) 108 | y_pred = 
est.predict(X_test) 109 | assert_equal(y_pred.shape, y_test.shape) 110 | 111 | 112 | random_state = [sub.output_transformer_.random_state 113 | for sub in est.estimators_] 114 | 115 | assert_equal(len(set(random_state)), 1) 116 | assert_equal(random_state[0], 0) 117 | 118 | 119 | def test_identity_output_transformer(): 120 | X, y = datasets.make_multilabel_classification(return_indicator=True, 121 | random_state=0) 122 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) 123 | 124 | for name, ForestEstimator in FOREST_ESTIMATORS.items(): 125 | est = ForestEstimator(random_state=0, max_features=None, max_depth=4) 126 | est.fit(X_train, y_train) 127 | y_pred_origin = est.predict(X_test) 128 | 129 | 130 | est_transf = est.set_params(output_transformer=IdentityProjections()) 131 | est_transf.fit(X_train, y_train) 132 | y_pred_transformed = est_transf.predict(X_test) 133 | assert_almost_equal(y_pred_origin, y_pred_transformed) 134 | 135 | 136 | if __name__ == "__main__": 137 | import nose 138 | nose.runmodule() 139 | -------------------------------------------------------------------------------- /random_output_trees/ensemble/tests/test_lazy_bagging.py: -------------------------------------------------------------------------------- 1 | """ 2 | Testing for the lazy bagging ensemble module 3 | 4 | Many tests are adapted from sklearn/ensemble/tests/test_bagging.py 5 | """ 6 | 7 | # Author: Gilles Louppe 8 | # License: BSD 3 clause 9 | 10 | # Author: Arnaud Joly 11 | # License: BSD 3 clause 12 | import numpy as np 13 | 14 | from sklearn.utils.testing import assert_array_equal 15 | from sklearn.utils.testing import assert_array_almost_equal 16 | from sklearn.utils.testing import assert_equal 17 | from sklearn.utils.testing import assert_raises 18 | from sklearn.utils.testing import assert_greater 19 | from sklearn.utils.testing import assert_less 20 | from sklearn.utils.testing import assert_true 21 | from sklearn.utils.testing import assert_warns 22 | from sklearn.utils.testing import assert_almost_equal 23 | 24 | from sklearn.base import clone 25 | from sklearn.dummy import DummyClassifier, DummyRegressor 26 | from sklearn.grid_search import GridSearchCV, ParameterGrid 27 | from sklearn.linear_model import Perceptron 28 | from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor 29 | from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor 30 | from sklearn.svm import SVC, SVR 31 | from sklearn.cross_validation import train_test_split 32 | from sklearn.datasets import load_boston 33 | from sklearn.datasets import load_iris 34 | from sklearn.datasets import make_multilabel_classification 35 | from sklearn.utils import check_random_state 36 | 37 | from scipy.sparse import csc_matrix, csr_matrix 38 | 39 | from random_output_trees.ensemble import LazyBaggingClassifier 40 | from random_output_trees.ensemble import LazyBaggingRegressor 41 | 42 | 43 | rng = check_random_state(0) 44 | 45 | # also load the iris dataset 46 | # and randomly permute it 47 | iris = load_iris() 48 | perm = rng.permutation(iris.target.size) 49 | iris.data = iris.data[perm] 50 | iris.target = iris.target[perm] 51 | 52 | # also load the boston dataset 53 | # and randomly permute it 54 | boston = load_boston() 55 | perm = rng.permutation(boston.target.size) 56 | boston.data = boston.data[perm] 57 | boston.target = boston.target[perm] 58 | 59 | 60 | def test_classification(): 61 | """Check classification for various parameter settings.""" 62 | rng = check_random_state(0) 63 | 
X_train, X_test, y_train, y_test = train_test_split(iris.data, 64 | iris.target, 65 | random_state=rng) 66 | grid = ParameterGrid({"max_samples": [0.5, 1.0], 67 | "max_features": [1, 2, 4], 68 | "bootstrap": [True, False], 69 | "bootstrap_features": [True, False]}) 70 | 71 | for base_estimator in [None, 72 | DummyClassifier(), 73 | Perceptron(), 74 | DecisionTreeClassifier(), 75 | KNeighborsClassifier(), 76 | SVC()]: 77 | for params in grid: 78 | LazyBaggingClassifier(base_estimator=base_estimator, 79 | random_state=rng, 80 | **params).fit(X_train, y_train).predict(X_test) 81 | 82 | 83 | def test_sparse_classification(): 84 | """Check classification for various parameter settings on sparse input.""" 85 | 86 | class CustomSVC(SVC): 87 | """SVC variant that records the nature of the training set""" 88 | 89 | def fit(self, X, y): 90 | super(CustomSVC, self).fit(X, y) 91 | self.data_type_ = type(X) 92 | return self 93 | 94 | rng = check_random_state(0) 95 | X_train, X_test, y_train, y_test = train_test_split(iris.data, 96 | iris.target, 97 | random_state=rng) 98 | parameter_sets = [ 99 | {"max_samples": 0.5, 100 | "max_features": 2, 101 | "bootstrap": True, 102 | "bootstrap_features": True}, 103 | {"max_samples": 1.0, 104 | "max_features": 4, 105 | "bootstrap": True, 106 | "bootstrap_features": True}, 107 | {"max_features": 2, 108 | "bootstrap": False, 109 | "bootstrap_features": True}, 110 | {"max_samples": 0.5, 111 | "bootstrap": True, 112 | "bootstrap_features": False}, 113 | ] 114 | 115 | for sparse_format in [csc_matrix, csr_matrix]: 116 | X_train_sparse = sparse_format(X_train) 117 | X_test_sparse = sparse_format(X_test) 118 | for params in parameter_sets: 119 | 120 | # Trained on sparse format 121 | sparse_classifier = LazyBaggingClassifier( 122 | base_estimator=CustomSVC(), 123 | random_state=1, 124 | **params 125 | ).fit(X_train_sparse, y_train) 126 | sparse_results = sparse_classifier.predict(X_test_sparse) 127 | 128 | # Trained on dense format 129 | dense_results = LazyBaggingClassifier( 130 | base_estimator=CustomSVC(), 131 | random_state=1, 132 | **params 133 | ).fit(X_train, y_train).predict(X_test) 134 | 135 | sparse_type = type(X_train_sparse) 136 | types = [i.data_type_ for i in sparse_classifier.estimators_] 137 | 138 | assert_array_equal(sparse_results, dense_results) 139 | assert all([t == sparse_type for t in types]) 140 | 141 | 142 | def test_regression(): 143 | """Check regression for various parameter settings.""" 144 | rng = check_random_state(0) 145 | X_train, X_test, y_train, y_test = train_test_split(boston.data[:50], 146 | boston.target[:50], 147 | random_state=rng) 148 | grid = ParameterGrid({"max_samples": [0.5, 1.0], 149 | "max_features": [0.5, 1.0], 150 | "bootstrap": [True, False], 151 | "bootstrap_features": [True, False]}) 152 | 153 | for base_estimator in [None, 154 | DummyRegressor(), 155 | DecisionTreeRegressor(), 156 | KNeighborsRegressor(), 157 | SVR()]: 158 | for params in grid: 159 | LazyBaggingRegressor(base_estimator=base_estimator, 160 | random_state=rng, 161 | **params).fit(X_train, 162 | y_train).predict(X_test) 163 | 164 | 165 | def test_sparse_regression(): 166 | """Check regression for various parameter settings on sparse input.""" 167 | rng = check_random_state(0) 168 | X_train, X_test, y_train, y_test = train_test_split(boston.data[:50], 169 | boston.target[:50], 170 | random_state=rng) 171 | 172 | class CustomSVR(SVR): 173 | """SVC variant that records the nature of the training set""" 174 | 175 | def fit(self, X, y): 176 | super(CustomSVR, 
self).fit(X, y) 177 | self.data_type_ = type(X) 178 | return self 179 | 180 | parameter_sets = [ 181 | {"max_samples": 0.5, 182 | "max_features": 2, 183 | "bootstrap": True, 184 | "bootstrap_features": True}, 185 | {"max_samples": 1.0, 186 | "max_features": 4, 187 | "bootstrap": True, 188 | "bootstrap_features": True}, 189 | {"max_features": 2, 190 | "bootstrap": False, 191 | "bootstrap_features": True}, 192 | {"max_samples": 0.5, 193 | "bootstrap": True, 194 | "bootstrap_features": False}, 195 | ] 196 | 197 | for sparse_format in [csc_matrix, csr_matrix]: 198 | X_train_sparse = sparse_format(X_train) 199 | X_test_sparse = sparse_format(X_test) 200 | for params in parameter_sets: 201 | 202 | # Trained on sparse format 203 | sparse_classifier = LazyBaggingRegressor( 204 | base_estimator=CustomSVR(), 205 | random_state=1, 206 | **params 207 | ).fit(X_train_sparse, y_train) 208 | sparse_results = sparse_classifier.predict(X_test_sparse) 209 | 210 | # Trained on dense format 211 | dense_results = LazyBaggingRegressor( 212 | base_estimator=CustomSVR(), 213 | random_state=1, 214 | **params 215 | ).fit(X_train, y_train).predict(X_test) 216 | 217 | sparse_type = type(X_train_sparse) 218 | types = [i.data_type_ for i in sparse_classifier.estimators_] 219 | 220 | assert_array_equal(sparse_results, dense_results) 221 | assert all([t == sparse_type for t in types]) 222 | assert_array_equal(sparse_results, dense_results) 223 | 224 | 225 | def test_bootstrap_samples(): 226 | """Test that bootstraping samples generate non-perfect base estimators.""" 227 | rng = check_random_state(0) 228 | X_train, X_test, y_train, y_test = train_test_split(boston.data, 229 | boston.target, 230 | random_state=rng) 231 | 232 | base_estimator = DecisionTreeRegressor().fit(X_train, y_train) 233 | 234 | # without bootstrap, all trees are perfect on the training set 235 | ensemble = LazyBaggingRegressor(base_estimator=DecisionTreeRegressor(), 236 | max_samples=1.0, 237 | bootstrap=False, 238 | random_state=rng).fit(X_train, y_train) 239 | 240 | assert_equal(base_estimator.score(X_train, y_train), 241 | ensemble.score(X_train, y_train)) 242 | 243 | # with bootstrap, trees are no longer perfect on the training set 244 | ensemble = LazyBaggingRegressor(base_estimator=DecisionTreeRegressor(), 245 | max_samples=1.0, 246 | bootstrap=True, 247 | random_state=rng).fit(X_train, y_train) 248 | 249 | assert_greater(base_estimator.score(X_train, y_train), 250 | ensemble.score(X_train, y_train)) 251 | 252 | # NB: we don't save features for memory reasons 253 | # def test_bootstrap_features(): 254 | # """Test that bootstraping features may generate dupplicate features.""" 255 | # rng = check_random_state(0) 256 | # X_train, X_test, y_train, y_test = train_test_split(boston.data, 257 | # boston.target, 258 | # random_state=rng) 259 | 260 | # ensemble = LazyBaggingRegressor(base_estimator=DecisionTreeRegressor(), 261 | # max_features=1.0, 262 | # bootstrap_features=False, 263 | # random_state=rng).fit(X_train, y_train) 264 | 265 | # for features in ensemble.estimators_features_: 266 | # assert_equal(boston.data.shape[1], np.unique(features).shape[0]) 267 | 268 | # ensemble = LazyBaggingRegressor(base_estimator=DecisionTreeRegressor(), 269 | # max_features=1.0, 270 | # bootstrap_features=True, 271 | # random_state=rng).fit(X_train, y_train) 272 | 273 | # for features in ensemble.estimators_features_: 274 | # assert_greater(boston.data.shape[1], np.unique(features).shape[0]) 275 | 276 | 277 | def test_probability(): 278 | """Predict 
probabilities.""" 279 | rng = check_random_state(0) 280 | X_train, X_test, y_train, y_test = train_test_split(iris.data, 281 | iris.target, 282 | random_state=rng) 283 | 284 | with np.errstate(divide="ignore", invalid="ignore"): 285 | # Normal case 286 | print("start") 287 | ensemble = LazyBaggingClassifier(random_state=rng).fit(X_train, y_train) 288 | 289 | assert_array_almost_equal(np.sum(ensemble.predict_proba(X_test), 290 | axis=1), 291 | np.ones(len(X_test))) 292 | 293 | assert_array_almost_equal(ensemble.predict_proba(X_test), 294 | np.exp(ensemble.predict_log_proba(X_test))) 295 | print("stop") 296 | 297 | # Degenerate case, where some classes are missing 298 | ensemble = LazyBaggingClassifier(base_estimator=DecisionTreeClassifier(), 299 | random_state=rng, 300 | max_samples=5).fit(X_train, y_train) 301 | 302 | assert_array_almost_equal(np.sum(ensemble.predict_proba(X_test), 303 | axis=1), 304 | np.ones(len(X_test))) 305 | 306 | assert_array_almost_equal(ensemble.predict_proba(X_test), 307 | np.exp(ensemble.predict_log_proba(X_test))) 308 | 309 | 310 | def test_single_estimator(): 311 | """Check singleton ensembles.""" 312 | rng = check_random_state(0) 313 | X_train, X_test, y_train, y_test = train_test_split(boston.data, 314 | boston.target, 315 | random_state=rng) 316 | 317 | clf1 = LazyBaggingRegressor(base_estimator=KNeighborsRegressor(), 318 | n_estimators=1, 319 | bootstrap=False, 320 | bootstrap_features=False, 321 | random_state=rng).fit(X_train, y_train) 322 | 323 | clf2 = KNeighborsRegressor().fit(X_train, y_train) 324 | 325 | assert_array_equal(clf1.predict(X_test), clf2.predict(X_test)) 326 | 327 | 328 | def test_error(): 329 | """Test that it gives proper exception on deficient input.""" 330 | X, y = iris.data, iris.target 331 | base = DecisionTreeClassifier() 332 | 333 | # Test max_samples 334 | assert_raises(ValueError, 335 | LazyBaggingClassifier(base, max_samples=-1).fit, X, y) 336 | assert_raises(ValueError, 337 | LazyBaggingClassifier(base, max_samples=0.0).fit, X, y) 338 | assert_raises(ValueError, 339 | LazyBaggingClassifier(base, max_samples=2.0).fit, X, y) 340 | assert_raises(ValueError, 341 | LazyBaggingClassifier(base, max_samples=1000).fit, X, y) 342 | assert_raises(ValueError, 343 | LazyBaggingClassifier(base, max_samples="foobar").fit, X, y) 344 | 345 | # Test max_features 346 | assert_raises(ValueError, 347 | LazyBaggingClassifier(base, max_features=-1).fit, X, y) 348 | assert_raises(ValueError, 349 | LazyBaggingClassifier(base, max_features=0.0).fit, X, y) 350 | assert_raises(ValueError, 351 | LazyBaggingClassifier(base, max_features=2.0).fit, X, y) 352 | assert_raises(ValueError, 353 | LazyBaggingClassifier(base, max_features=5).fit, X, y) 354 | assert_raises(ValueError, 355 | LazyBaggingClassifier(base, max_features="foobar").fit, X, y) 356 | 357 | # Test support of decision_function 358 | assert_raises(NotImplementedError, 359 | LazyBaggingClassifier(base).fit(X, y).decision_function, X) 360 | 361 | 362 | def test_gridsearch(): 363 | """Check that bagging ensembles can be grid-searched.""" 364 | # Transform iris into a binary classification task 365 | X, y = iris.data, iris.target 366 | y[y == 2] = 1 367 | 368 | # Grid search with scoring based on decision_function 369 | parameters = {'n_estimators': (1, 2), 370 | 'base_estimator__C': (1, 2)} 371 | 372 | GridSearchCV(LazyBaggingClassifier(SVC()), 373 | parameters, 374 | scoring="roc_auc").fit(X, y) 375 | 376 | 377 | def test_base_estimator(): 378 | """Check base_estimator and its default values.""" 
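    # When base_estimator is None, LazyBaggingClassifier is expected to fall
    # back to a DecisionTreeClassifier and LazyBaggingRegressor to a
    # DecisionTreeRegressor; any explicitly passed estimator should be exposed
    # unchanged through the fitted base_estimator_ attribute.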
379 | rng = check_random_state(0) 380 | 381 | # Classification 382 | X_train, X_test, y_train, y_test = train_test_split(iris.data, 383 | iris.target, 384 | random_state=rng) 385 | 386 | ensemble = LazyBaggingClassifier(None, 387 | random_state=0).fit(X_train, y_train) 388 | 389 | assert_true(isinstance(ensemble.base_estimator_, DecisionTreeClassifier)) 390 | 391 | ensemble = LazyBaggingClassifier(DecisionTreeClassifier(), 392 | random_state=0).fit(X_train, y_train) 393 | 394 | assert_true(isinstance(ensemble.base_estimator_, DecisionTreeClassifier)) 395 | 396 | ensemble = LazyBaggingClassifier(Perceptron(), 397 | random_state=0).fit(X_train, y_train) 398 | 399 | assert_true(isinstance(ensemble.base_estimator_, Perceptron)) 400 | 401 | # Regression 402 | X_train, X_test, y_train, y_test = train_test_split(boston.data, 403 | boston.target, 404 | random_state=rng) 405 | 406 | ensemble = LazyBaggingRegressor(random_state=0).fit(X_train, y_train) 407 | 408 | assert_true(isinstance(ensemble.base_estimator_, DecisionTreeRegressor)) 409 | 410 | ensemble = LazyBaggingRegressor(DecisionTreeRegressor(), 411 | random_state=0).fit(X_train, y_train) 412 | 413 | assert_true(isinstance(ensemble.base_estimator_, DecisionTreeRegressor)) 414 | 415 | ensemble = LazyBaggingRegressor(SVR(), 416 | random_state=0).fit(X_train, y_train) 417 | assert_true(isinstance(ensemble.base_estimator_, SVR)) 418 | 419 | 420 | def test_reproducibility(): 421 | rng = check_random_state(0) 422 | 423 | # Classification 424 | X_train, X_test, y_train, y_test = train_test_split(iris.data, 425 | iris.target, 426 | random_state=rng) 427 | ensemble = LazyBaggingClassifier(random_state=rng) 428 | ensemble.fit(X_train, y_train) 429 | 430 | assert_array_equal(ensemble.predict(X_test), ensemble.predict(X_test)) 431 | 432 | # Regression 433 | X_train, X_test, y_train, y_test = train_test_split(boston.data, 434 | boston.target, 435 | random_state=rng) 436 | ensemble = LazyBaggingRegressor(random_state=rng) 437 | ensemble.fit(X_train, y_train) 438 | assert_array_equal(ensemble.predict(X_test), ensemble.predict(X_test)) 439 | 440 | 441 | def test_multioutput(): 442 | X, y = make_multilabel_classification(n_samples=100, n_labels=1, 443 | n_classes=5, random_state=0, 444 | return_indicator=True) 445 | 446 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 447 | 448 | est = LazyBaggingClassifier(random_state=0, n_estimators=10, 449 | bootstrap=False) 450 | est.fit(X_train, y_train) 451 | 452 | assert_almost_equal(est.score(X_train, y_train), 1.) 453 | 454 | y_proba = est.predict_proba(X_test) 455 | y_log_proba = est.predict_log_proba(X_test) 456 | for p, log_p in zip(y_proba, y_log_proba): 457 | assert_array_almost_equal(p, np.exp(log_p)) 458 | 459 | est = LazyBaggingRegressor(random_state=0, n_estimators=10, 460 | bootstrap=False) 461 | est.fit(X_train, y_train) 462 | assert_almost_equal(est.score(X_train, y_train), 1.) 463 | 464 | 465 | if __name__ == "__main__": 466 | import nose 467 | nose.runmodule() 468 | -------------------------------------------------------------------------------- /random_output_trees/random_projection.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This module provides dimensionality reduction methods based on random 3 | projection. 
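It exposes Rademacher and Achlioptas random projections as well as sub-sampled
Hadamard and sub-sampled identity projections.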
4 | 5 | ''' 6 | # Authors: Arnaud Joly 7 | # 8 | # License: BSD 3 clause 9 | 10 | import numpy as np 11 | 12 | from scipy.linalg import hadamard as sp_hadamard 13 | from scipy import sparse 14 | 15 | from sklearn.random_projection import BaseRandomProjection 16 | from sklearn.random_projection import SparseRandomProjection 17 | 18 | from sklearn.utils.random import sample_without_replacement 19 | from sklearn.utils.validation import check_random_state 20 | 21 | __all__ = [ 22 | "RademacherRandomProjection", 23 | "AchlioptasRandomProjection", 24 | "SampledHadamardProjection", 25 | "SampledIdentityProjection", 26 | ] 27 | 28 | 29 | class RademacherRandomProjection(SparseRandomProjection): 30 | """Rademacher random projection 31 | 32 | The components of the random matrix 33 | are drawn from: 34 | 35 | - -sqrt(s) / sqrt(n_components) with probability 1 / 2 36 | - +sqrt(s) / sqrt(n_components) with probability 1 / 2 37 | 38 | Parameters 39 | ---------- 40 | n_components : int or 'auto', optional (default = 'auto') 41 | Dimensionality of the target projection space. 42 | 43 | n_components can be automatically adjusted according to the 44 | number of samples in the dataset and the bound given by the 45 | Johnson-Lindenstrauss lemma. In that case the quality of the 46 | embedding is controlled by the ``eps`` parameter. 47 | 48 | It should be noted that Johnson-Lindenstrauss lemma can yield 49 | very conservative estimated of the required number of components 50 | as it makes no assumption on the structure of the dataset. 51 | 52 | eps : strictly positive float, optional (default=0.1) 53 | Parameter to control the quality of the embedding according to 54 | the Johnson-Lindenstrauss lemma when n_components is set to 55 | 'auto'. 56 | 57 | Smaller values lead to better embedding and higher number of 58 | dimensions (n_components) in the target projection space. 59 | 60 | random_state : integer, RandomState instance or None (default=None) 61 | Control the pseudo random number generator used to generate the 62 | matrix at fit time. 63 | 64 | Attributes 65 | ---------- 66 | ``n_component_`` : int 67 | Concrete number of components computed when n_components="auto". 68 | 69 | ``components_`` : numpy array of shape [n_components, n_features] 70 | Random matrix used for the projection. 71 | 72 | """ 73 | def __init__(self, n_components="auto", eps=0.1, random_state=None): 74 | super(RademacherRandomProjection, self).__init__( 75 | n_components=n_components, 76 | eps=eps, 77 | density=1, 78 | dense_output=True, 79 | random_state=random_state) 80 | 81 | 82 | class AchlioptasRandomProjection(SparseRandomProjection): 83 | """Sparse random projection using Achlioptas random matrix 84 | 85 | If we note `s = 1 / density = 1 / 3 ` the components of the random matrix 86 | are drawn from: 87 | 88 | - -sqrt(s) / sqrt(n_components) with probability 1 / 2s 89 | - 0 with probability 1 - 1 / s 90 | - +sqrt(s) / sqrt(n_components) with probability 1 / 2s 91 | 92 | Parameters 93 | ---------- 94 | n_components : int or 'auto', optional (default = 'auto') 95 | Dimensionality of the target projection space. 96 | 97 | n_components can be automatically adjusted according to the 98 | number of samples in the dataset and the bound given by the 99 | Johnson-Lindenstrauss lemma. In that case the quality of the 100 | embedding is controlled by the ``eps`` parameter. 
101 | 102 | It should be noted that Johnson-Lindenstrauss lemma can yield 103 | very conservative estimated of the required number of components 104 | as it makes no assumption on the structure of the dataset. 105 | 106 | eps : strictly positive float, optional (default=0.1) 107 | Parameter to control the quality of the embedding according to 108 | the Johnson-Lindenstrauss lemma when n_components is set to 109 | 'auto'. 110 | 111 | Smaller values lead to better embedding and higher number of 112 | dimensions (n_components) in the target projection space. 113 | 114 | dense_output : boolean, optional (default=False) 115 | If True, ensure that the output of the random projection is a 116 | dense numpy array even if the input and random projection matrix 117 | are both sparse. In practice, if the number of components is 118 | small the number of zero components in the projected data will 119 | be very small and it will be more CPU and memory efficient to 120 | use a dense representation. 121 | 122 | If False, the projected data uses a sparse representation if 123 | the input is sparse. 124 | 125 | random_state : integer, RandomState instance or None (default=None) 126 | Control the pseudo random number generator used to generate the 127 | matrix at fit time. 128 | 129 | Attributes 130 | ---------- 131 | ``n_component_`` : int 132 | Concrete number of components computed when n_components="auto". 133 | 134 | ``components_`` : numpy array of shape [n_components, n_features] 135 | Random matrix used for the projection. 136 | 137 | """ 138 | def __init__(self, n_components="auto", eps=0.1, random_state=None, 139 | dense_output=False): 140 | super(AchlioptasRandomProjection, self).__init__( 141 | n_components=n_components, 142 | eps=eps, 143 | density=1. / 3, 144 | dense_output=dense_output, 145 | random_state=random_state) 146 | 147 | 148 | def subsampled_hadamard_matrix(n_components, n_features, random_state=None): 149 | """Sub-sampled hadamard matrix to have shape n_components and n_features 150 | 151 | A hadamard matrix of shape at (least n_components, n_features) is 152 | subsampled without replacement. 153 | 154 | Parameters 155 | ---------- 156 | n_components : int, 157 | Dimensionality of the target projection space. 158 | 159 | n_features : int, 160 | Dimensionality of the original source space. 161 | 162 | random_state : int, RandomState instance or None (default=None) 163 | Control the pseudo random number generator used to generate the 164 | matrix at fit time. 165 | 166 | Returns 167 | ------- 168 | components : numpy array of shape [n_components, n_features] 169 | The generated random matrix. 
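    Examples
    --------
    A minimal sketch (the entries depend on which rows and columns happen to be
    sub-sampled, but each one is +1/sqrt(n_components) or -1/sqrt(n_components)):

    >>> components = subsampled_hadamard_matrix(3, 8, random_state=0)
    >>> components.shape
    (3, 8)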
170 | 171 | """ 172 | if n_components <= 0: 173 | raise ValueError("n_components must be strictly positive, got %d" % 174 | n_components) 175 | if n_features <= 0: 176 | raise ValueError("n_features must be strictly positive, got %d" % 177 | n_components) 178 | 179 | random_state = check_random_state(random_state) 180 | n_hadmard_size = max(2 ** np.ceil(np.log2(x)) 181 | for x in (n_components, n_features)) 182 | 183 | row = sample_without_replacement(n_hadmard_size, n_components, 184 | random_state=random_state) 185 | col = sample_without_replacement(n_hadmard_size, n_features, 186 | random_state=random_state) 187 | hadamard_matrix = sp_hadamard(n_hadmard_size, dtype=np.float)[row][:, col] 188 | hadamard_matrix *= 1 / np.sqrt(n_components) 189 | return hadamard_matrix 190 | 191 | 192 | class SampledHadamardProjection(BaseRandomProjection): 193 | """Subsample Hadamard random projection 194 | 195 | The components of the random matrix are obtnained by subsampling the 196 | row and column of a sufficiently big Hadamard matrix. 197 | 198 | Parameters 199 | ---------- 200 | n_components : int or 'auto', optional (default = 'auto') 201 | Dimensionality of the target projection space. 202 | 203 | n_components can be automatically adjusted according to the 204 | number of samples in the dataset and the bound given by the 205 | Johnson-Lindenstrauss lemma. In that case the quality of the 206 | embedding is controlled by the ``eps`` parameter. 207 | 208 | It should be noted that Johnson-Lindenstrauss lemma can yield 209 | very conservative estimated of the required number of components 210 | as it makes no assumption on the structure of the dataset. 211 | 212 | eps : strictly positive float, optional (default=0.1) 213 | Parameter to control the quality of the embedding according to 214 | the Johnson-Lindenstrauss lemma when n_components is set to 215 | 'auto'. 216 | 217 | Smaller values lead to better embedding and higher number of 218 | dimensions (n_components) in the target projection space. 219 | 220 | random_state : integer, RandomState instance or None (default=None) 221 | Control the pseudo random number generator used to generate the 222 | matrix at fit time. 223 | 224 | Attributes 225 | ---------- 226 | ``n_component_`` : int 227 | Concrete number of components computed when n_components="auto". 228 | 229 | ``components_`` : numpy array of shape [n_components, n_features] 230 | Random matrix used for the projection. 231 | 232 | """ 233 | def __init__(self, n_components="auto", eps=0.1, random_state=None): 234 | super(SampledHadamardProjection, self).__init__( 235 | n_components=n_components, 236 | eps=eps, 237 | random_state=random_state) 238 | 239 | def _make_random_matrix(self, n_components, n_features): 240 | return subsampled_hadamard_matrix(n_components, n_features, 241 | random_state=self.random_state) 242 | 243 | 244 | def subsampled_identity_matrix(n_components, n_features, random_state=None, 245 | with_replacement=True): 246 | """Sub-sampled identity matrix to have shape n_components and n_features 247 | 248 | Parameters 249 | ---------- 250 | n_components : int, 251 | Dimensionality of the target projection space. 252 | 253 | n_features : int, 254 | Dimensionality of the original source space. 255 | 256 | random_state : int, RandomState instance or None (default=None) 257 | Control the pseudo random number generator used to generate the 258 | matrix at fit time. 259 | 260 | with_replacement : bool, 261 | Whether or not drawing components with replacements. 
262 | 263 | Returns 264 | ------- 265 | components : numpy array of shape [n_components, n_features] 266 | The generated random matrix. 267 | 268 | """ 269 | 270 | if n_components <= 0: 271 | raise ValueError("n_components must be strictly positive, got %d" % 272 | n_components) 273 | if n_features <= 0: 274 | raise ValueError("n_features must be strictly positive, got %d" % 275 | n_features) 276 | 277 | rng = check_random_state(random_state) 278 | 279 | components = sparse.dia_matrix((np.ones(n_features), [0]), 280 | shape=(n_features, n_features)).tocsr() 281 | if with_replacement: 282 | mask = rng.randint(n_features, size=(n_components,)) 283 | 284 | else: 285 | mask = sample_without_replacement(n_features, n_components, 286 | random_state=rng) 287 | 288 | components = components[mask] 289 | return components * np.sqrt(1.0 * n_features / n_components) 290 | 291 | 292 | class SampledIdentityProjection(BaseRandomProjection): 293 | """Subsampled identity matrix projection 294 | 295 | The components of the random matrix are obtained by subsampling the 296 | rows of the identity matrix. 297 | 298 | Parameters 299 | ---------- 300 | n_components : int or 'auto', optional (default = 'auto') 301 | Dimensionality of the target projection space. 302 | 303 | n_components can be automatically adjusted according to the 304 | number of samples in the dataset and the bound given by the 305 | Johnson-Lindenstrauss lemma. In that case the quality of the 306 | embedding is controlled by the ``eps`` parameter. 307 | 308 | It should be noted that the Johnson-Lindenstrauss lemma can yield 309 | very conservative estimates of the required number of components 310 | as it makes no assumption on the structure of the dataset. 311 | 312 | eps : strictly positive float, optional (default=0.1) 313 | Parameter to control the quality of the embedding according to 314 | the Johnson-Lindenstrauss lemma when n_components is set to 315 | 'auto'. 316 | 317 | Smaller values lead to better embedding and higher number of 318 | dimensions (n_components) in the target projection space. 319 | 320 | Note that the Johnson-Lindenstrauss lemma is not appropriate for a 321 | subsampled identity projection. 322 | 323 | random_state : integer, RandomState instance or None (default=None) 324 | Control the pseudo random number generator used to generate the 325 | matrix at fit time. 326 | 327 | Attributes 328 | ---------- 329 | ``n_components_`` : int 330 | Concrete number of components computed when n_components="auto". 331 | 332 | ``components_`` : numpy array of shape [n_components, n_features] 333 | Random matrix used for the projection.
334 | 335 | """ 336 | def __init__(self, n_components="auto", eps=0.1, random_state=None, 337 | dense_output=False, with_replacement=True): 338 | super(SampledIdentityProjection, self).__init__( 339 | n_components=n_components, 340 | eps=eps, 341 | dense_output=dense_output, 342 | random_state=random_state) 343 | 344 | self.with_replacement = with_replacement 345 | 346 | def _make_random_matrix(self, n_components, n_features): 347 | return subsampled_identity_matrix(n_components, n_features, 348 | self.random_state, 349 | with_replacement=self.with_replacement) 350 | -------------------------------------------------------------------------------- /random_output_trees/setup.py: -------------------------------------------------------------------------------- 1 | # Authors: Arnaud Joly 2 | # 3 | # License: BSD 3 clause 4 | 5 | import os 6 | 7 | 8 | def configuration(parent_package='', top_path=None): 9 | from numpy.distutils.misc_util import Configuration 10 | import numpy 11 | 12 | libraries = [] 13 | if os.name == 'posix': 14 | libraries.append('m') 15 | 16 | config = Configuration('random_output_trees', parent_package, 17 | top_path) 18 | 19 | 20 | 21 | for module in ["_tree", "_sklearn_tree", "_sklearn_tree_utils"]: 22 | config.add_extension(module, 23 | sources=["%s.c" % module], 24 | include_dirs=[numpy.get_include()], 25 | libraries=libraries, 26 | extra_compile_args=["-O3"]) 27 | 28 | 29 | # add the test directory 30 | config.add_subpackage('tests') 31 | 32 | return config 33 | 34 | if __name__ == '__main__': 35 | from numpy.distutils.core import setup 36 | setup(**configuration(top_path='').todict()) 37 | -------------------------------------------------------------------------------- /random_output_trees/tests/test_datasets.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import tempfile 4 | 5 | from sklearn.utils.testing import with_setup 6 | from sklearn.utils.testing import assert_equal 7 | 8 | from random_output_trees.datasets import fetch_drug_interaction 9 | from random_output_trees.datasets import fetch_protein_interaction 10 | from random_output_trees._utils import skipped 11 | 12 | tmpdir = None 13 | 14 | 15 | def setup_tmpdata(): 16 | # create temporary dir 17 | global tmpdir 18 | tmpdir = tempfile.mkdtemp() 19 | os.makedirs(os.path.join(tmpdir, 'drug-protein')) 20 | 21 | 22 | def teardown_tmpdata(): 23 | # remove temporary dir 24 | if tmpdir is not None: 25 | shutil.rmtree(tmpdir) 26 | 27 | @skipped 28 | @with_setup(setup_tmpdata, teardown_tmpdata) 29 | def test_fetch_drug_protein(): 30 | dataset = fetch_drug_interaction(tmpdir) 31 | 32 | assert_equal(dataset.data.shape, (1862, 660)) 33 | assert_equal(dataset.target.shape, (1862, 1554)) 34 | assert_equal(len(dataset.feature_names), 660) 35 | assert_equal(len(dataset.target_names), 1554) 36 | 37 | dataset = fetch_protein_interaction(tmpdir) 38 | assert_equal(dataset.data.shape, (1554, 876)) 39 | assert_equal(dataset.target.shape, (1554, 1862)) 40 | assert_equal(len(dataset.feature_names), 876) 41 | -------------------------------------------------------------------------------- /random_output_trees/tests/test_random_projection.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import numpy as np 4 | from scipy.sparse import issparse, coo_matrix, csr_matrix 5 | from sklearn.utils.testing import assert_raises 6 | from sklearn.utils.testing import assert_equal 7 | from 
sklearn.utils.testing import assert_array_almost_equal 8 | from sklearn.utils.testing import assert_almost_equal 9 | from sklearn.utils.testing import assert_raise_message 10 | from sklearn.utils.testing import assert_array_equal 11 | from sklearn.utils.testing import assert_warns 12 | 13 | from random_output_trees.random_projection import RademacherRandomProjection 14 | from random_output_trees.random_projection import AchlioptasRandomProjection 15 | from random_output_trees.random_projection import SampledHadamardProjection 16 | from random_output_trees.random_projection import SampledIdentityProjection 17 | 18 | from random_output_trees.random_projection import subsampled_hadamard_matrix 19 | from random_output_trees.random_projection import subsampled_identity_matrix 20 | 21 | 22 | RANDOM_PROJECTION = { 23 | "RademacherRandomProjection": RademacherRandomProjection, 24 | "AchlioptasRandomProjection": AchlioptasRandomProjection, 25 | "SampledHadamardProjection": SampledHadamardProjection, 26 | "SampledIdentityProjection": SampledIdentityProjection, 27 | "SampledIdentityProjection_without_replacement": 28 | partial(SampledIdentityProjection, with_replacement=False) 29 | } 30 | 31 | all_random_matrix = { 32 | "subsample_hadamard_matrix": subsampled_hadamard_matrix, 33 | "random_subsample_normalized": subsampled_identity_matrix, 34 | } 35 | 36 | def make_sparse_random_data(n_samples, n_features, n_nonzeros): 37 | rng = np.random.RandomState(0) 38 | data_coo = coo_matrix( 39 | (rng.randn(n_nonzeros), 40 | (rng.randint(n_samples, size=n_nonzeros), 41 | rng.randint(n_features, size=n_nonzeros))), 42 | shape=(n_samples, n_features)) 43 | return data_coo.toarray(), data_coo.tocsr() 44 | 45 | n_samples, n_features = (10, 1000) 46 | n_nonzeros = int(n_samples * n_features / 100.) 
47 | data, data_csr = make_sparse_random_data(n_samples, n_features, n_nonzeros) 48 | 49 | def densify(matrix): 50 | if not issparse(matrix): 51 | return matrix 52 | else: 53 | return matrix.toarray() 54 | 55 | def check_random_projection(name): 56 | RandomProjection = RANDOM_PROJECTION[name] 57 | 58 | # Invalid input 59 | assert_raises(ValueError, RandomProjection(n_components='auto').fit, 60 | [0, 1, 2]) 61 | assert_raises(ValueError, RandomProjection(n_components=-10).fit, data) 62 | 63 | # Try to transform before fit 64 | assert_raises(ValueError, RandomProjection(n_components='auto').transform, 65 | data) 66 | 67 | 68 | def test_too_many_samples_to_find_a_safe_embedding(): 69 | data, _ = make_sparse_random_data(1000, 100, 1000) 70 | 71 | for name, RandomProjection in RANDOM_PROJECTION.items(): 72 | rp = RandomProjection(n_components='auto', eps=0.1) 73 | expected_msg = ( 74 | 'eps=0.100000 and n_samples=1000 lead to a target dimension' 75 | ' of 5920 which is larger than the original space with' 76 | ' n_features=100') 77 | assert_raise_message(ValueError, expected_msg, rp.fit, data) 78 | 79 | 80 | 81 | def test_correct_RandomProjection_dimensions_embedding(): 82 | for name, RandomProjection in RANDOM_PROJECTION.items(): 83 | rp = RandomProjection(n_components='auto', 84 | random_state=0, 85 | eps=0.5).fit(data) 86 | 87 | # the number of components is adjusted from the shape of the training 88 | # set 89 | assert_equal(rp.n_components, 'auto') 90 | assert_equal(rp.n_components_, 110) 91 | 92 | assert_equal(rp.components_.shape, (110, n_features)) 93 | 94 | projected_1 = rp.transform(data) 95 | assert_equal(projected_1.shape, (n_samples, 110)) 96 | 97 | # once the RP is 'fitted' the projection is always the same 98 | projected_2 = rp.transform(data) 99 | assert_array_equal(projected_1, projected_2) 100 | 101 | # fit transform with same random seed will lead to the same results 102 | rp2 = RandomProjection(random_state=0, eps=0.5) 103 | projected_3 = rp2.fit_transform(data) 104 | assert_array_equal(projected_1, projected_3) 105 | 106 | # Try to transform with an input X of size different from fitted. 
107 | assert_raises(ValueError, rp.transform, data[:, 1:5]) 108 | 109 | 110 | def test_works_with_sparse_data(): 111 | n_features = 20 112 | data, _ = make_sparse_random_data(5, n_features, int(n_features / 4)) 113 | 114 | for name, RandomProjection in RANDOM_PROJECTION.items(): 115 | rp_dense = RandomProjection(n_components=3, 116 | random_state=1).fit(data) 117 | rp_sparse = RandomProjection(n_components=3, 118 | random_state=1).fit(csr_matrix(data)) 119 | assert_array_almost_equal(densify(rp_dense.components_), 120 | densify(rp_sparse.components_)) 121 | 122 | 123 | ############################################################################### 124 | # tests random matrix generation 125 | ############################################################################### 126 | def check_input_size_random_matrix(random_matrix): 127 | assert_raises(ValueError, random_matrix, 0, 0) 128 | assert_raises(ValueError, random_matrix, -1, 1) 129 | assert_raises(ValueError, random_matrix, 1, -1) 130 | assert_raises(ValueError, random_matrix, 1, 0) 131 | assert_raises(ValueError, random_matrix, -1, 0) 132 | 133 | 134 | def check_size_generated(random_matrix): 135 | assert_equal(random_matrix(1, 5).shape, (1, 5)) 136 | assert_equal(random_matrix(5, 1).shape, (5, 1)) 137 | assert_equal(random_matrix(5, 5).shape, (5, 5)) 138 | assert_equal(random_matrix(1, 1).shape, (1, 1)) 139 | 140 | 141 | def check_zero_mean_and_unit_norm(random_matrix): 142 | # All random matrix should produce a transformation matrix 143 | # with zero mean and unit norm for each columns 144 | 145 | A = densify(random_matrix(1000, 1, random_state=0)).ravel() 146 | assert_array_almost_equal(0, np.mean(A), 3) 147 | assert_array_almost_equal(1.0, np.linalg.norm(A), 1) 148 | 149 | 150 | def check_approximate_isometry(random_matrix): 151 | A = densify(random_matrix(50, 10, 0)) 152 | assert_almost_equal(np.mean(np.diag(np.dot(A.T, A))), 1.) 
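# Editorial note (not part of the original test module): the two helpers above
# verify that each generated matrix behaves as an approximate isometry, as
# expected from Johnson-Lindenstrauss-style arguments. For the subsampled
# Hadamard construction every entry is +/- 1 / sqrt(n_components), so each
# column of a (50, 10) matrix has squared norm exactly 1, e.g.
#
#     A = densify(subsampled_hadamard_matrix(50, 10, random_state=0))
#     np.mean(np.diag(np.dot(A.T, A)))   # close to 1.0, as the check asserts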
153 | 154 | def test_basic_property_of_random_matrix(): 155 | """Check basic properties of random matrix generation""" 156 | for name, random_matrix in all_random_matrix.items(): 157 | print(name) 158 | 159 | check_input_size_random_matrix(random_matrix) 160 | check_size_generated(random_matrix) 161 | if name != "random_subsample_normalized": 162 | check_zero_mean_and_unit_norm(random_matrix) 163 | check_approximate_isometry(random_matrix) 164 | 165 | 166 | def test_subsampled_identity_matrix_without_repl(): 167 | random_array = subsampled_identity_matrix(100, 1000, random_state=0, 168 | with_replacement=False) 169 | assert_array_almost_equal(random_array.toarray().sum(axis=1), 170 | 3.162278 * np.ones(100)) 171 | -------------------------------------------------------------------------------- /random_output_trees/tests/test_transformer.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import BaseEstimator 2 | from sklearn.utils.testing import assert_array_almost_equal 3 | from sklearn.utils.testing import assert_equal 4 | from sklearn.utils import check_random_state 5 | 6 | from sklearn.random_projection import GaussianRandomProjection 7 | from random_output_trees.transformer import FixedStateTransformer 8 | 9 | class IdentityProjection(BaseEstimator): 10 | 11 | def fit(self, X): 12 | return self 13 | 14 | def transform(self, X): 15 | return X 16 | 17 | 18 | def test_fixed_state_transformer(): 19 | 20 | random_state = check_random_state(0) 21 | X = random_state.rand(500, 100) 22 | 23 | # Check that setting the random_seed is equivalent to setting the 24 | # random_state 25 | transf = GaussianRandomProjection(n_components=5, random_state=0) 26 | fixed_transf = FixedStateTransformer( 27 | GaussianRandomProjection(n_components=5), random_seed=0) 28 | assert_array_almost_equal(fixed_transf.fit_transform(X), 29 | transf.fit_transform(X)) 30 | 31 | # Check that the wrapped transformer's own random_state doesn't modify the results 32 | fixed_transf = FixedStateTransformer( 33 | GaussianRandomProjection(n_components=5, random_state=None)) 34 | 35 | fixed_transf2 = FixedStateTransformer( 36 | GaussianRandomProjection(random_state=1, n_components=5)) 37 | 38 | assert_array_almost_equal(fixed_transf.fit_transform(X), 39 | fixed_transf2.fit_transform(X)) 40 | 41 | # Check that it works when there is no random_state 42 | fixed_transf = FixedStateTransformer(IdentityProjection()) 43 | assert_array_almost_equal(fixed_transf.fit_transform(X), X) 44 | -------------------------------------------------------------------------------- /random_output_trees/tests/test_tree.py: -------------------------------------------------------------------------------- 1 | """ 2 | Testing for the tree module.
3 | """ 4 | from functools import partial 5 | 6 | from sklearn import datasets 7 | from sklearn.cross_validation import train_test_split 8 | 9 | from sklearn.decomposition import PCA 10 | 11 | 12 | from sklearn.utils.testing import assert_array_equal 13 | from sklearn.utils.testing import assert_almost_equal 14 | from sklearn.utils.testing import assert_equal 15 | 16 | from random_output_trees.tree import DecisionTreeClassifier 17 | from random_output_trees.tree import DecisionTreeRegressor 18 | 19 | from sklearn.random_projection import GaussianRandomProjection 20 | from sklearn.base import BaseEstimator, TransformerMixin 21 | 22 | class IdentityProjection(BaseEstimator, TransformerMixin): 23 | 24 | def fit(self, X): 25 | return self 26 | 27 | def transform(self, X): 28 | return X 29 | 30 | 31 | CLF_TREES = { 32 | "DecisionTreeClassifier": DecisionTreeClassifier, 33 | "Presort-DecisionTreeClassifier": partial(DecisionTreeClassifier, 34 | splitter="presort-best"), 35 | "ExtraTreeClassifier": partial(DecisionTreeClassifier, 36 | splitter="random"), 37 | } 38 | 39 | REG_TREES = { 40 | "DecisionTreeRegressor": DecisionTreeRegressor, 41 | "Presort-DecisionTreeRegressor": partial(DecisionTreeRegressor, 42 | splitter="presort-best"), 43 | "ExtraTreeRegressor": partial(DecisionTreeRegressor, 44 | splitter="random"), 45 | } 46 | 47 | ALL_TREES = dict() 48 | ALL_TREES.update(CLF_TREES) 49 | ALL_TREES.update(REG_TREES) 50 | 51 | 52 | def test_output_transformer(): 53 | X, y = datasets.make_multilabel_classification(return_indicator=True) 54 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) 55 | 56 | transformer = GaussianRandomProjection(n_components=10) 57 | for name, TreeEstimator in ALL_TREES.items(): 58 | est = TreeEstimator(random_state=0, output_transformer=transformer) 59 | est.fit(X_train, y_train) 60 | y_pred = est.predict(X_test) 61 | assert_equal(y_pred.shape, y_test.shape) 62 | 63 | 64 | def test_identity_output_transformer(): 65 | 66 | X, y = datasets.make_multilabel_classification(return_indicator=True, 67 | random_state=0) 68 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) 69 | transformer = IdentityProjection() 70 | 71 | for name, TreeEstimator in ALL_TREES.items(): 72 | est = TreeEstimator(random_state=0, max_features=None) 73 | est.fit(X_train, y_train) 74 | y_pred_origin = est.predict(X_test) 75 | 76 | 77 | est_transf = TreeEstimator(random_state=0, max_features=None, 78 | output_transformer=transformer) 79 | est_transf.fit(X_train, y_train) 80 | y_pred_transformed = est_transf.predict(X_test) 81 | assert_almost_equal(y_pred_origin, y_pred_transformed, decimal=5, 82 | err_msg="failed with {0}".format(name)) 83 | 84 | 85 | def test_pca_output_transformer(): 86 | X, y = datasets.make_multilabel_classification(return_indicator=True) 87 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) 88 | transformer = PCA(n_components=1) 89 | 90 | for name, TreeEstimator in ALL_TREES.items(): 91 | est_transf = TreeEstimator(random_state=0, 92 | max_features=None, 93 | output_transformer=transformer) 94 | est_transf.fit(X_train, y_train) 95 | y_pred_transformed = est_transf.predict(X_test) 96 | assert_equal(y_pred_transformed.shape, y_test.shape, 97 | msg="failed with {0}".format(name)) 98 | 99 | 100 | def test_importances_variance_equal_mse(): 101 | """Check that gini is equivalent to mse for binary output variable""" 102 | 103 | from sklearn.tree._tree import TREE_LEAF 104 | 105 | X, y = 
datasets.make_classification(n_samples=2000, 106 | n_features=10, 107 | n_informative=3, 108 | n_redundant=0, 109 | n_repeated=0, 110 | shuffle=False, 111 | random_state=0) 112 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) 113 | 114 | 115 | var = DecisionTreeClassifier(criterion="variance", 116 | random_state=0).fit(X_train, y_train) 117 | gini = DecisionTreeClassifier(criterion="gini", 118 | random_state=0).fit(X_train, y_train) 119 | reg = DecisionTreeRegressor(criterion="mse", 120 | random_state=0).fit(X_train, y_train) 121 | 122 | gini_leaves = gini.tree_.children_left == TREE_LEAF 123 | var_leaves = var.tree_.children_left == TREE_LEAF 124 | 125 | assert_array_equal(var.tree_.feature, reg.tree_.feature) 126 | assert_almost_equal(var.feature_importances_, reg.feature_importances_) 127 | assert_array_equal(var.tree_.children_left, reg.tree_.children_left) 128 | assert_array_equal(var.tree_.children_right, reg.tree_.children_right) 129 | assert_array_equal(var.tree_.n_node_samples, reg.tree_.n_node_samples) 130 | 131 | assert_array_equal(var.tree_.feature, gini.tree_.feature) 132 | assert_almost_equal(var.feature_importances_, gini.feature_importances_) 133 | assert_array_equal(var.tree_.children_left, gini.tree_.children_left) 134 | assert_array_equal(var.tree_.children_right, gini.tree_.children_right) 135 | assert_array_equal(var.tree_.n_node_samples, gini.tree_.n_node_samples) 136 | assert_almost_equal(var.tree_.value[var_leaves], gini.tree_.value[gini_leaves]) 137 | 138 | 139 | clf = DecisionTreeClassifier(criterion="gini", random_state=0, 140 | output_transformer=IdentityProjection(), 141 | ).fit(X_train, y_train) 142 | 143 | clf_leaves = clf.tree_.children_left == TREE_LEAF 144 | assert_array_equal(clf.tree_.feature, reg.tree_.feature) 145 | assert_almost_equal(clf.feature_importances_, reg.feature_importances_) 146 | assert_array_equal(clf.tree_.children_left, reg.tree_.children_left) 147 | assert_array_equal(clf.tree_.children_right, reg.tree_.children_right) 148 | assert_array_equal(clf.tree_.n_node_samples, reg.tree_.n_node_samples) 149 | assert_array_equal(clf.tree_.n_node_samples, reg.tree_.n_node_samples) 150 | 151 | assert_array_equal(clf.tree_.feature, gini.tree_.feature) 152 | assert_almost_equal(clf.feature_importances_, gini.feature_importances_) 153 | assert_array_equal(clf.tree_.children_left, gini.tree_.children_left) 154 | assert_array_equal(clf.tree_.children_right, gini.tree_.children_right) 155 | assert_array_equal(clf.tree_.n_node_samples, gini.tree_.n_node_samples) 156 | assert_almost_equal(clf.tree_.value[clf_leaves], gini.tree_.value[gini_leaves]) 157 | -------------------------------------------------------------------------------- /random_output_trees/tests/test_validations.py: -------------------------------------------------------------------------------- 1 | """Tests for input validation functions""" 2 | 3 | import numpy as np 4 | import scipy.sparse as sp 5 | from nose.tools import assert_raises, assert_true, assert_false, assert_equal 6 | from itertools import product 7 | 8 | 9 | # from sklearn.utils.estimator_checks import NotAnArray 10 | 11 | 12 | from sklearn.neighbors import KNeighborsClassifier 13 | from sklearn.ensemble import RandomForestRegressor 14 | from sklearn.svm import SVR 15 | 16 | from random_output_trees._utils import has_fit_parameter 17 | from random_output_trees._utils import check_array 18 | 19 | 20 | def test_ordering(): 21 | """Check that ordering is enforced correctly by validation utilities. 
22 | 23 | We need to check each validation utility, because a 'copy' without 24 | 'order=K' will kill the ordering. 25 | """ 26 | X = np.ones((10, 5)) 27 | for A in X, X.T: 28 | for copy in (True, False): 29 | B = check_array(A, order='C', copy=copy) 30 | assert_true(B.flags['C_CONTIGUOUS']) 31 | B = check_array(A, order='F', copy=copy) 32 | assert_true(B.flags['F_CONTIGUOUS']) 33 | if copy: 34 | assert_false(A is B) 35 | 36 | X = sp.csr_matrix(X) 37 | X.data = X.data[::-1] 38 | assert_false(X.data.flags['C_CONTIGUOUS']) 39 | 40 | for copy in (True, False): 41 | Y = check_array(X, accept_sparse='csr', copy=copy, order='C') 42 | assert_true(Y.data.flags['C_CONTIGUOUS']) 43 | 44 | 45 | def test_check_array(): 46 | # accept_sparse == None 47 | # raise error on sparse inputs 48 | X = [[1, 2], [3, 4]] 49 | X_csr = sp.csr_matrix(X) 50 | assert_raises(TypeError, check_array, X_csr) 51 | # ensure_2d 52 | X_array = check_array([0, 1, 2]) 53 | assert_equal(X_array.ndim, 2) 54 | X_array = check_array([0, 1, 2], ensure_2d=False) 55 | assert_equal(X_array.ndim, 1) 56 | # don't allow ndim > 3 57 | X_ndim = np.arange(8).reshape(2, 2, 2) 58 | assert_raises(ValueError, check_array, X_ndim) 59 | check_array(X_ndim, allow_nd=True) # doesn't raise 60 | # force_all_finite 61 | X_inf = np.arange(4).reshape(2, 2).astype(np.float) 62 | X_inf[0, 0] = np.inf 63 | assert_raises(ValueError, check_array, X_inf) 64 | check_array(X_inf, force_all_finite=False) # no raise 65 | # nan check 66 | X_nan = np.arange(4).reshape(2, 2).astype(np.float) 67 | X_nan[0, 0] = np.nan 68 | assert_raises(ValueError, check_array, X_nan) 69 | check_array(X_inf, force_all_finite=False) # no raise 70 | 71 | # dtype and order enforcement. 72 | X_C = np.arange(4).reshape(2, 2).copy("C") 73 | X_F = X_C.copy("F") 74 | X_int = X_C.astype(np.int) 75 | X_float = X_C.astype(np.float) 76 | Xs = [X_C, X_F, X_int, X_float] 77 | dtypes = [np.int32, np.int, np.float, np.float32, None, np.bool, object] 78 | orders = ['C', 'F', None] 79 | copys = [True, False] 80 | 81 | for X, dtype, order, copy in product(Xs, dtypes, orders, copys): 82 | X_checked = check_array(X, dtype=dtype, order=order, copy=copy) 83 | if dtype is not None: 84 | assert_equal(X_checked.dtype, dtype) 85 | else: 86 | assert_equal(X_checked.dtype, X.dtype) 87 | if order == 'C': 88 | assert_true(X_checked.flags['C_CONTIGUOUS']) 89 | assert_false(X_checked.flags['F_CONTIGUOUS']) 90 | elif order == 'F': 91 | assert_true(X_checked.flags['F_CONTIGUOUS']) 92 | assert_false(X_checked.flags['C_CONTIGUOUS']) 93 | if copy: 94 | assert_false(X is X_checked) 95 | else: 96 | # doesn't copy if it was already good 97 | if (X.dtype == X_checked.dtype and 98 | X_checked.flags['C_CONTIGUOUS'] == X.flags['C_CONTIGUOUS'] 99 | and X_checked.flags['F_CONTIGUOUS'] == X.flags['F_CONTIGUOUS']): 100 | assert_true(X is X_checked) 101 | 102 | # allowed sparse != None 103 | X_csc = sp.csc_matrix(X_C) 104 | X_coo = X_csc.tocoo() 105 | X_dok = X_csc.todok() 106 | X_int = X_csc.astype(np.int) 107 | X_float = X_csc.astype(np.float) 108 | 109 | Xs = [X_csc, X_coo, X_dok, X_int, X_float] 110 | accept_sparses = [['csr', 'coo'], ['coo', 'dok']] 111 | for X, dtype, accept_sparse, copy in product(Xs, dtypes, accept_sparses, 112 | copys): 113 | X_checked = check_array(X, dtype=dtype, accept_sparse=accept_sparse, 114 | copy=copy) 115 | if dtype is not None: 116 | assert_equal(X_checked.dtype, dtype) 117 | else: 118 | assert_equal(X_checked.dtype, X.dtype) 119 | if X.format in accept_sparse: 120 | # no change if allowed 121 | 
assert_equal(X.format, X_checked.format) 122 | else: 123 | # got converted 124 | assert_equal(X_checked.format, accept_sparse[0]) 125 | if copy: 126 | assert_false(X is X_checked) 127 | else: 128 | # doesn't copy if it was already good 129 | if (X.dtype == X_checked.dtype and X.format == X_checked.format): 130 | assert_true(X is X_checked) 131 | 132 | # other input formats 133 | # convert lists to arrays 134 | X_dense = check_array([[1, 2], [3, 4]]) 135 | assert_true(isinstance(X_dense, np.ndarray)) 136 | # raise on too deep lists 137 | assert_raises(ValueError, check_array, X_ndim.tolist()) 138 | check_array(X_ndim.tolist(), allow_nd=True) # doesn't raise 139 | # convert weird stuff to arrays 140 | # X_no_array = NotAnArray(X_dense) 141 | # result = check_array(X_no_array) 142 | # assert_true(isinstance(result, np.ndarray)) 143 | 144 | def test_has_fit_parameter(): 145 | assert_false(has_fit_parameter(KNeighborsClassifier, "sample_weight")) 146 | assert_true(has_fit_parameter(RandomForestRegressor, "sample_weight")) 147 | assert_true(has_fit_parameter(SVR, "sample_weight")) 148 | assert_true(has_fit_parameter(SVR(), "sample_weight")) 149 | -------------------------------------------------------------------------------- /random_output_trees/transformer.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This module provides a general purpose meta-transformer. 3 | 4 | ''' 5 | 6 | # Authors: Arnaud Joly 7 | # 8 | # License: BSD 3 clause 9 | 10 | from sklearn.base import BaseEstimator 11 | from sklearn.base import clone 12 | from sklearn.base import TransformerMixin 13 | from sklearn.utils import check_random_state 14 | 15 | 16 | class FixedStateTransformer(BaseEstimator, TransformerMixin): 17 | """Fix the random_state of a transformer 18 | 19 | This meta-transformer is useful when you want to fix the random_state 20 | of a transformer that would otherwise be modified by some meta-estimator. 21 | 22 | Parameters 23 | ---------- 24 | transformer : scikit-learn transformer 25 | 26 | random_seed : int, RandomState instance, optional (default=0) 27 | If int, random_seed is the seed used by the random number generator; 28 | If RandomState instance, random_seed is the random number generator; 29 | 30 | Attributes 31 | ---------- 32 | transformer_ : transformer 33 | A clone of the fitted transformer 34 | 35 | """ 36 | def __init__(self, transformer, random_seed=0): 37 | self.transformer = transformer 38 | self.random_seed = random_seed 39 | 40 | self.transformer_ = None 41 | 42 | @property 43 | def random_state(self): 44 | return self.random_seed 45 | 46 | def fit(self, X, y=None): 47 | """Fit estimator. 48 | 49 | Parameters 50 | ---------- 51 | X : array-like, shape=(n_samples, n_features) 52 | Input data used to fit the transformer. 53 | 54 | Returns 55 | ------- 56 | self : object 57 | Returns self. 58 | """ 59 | random_state = check_random_state(self.random_seed) 60 | self.transformer_ = clone(self.transformer) 61 | 62 | try: 63 | self.transformer_.set_params(random_state=random_state) 64 | except ValueError: 65 | pass 66 | 67 | try: 68 | self.transformer_.fit(X, y) 69 | except TypeError: 70 | self.transformer_.fit(X) 71 | 72 | return self 73 | 74 | def transform(self, X): 75 | """Transform dataset. 76 | 77 | Parameters 78 | ---------- 79 | X : array-like, shape=(n_samples, n_features) 80 | Input data to be transformed. 81 | 82 | Returns 83 | ------- 84 | X_transformed : array or sparse matrix, shape=(n_samples, n_out) 85 | Transformed dataset.
86 | """ 87 | return self.transformer_.transform(X) 88 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [nosetests] 2 | # nosetests skips test files with the executable bit by default 3 | # which can silently hide failing tests. 4 | # There are no executable scripts within the scikit-learn project 5 | # so let's turn the --exe flag on to avoid skipping tests by 6 | # mistake. 7 | exe = 1 8 | cover-html = 1 9 | cover-html-dir = coverage 10 | cover-package = random_output_trees 11 | 12 | detailed-errors = 1 13 | with-doctest = 1 14 | doctest-tests = 1 15 | doctest-extension = rst 16 | doctest-fixtures = _fixture 17 | #doctest-options = +ELLIPSIS,+NORMALIZE_WHITESPACE 18 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # 3 | # Author : Arnaud Joly 4 | # 5 | # License: BSD 3 clause 6 | 7 | import sys 8 | import os 9 | import shutil 10 | from distutils.command.clean import clean as Clean 11 | 12 | DISTNAME = 'random-output-trees' 13 | DESCRIPTION = "High dimension output tree classifier and regressor" 14 | LONG_DESCRIPTION = open('README.rst').read() 15 | MAINTAINER = 'Arnaud Joly' 16 | MAINTAINER_EMAIL = 'arnaud.v.joly@gmail.com' 17 | URL = 'http://arjoly.github.io/random-output-trees/' 18 | LICENSE = 'BSD' 19 | DOWNLOAD_URL = 'https://github.com/arjoly/random-output-trees/archive/master.zip' 20 | CLASSIFIERS = [ 21 | 'Intended Audience :: Science/Research', 22 | 'Intended Audience :: Developers', 23 | 'License :: OSI Approved', 24 | 'Programming Language :: C', 25 | 'Programming Language :: Python', 26 | 'Topic :: Software Development', 27 | 'Topic :: Scientific/Engineering', 28 | 'Operating System :: Microsoft :: Windows', 29 | 'Operating System :: POSIX', 30 | 'Operating System :: Unix', 31 | 'Operating System :: MacOS' 32 | ] 33 | 34 | import random_output_trees 35 | VERSION = random_output_trees.__version__ 36 | 37 | import setuptools # we are using a setuptools namespace 38 | from numpy.distutils.core import setup 39 | 40 | class CleanCommand(Clean): 41 | description = "Remove build directories, and compiled file in the source tree" 42 | 43 | def run(self): 44 | Clean.run(self) 45 | if os.path.exists('build'): 46 | shutil.rmtree('build') 47 | for dirpath, dirnames, filenames in os.walk('random_output_trees'): 48 | for filename in filenames: 49 | if (filename.endswith('.so') or filename.endswith('.pyd') 50 | or filename.endswith('.dll') 51 | or filename.endswith('.pyc')): 52 | os.unlink(os.path.join(dirpath, filename)) 53 | for dirname in dirnames: 54 | if dirname == '__pycache__': 55 | shutil.rmtree(os.path.join(dirpath, dirname)) 56 | 57 | 58 | def configuration(parent_package='', top_path=None): 59 | if os.path.exists('MANIFEST'): 60 | os.remove('MANIFEST') 61 | 62 | from numpy.distutils.misc_util import Configuration 63 | config = Configuration(None, parent_package, top_path) 64 | 65 | config.add_subpackage('random_output_trees') 66 | 67 | return config 68 | 69 | if __name__ == "__main__": 70 | 71 | old_path = os.getcwd() 72 | local_path = os.path.dirname(os.path.abspath(sys.argv[0])) 73 | 74 | os.chdir(local_path) 75 | sys.path.insert(0, local_path) 76 | 77 | setup(configuration=configuration, 78 | name=DISTNAME, 79 | maintainer=MAINTAINER, 80 | include_package_data=True, 81 | 
maintainer_email=MAINTAINER_EMAIL, 82 | description=DESCRIPTION, 83 | license=LICENSE, 84 | url=URL, 85 | version=VERSION, 86 | download_url=DOWNLOAD_URL, 87 | long_description=LONG_DESCRIPTION, 88 | zip_safe=False, # the package can run out of an .egg file 89 | classifiers=CLASSIFIERS, 90 | cmdclass={'clean': CleanCommand}, 91 | ) 92 | --------------------------------------------------------------------------------
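Taken together, the modules above suggest the following minimal usage sketch. It is assembled from the public names exercised in test_tree.py, test_transformer.py and test_random_projection.py; the dataset and parameter values are illustrative only and are not part of the repository::

    import numpy as np
    from sklearn.datasets import make_multilabel_classification

    from random_output_trees.tree import DecisionTreeClassifier
    from random_output_trees.random_projection import SampledHadamardProjection
    from random_output_trees.transformer import FixedStateTransformer

    X, y = make_multilabel_classification(return_indicator=True, random_state=0)

    # Compress the label space before fitting; FixedStateTransformer pins the
    # projection's random_state so that meta-estimators cannot silently re-seed it.
    projector = FixedStateTransformer(SampledHadamardProjection(n_components=5),
                                      random_seed=0)
    tree = DecisionTreeClassifier(random_state=0, output_transformer=projector)
    tree.fit(X, y)

    y_pred = tree.predict(X)          # predictions are returned in the original label space
    print(y_pred.shape == y.shape)    # expected: True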