├── .gitignore
├── AUTHORS.rst
├── COPYING
├── Makefile
├── README.rst
├── doc
    ├── .gitignore
    ├── Makefile
    ├── README
    ├── classification.rst
    ├── conf.py
    ├── data
    │   ├── sdss_colors
    │   │   ├── fetch_data.py
    │   │   └── scatter_colors.py
    │   ├── sdss_photoz
    │   │   └── fetch_data.py
    │   └── sdss_spectra
    │   │   └── fetch_data.py
    ├── dimensionality_reduction.rst
    ├── exercises.rst
    ├── general_concepts.rst
    ├── images
    │   └── blank_image.png
    ├── includes
    │   ├── big_toc_css.rst
    │   └── bigger_toc_css.rst
    ├── index.rst
    ├── logos
    │   ├── favicon.ico
    │   ├── identity.pdf
    │   ├── scikit-learn-logo-small.png
    │   ├── scikit-learn-logo-thumb.png
    │   ├── scikit-learn-logo.bmp
    │   ├── scikit-learn-logo.png
    │   └── scikit-learn-logo.svg
    ├── make.bat
    ├── notebooks
    │   ├── .gitignore
    │   ├── 01_datasets.ipynb
    │   ├── 02_iris_classification.ipynb
    │   ├── 03_iris_dimensionality.ipynb
    │   ├── 04_iris_clustering.ipynb
    │   ├── 05_iris_crossval.ipynb
    │   ├── 06_learning_curves.ipynb
    │   ├── 07_classification_example.ipynb
    │   ├── 08_regression_example.ipynb
    │   ├── 09_dimensionality_example.ipynb
    │   ├── 10_exercise01.ipynb
    │   ├── 11_exercise02.ipynb
    │   ├── 12_exercise03.ipynb
    │   ├── nbconvert.py
    │   └── soln
    │   │   ├── 01-01.py
    │   │   ├── 01-02.py
    │   │   ├── 01-03.py
    │   │   ├── 01-04.py
    │   │   ├── 01-05.py
    │   │   ├── 02-01.py
    │   │   ├── 02-02.py
    │   │   ├── 02-03a.py
    │   │   ├── 02-03b.py
    │   │   ├── 03-01.py
    │   │   ├── 03-02.py
    │   │   └── 03-03.py
    ├── practical.rst
    ├── regression.rst
    ├── scikitlearn.png
    ├── setup.rst
    ├── skeletons
    │   ├── exercise_01.py
    │   ├── exercise_02.py
    │   └── exercise_03.py
    ├── solutions
    │   ├── exercise_01.py
    │   ├── exercise_02.py
    │   ├── exercise_03.py
    │   └── generate_skeletons.py
    ├── sphinxext
    │   ├── LICENSE.txt
    │   ├── MANIFEST.in
    │   ├── README.txt
    │   ├── gen_rst.py
    │   ├── numpy_ext
    │   │   ├── __init__.py
    │   │   ├── docscrape.py
    │   │   ├── docscrape_sphinx.py
    │   │   └── numpydoc.py
    │   └── numpy_ext_old
    │   │   ├── __init__.py
    │   │   ├── docscrape.py
    │   │   ├── docscrape_sphinx.py
    │   │   └── numpydoc.py
    ├── templates
    │   ├── class.rst
    │   └── function.rst
    └── themes
    │   └── scikit-learn
    │       ├── layout.html
    │       ├── static
    │           ├── jquery.js
    │           ├── nature.css_t
    │           └── sidebar.js
    │       └── theme.conf
└── examples
    ├── README.txt
    ├── plot_ML_flow_chart.py
    ├── plot_bias_variance_examples.py
    ├── plot_gui_example.py
    ├── plot_iris_projections.py
    ├── plot_python_101.py
    ├── plot_sdss_filters.py
    ├── plot_sdss_images.py
    ├── plot_sdss_photoz.py
    ├── plot_sdss_specPCA.py
    ├── plot_sgd_separating_hyperplane.py
    └── svm_gui.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.pyc
 2 | *.so
 3 | *~
 4 | .#*
 5 | *.swp
 6 | *.swo
 7 | .DS_Store
 8 | build
 9 | 
10 | dist/
11 | doc/.nojekyll
12 | doc/_build/
13 | doc/auto_examples/
14 | doc/modules/generated/
15 | doc/datasets/generated/
16 | pip-log.txt
17 | .coverage
18 | coverage
19 | tags
20 | 
21 | examples/downloads/
22 | 
23 | *.zip
24 | *.nt.bz2
25 | *.tar.gz
26 | *.tgz
27 | *.npz
28 | *.npy


--------------------------------------------------------------------------------
/AUTHORS.rst:
--------------------------------------------------------------------------------
 1 | .. -*- mode: rst -*-
 2 | 
 3 | People
 4 | ------
 5 | 
 6 | This tutorial is brought to you by the `scikit-learn
 7 | <http://scikit-learn.org/>`_ folks, in particular:
 8 | 
 9 | .. hlist::
10 | 
11 |   * `Jake Vanderplas <http://www.astro.washington.edu/vanderplas>`_
12 |   * Olivier Grisel
13 |   * Jaques Grobler
14 |   * `Gael Varoquaux <http://gael-varoquaux.info/blog/>`_
15 | 
16 | .. _citing:
17 | 
18 | Citing the scikit-learn
19 | ------------------------
20 | 
21 | A huge amount of work goes in the scikit-learn. Researchers that invest
22 | their time in developing and maintaining the package deserve recognition
23 | with citations. In addition, the Parietal team needs the citations to the
24 | paper in order to justify paying a software engineer on the project. To
25 | garanty the future of the toolkit, if you use it, please cite it.
26 | 
27 | See the scikit-learn documentation on `how to cite
28 | <http://scikit-learn.org/stable/about.html#citing-scikit-learn>`_.
29 | 
30 | 


--------------------------------------------------------------------------------
/COPYING:
--------------------------------------------------------------------------------
 1 | New BSD License
 2 | 
 3 | Copyright (c) 2007 - 2012 The scikit-learn developers.
 4 | All rights reserved.
 5 | 
 6 | 
 7 | Redistribution and use in source and binary forms, with or without
 8 | modification, are permitted provided that the following conditions are met:
 9 | 
10 |   a. Redistributions of source code must retain the above copyright notice,
11 |      this list of conditions and the following disclaimer.
12 |   b. Redistributions in binary form must reproduce the above copyright
13 |      notice, this list of conditions and the following disclaimer in the
14 |      documentation and/or other materials provided with the distribution.
15 |   c. Neither the name of the Scikit-learn Developers  nor the names of
16 |      its contributors may be used to endorse or promote products
17 |      derived from this software without specific prior written
18 |      permission. 
19 | 
20 | 
21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 | ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
31 | DAMAGE.
32 | 
33 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # simple makefile to simplify repetetive build env management tasks under posix
 2 | 
 3 | # caution: testing won't work on windows, see README
 4 | 
 5 | PYTHON ?= python
 6 | CYTHON ?= cython
 7 | NOSETESTS ?= nosetests
 8 | CTAGS ?= ctags
 9 | 
10 | all: clean doc-noplot
11 | 
12 | clean-pyc:
13 | 	find . -name "*.pyc" | xargs rm -f
14 | 
15 | clean-so:
16 | 	find . -name "*.so" | xargs rm -f
17 | 	find . -name "*.pyd" | xargs rm -f
18 | 
19 | clean-build:
20 | 	rm -rf build
21 | 
22 | clean-ctags:
23 | 	rm -f tags
24 | 
25 | clean: clean-build clean-pyc clean-so clean-ctags
26 | 
27 | in: inplace # just a shortcut
28 | inplace:
29 | 	$(PYTHON) setup.py build_ext -i
30 | 
31 | test-doc:
32 | 	$(NOSETESTS) -s --with-doctest --doctest-tests --doctest-extension=rst \
33 | 	--doctest-extension=inc --doctest-fixtures=_fixture doc/ \
34 | 
35 | test: test-doc
36 | 
37 | trailing-spaces:
38 | 	find . -name "*.py" | xargs perl -pi -e 's/[ \t]*$$//'
39 | 
40 | cython:
41 | 	find -name "*.pyx" | xargs $(CYTHON)
42 | 
43 | ctags:
44 | 	# make tags for symbol based navigation in emacs and vim
45 | 	# Install with: sudo apt-get install exuberant-ctags
46 | 	$(CTAGS) -R *
47 | 
48 | .PHONY : doc
49 | doc:
50 | 	make -C doc html
51 | 
52 | .PHONY : doc-noplot
53 | doc-noplot:
54 | 	make -C doc html-noplot
55 | 
56 | .PHONY : pdf
57 | pdf:
58 | 	make -C doc pdf
59 | 
60 | install: 
61 | 	cd doc; make install
62 |  
63 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | .. -*- mode: rst -*-
 2 | 
 3 | Machine Learning for Astronomical Data Analysis
 4 | =================================================
 5 | 
 6 | **Note: this content is extremely out-of-date, and I would not recommend using it**
 7 | 
 8 | If you would like a more up-to-date machine learning tutorial that grew from this
 9 | content, I'd recommend the [Python Data Science Handbook](http://github.com/jakevdp/PythonDataScienceHandbook).
10 | 


--------------------------------------------------------------------------------
/doc/.gitignore:
--------------------------------------------------------------------------------
1 | AUTHORS.rst


--------------------------------------------------------------------------------
/doc/Makefile:
--------------------------------------------------------------------------------
  1 | # Makefile for Sphinx documentation
  2 | #
  3 | 
  4 | # You can set these variables from the command line.
  5 | SPHINXOPTS    =
  6 | SPHINXBUILD   = sphinx-build
  7 | PAPER         =
  8 | BUILDDIR      = _build
  9 | 
 10 | # Internal variables.
 11 | PAPEROPT_a4     = -D latex_paper_size=a4
 12 | PAPEROPT_letter = -D latex_paper_size=letter
 13 | ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 14 | 
 15 | .PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex latexpdf chan
 16 | 
 17 | all: html-noplot
 18 | 
 19 | help:
 20 | 	@echo "Please use \`make <target>' where <target> is one of"
 21 | 	@echo "  html      to make standalone HTML files"
 22 | 	@echo "  dirhtml   to make HTML files named index.html in directories"
 23 | 	@echo "  pickle    to make pickle files"
 24 | 	@echo "  json      to make JSON files"
 25 | 	@echo "  htmlhelp  to make HTML files and a HTML help project"
 26 | 	@echo "  qthelp    to make HTML files and a qthelp project"
 27 | 	@echo "  latex     to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
 28 | 	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
 29 | 	@echo "  changes   to make an overview of all changed/added/deprecated items"
 30 | 	@echo "  linkcheck to check all external links for integrity"
 31 | 	@echo "  doctest   to run all doctests embedded in the documentation (if enabled)"
 32 | 
 33 | clean:
 34 | 	-rm -rf $(BUILDDIR)/*
 35 | 	-rm -rf auto_examples/
 36 | 	-rm -rf doc/generated/*
 37 | 	-rm -rf modules/generated/*
 38 | 
 39 | html:
 40 | 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
 41 | 	touch $(BUILDDIR)/html .nojekyll
 42 | 	@echo
 43 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
 44 | 
 45 | html-noplot:
 46 | 	$(SPHINXBUILD) -D plot_gallery=False -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
 47 | 	touch $(BUILDDIR)/html .nojekyll
 48 | 	@echo
 49 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
 50 | 
 51 | dirhtml:
 52 | 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
 53 | 	touch $(BUILDDIR)/dirhtml .nojekyll
 54 | 	@echo
 55 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
 56 | 
 57 | pickle:
 58 | 	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
 59 | 	@echo
 60 | 	@echo "Build finished; now you can process the pickle files."
 61 | 
 62 | json:
 63 | 	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
 64 | 	@echo
 65 | 	@echo "Build finished; now you can process the JSON files."
 66 | 
 67 | htmlhelp:
 68 | 	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
 69 | 	@echo
 70 | 	@echo "Build finished; now you can run HTML Help Workshop with the" \
 71 | 	      ".hhp project file in $(BUILDDIR)/htmlhelp."
 72 | 
 73 | latex:
 74 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
 75 | 	@echo
 76 | 	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
 77 | 	@echo "Run \`make' in that directory to run these through (pdf)latex" \
 78 | 	      "(use \`make latexpdf' here to do that automatically)."
 79 | 
 80 | latexpdf:
 81 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
 82 | 	@echo "Running LaTeX files through pdflatex..."
 83 | 	make -C $(BUILDDIR)/latex all-pdf
 84 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
 85 | 
 86 | changes:
 87 | 	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
 88 | 	@echo
 89 | 	@echo "The overview file is in $(BUILDDIR)/changes."
 90 | 
 91 | linkcheck:
 92 | 	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
 93 | 	@echo
 94 | 	@echo "Link check complete; look for any errors in the above output " \
 95 | 	      "or in $(BUILDDIR)/linkcheck/output.txt."
 96 | 
 97 | doctest:
 98 | 	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
 99 | 	@echo "Testing of doctests in the sources finished, look at the " \
100 | 	      "results in $(BUILDDIR)/doctest/output.txt."
101 | 
102 | #zip: html pdf
103 | #	mkdir -p _build/nisl ;
104 | #	cp -r _build/html _build/nisl ;
105 | #	cp -r data _build/nisl ;
106 | #	cp nisl.pdf _build/nisl;
107 | #	zip -r _build/nisl.zip _build/nisl
108 | 
109 | download-data:
110 | 	cd data/sdss_colors && python fetch_data.py
111 | 	cd data/sdss_colors && python fetch_data.py
112 | 	cd data/sdss_colors && python fetch_data.py
113 | 
114 | pdf: latexpdf
115 | 	cp $(BUILDDIR)/latex/sklearn_tutorial.pdf ./
116 | 
117 | nbconvert:
118 | 	cd notebooks && rm -f *.v2.ipynb && python nbconvert.py *.ipynb
119 | 
120 | tar: nbconvert
121 | 	tar -czvf exercises.tgz notebooks/soln/*.py notebooks/*.ipynb data/*/*.py skeletons solutions
122 | 
123 | tar-data: nbconvert download-data
124 | 	tar -czvf exercises_data.tgz notebooks/soln/*.py notebooks/*.ipynb data skeletons solutions
125 | 
126 | install-reclone: pdf tar html
127 | 	rm -rf _build/sklearn_tutorial
128 | 	cd _build/ && \
129 |         git clone git@github.com:astroML/sklearn_tutorial && \
130 | 	cd sklearn_tutorial && git checkout gh-pages && \
131 | 	rsync -r ../html/* . && \
132 | 	git add * && \
133 | 	git commit -a -m 'Make install' && \
134 | 	git push origin gh-pages
135 | 
136 | install: pdf tar html
137 | 	if test -d _build/sklearn_tutorial; \
138 | 	then echo "using existing sklearn_tutorial directory"; \
139 | 	else cd _build && \
140 | 	git clone git@github.com:astroML/sklearn_tutorial; \
141 | 	fi && \
142 | 	cd _build/sklearn_tutorial && git checkout gh-pages && \
143 | 	rsync -r ../html/* ./ && \
144 | 	git add * && \
145 | 	git commit -a -m 'Make install' && \
146 | 	git push origin gh-pages
147 | 


--------------------------------------------------------------------------------
/doc/README:
--------------------------------------------------------------------------------
 1 | Documentation 
 2 | ----------------------
 3 | 
 4 | This section contains the full manual and web page as displayed on
 5 | the web. To generate the full web page, including
 6 | the example gallery (this might take a while):
 7 | 
 8 |     make html
 9 | 
10 | Or, if you'd rather not build the example gallery:
11 | 
12 |     make html-noplot
13 | 
14 | That should create all the doc in directory _build/html
15 | 
16 | To build the PDF manual, run
17 | 
18 |     make latexpdf
19 | 
20 | 


--------------------------------------------------------------------------------
/doc/conf.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # scikit-learn documentation build configuration file, created by
  4 | # sphinx-quickstart on Fri Jan  8 09:13:42 2010.
  5 | #
  6 | # This file is execfile()d with the current directory set to its containing
  7 | # dir.
  8 | #
  9 | # Note that not all possible configuration values are present in this
 10 | # autogenerated file.
 11 | #
 12 | # All configuration values have a default; values that are commented out
 13 | # serve to show the default.
 14 | 
 15 | import sys
 16 | import os
 17 | import shutil
 18 | 
 19 | # If extensions (or modules to document with autodoc) are in another
 20 | # directory, add these directories to sys.path here. If the directory
 21 | # is relative to the documentation root, use os.path.abspath to make it
 22 | # absolute, like shown here.
 23 | sys.path.insert(0, os.path.abspath('sphinxext'))
 24 | 
 25 | try:
 26 |     shutil.copy('../AUTHORS.rst', '.')
 27 | except IOError:
 28 |     # When nose scans this file, it is not in the right working
 29 |     # directory, and thus the line above fails
 30 |     pass
 31 | 
 32 | # -- General configuration ---------------------------------------------------
 33 | 
 34 | # Try to override the matplotlib configuration as early as possible
 35 | try:
 36 |     import gen_rst
 37 | except:
 38 |     pass
 39 | 
 40 | # Add any Sphinx extension module names here, as strings. They can be
 41 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
 42 | extensions = ['gen_rst',
 43 |               'sphinx.ext.autodoc', #'sphinx.ext.autosummary',
 44 |               'sphinx.ext.pngmath', 'sphinx.ext.intersphinx',
 45 |               ]
 46 | try:
 47 |     import numpy_ext.numpydoc
 48 |     extensions.append('numpy_ext.numpydoc')
 49 |     # With older versions of sphinx, this causes a crash
 50 |     autosummary_generate = True
 51 | except:
 52 |     # Older version of sphinx
 53 |     extensions.append('numpy_ext_old.numpydoc')
 54 | 
 55 | autodoc_default_flags = ['members', 'inherited-members']
 56 | 
 57 | # Add any paths that contain templates here, relative to this directory.
 58 | templates_path = ['templates']
 59 | 
 60 | # generate autosummary even if no references
 61 | autosummary_generate = True
 62 | 
 63 | # The suffix of source filenames.
 64 | source_suffix = '.rst'
 65 | 
 66 | # The encoding of source files.
 67 | #source_encoding = 'utf-8'
 68 | 
 69 | # Generate the plots for the gallery
 70 | plot_gallery = True
 71 | 
 72 | # The master toctree document.
 73 | master_doc = 'index'
 74 | 
 75 | # General information about the project.
 76 | project = u'AtroML'
 77 | copyright = u'scikit-learn developers'
 78 | 
 79 | # The version info for the project you're documenting, acts as replacement for
 80 | # |version| and |release|, also used in various other places throughout the
 81 | # built documents.
 82 | #
 83 | # The short X.Y version.
 84 | version = ''
 85 | # The full version, including alpha/beta/rc tags.
 86 | release = "Scipy2012"
 87 | 
 88 | # The language for content autogenerated by Sphinx. Refer to documentation
 89 | # for a list of supported languages.
 90 | language = 'en'
 91 | 
 92 | # There are two options for replacing |today|: either, you set today to some
 93 | # non-false value, then it is used:
 94 | #today = ''
 95 | # Else, today_fmt is used as the format for a strftime call.
 96 | #today_fmt = '%B %d, %Y'
 97 | 
 98 | # List of documents that shouldn't be included in the build.
 99 | #unused_docs = []
100 | 
101 | # List of directories, relative to source directory, that shouldn't be
102 | # searched for source files.
103 | exclude_trees = ['_build', 'templates', 'includes']
104 | 
105 | # The reST default role (used for this markup: `text`) to use for all
106 | # documents.
107 | #default_role = None
108 | 
109 | # If true, '()' will be appended to :func: etc. cross-reference text.
110 | add_function_parentheses = False
111 | 
112 | # If true, the current module name will be prepended to all description
113 | # unit titles (such as .. function::).
114 | #add_module_names = True
115 | 
116 | # If true, sectionauthor and moduleauthor directives will be shown in the
117 | # output. They are ignored by default.
118 | #show_authors = False
119 | 
120 | # The name of the Pygments (syntax highlighting) style to use.
121 | pygments_style = 'sphinx'
122 | 
123 | # A list of ignored prefixes for module index sorting.
124 | #modindex_common_prefix = []
125 | 
126 | 
127 | # -- Options for HTML output -------------------------------------------------
128 | 
129 | # The theme to use for HTML and HTML Help pages.  Major themes that come with
130 | # Sphinx are currently 'default' and 'sphinxdoc'.
131 | html_theme = 'scikit-learn'
132 | 
133 | # Theme options are theme-specific and customize the look and feel of a theme
134 | # further.  For a list of options available for each theme, see the
135 | # documentation.
136 | html_theme_options = {'oldversion':False, 'collapsiblesidebar': True}
137 | 
138 | # Add any paths that contain custom themes here, relative to this directory.
139 | html_theme_path = ['themes']
140 | 
141 | 
142 | # The name for this set of Sphinx documents.  If None, it defaults to
143 | # "<project> v<release> documentation".
144 | html_title = "Machine Learning for Astronomy with Scikit-learn"
145 | 
146 | # A shorter title for the navigation bar.  Default is the same as html_title.
147 | html_short_title = 'Scikit-learn Astronomy Tutorial'
148 | 
149 | # The name of an image file (relative to this directory) to place at the top
150 | # of the sidebar.
151 | html_logo = 'logos/scikit-learn-logo-small.png'
152 | 
153 | # The name of an image file (within the static path) to use as favicon of the
154 | # docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
155 | # pixels large.
156 | html_favicon = 'logos/favicon.ico'
157 | 
158 | # Add any paths that contain custom static files (such as style sheets) here,
159 | # relative to this directory. They are copied after the builtin static files,
160 | # so a file named "default.css" will overwrite the builtin "default.css".
161 | html_static_path = ['images']
162 | 
163 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
164 | # using the given strftime format.
165 | #html_last_updated_fmt = '%b %d, %Y'
166 | 
167 | # If true, SmartyPants will be used to convert quotes and dashes to
168 | # typographically correct entities.
169 | #html_use_smartypants = True
170 | 
171 | # Custom sidebar templates, maps document names to template names.
172 | #html_sidebars = {}
173 | 
174 | # Additional templates that should be rendered to pages, maps page names to
175 | # template names.
176 | #html_additional_pages = {}
177 | 
178 | # If false, no module index is generated.
179 | html_use_modindex = False
180 | 
181 | # If false, no index is generated.
182 | html_use_index = False
183 | 
184 | # If true, the index is split into individual pages for each letter.
185 | #html_split_index = False
186 | 
187 | # If true, links to the reST sources are added to the pages.
188 | #html_show_sourcelink = True
189 | 
190 | # If true, an OpenSearch description file will be output, and all pages will
191 | # contain a <link> tag referring to it.  The value of this option must be the
192 | # base URL from which the finished HTML is served.
193 | #html_use_opensearch = ''
194 | 
195 | # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml").
196 | #html_file_suffix = ''
197 | 
198 | # Output file base name for HTML help builder.
199 | htmlhelp_basename = 'PythonScientic'
200 | 
201 | 
202 | # -- Options for LaTeX output ------------------------------------------------
203 | 
204 | # The paper size ('letter' or 'a4').
205 | #latex_paper_size = 'letter'
206 | 
207 | # The font size ('10pt', '11pt' or '12pt').
208 | #latex_font_size = '10pt'
209 | 
210 | # Grouping the document tree into LaTeX files. List of tuples
211 | # (source start file, target name, title, author, documentclass
212 | # [howto/manual]).
213 | latex_documents = [
214 |   ('index', 'sklearn_tutorial.tex', u'Astronomy with scikit-learn',
215 |    ur"""Jacob VanderPlas"""
216 |    + r"\\\relax ~\\\relax http://astroML.github.com/sklearn\_tutorial/",
217 |    'manual'),
218 | ]
219 | 
220 | # The name of an image file (relative to this directory) to place at the top of
221 | # the title page.
222 | latex_logo = "logos/scikit-learn-logo.png"
223 | 
224 | # For "manual" documents, if this is true, then toplevel headings are parts,
225 | # not chapters.
226 | #latex_use_parts = False
227 | 
228 | # Additional stuff for the LaTeX preamble.
229 | latex_preamble = r"""
230 | \usepackage{amsmath}\usepackage{amsfonts}\usepackage{bm}\usepackage{morefloats}
231 | \let\oldfootnote\footnote
232 | \def\footnote#1{\oldfootnote{\small #1}}
233 | """
234 | 
235 | # Documents to append as an appendix to all manuals.
236 | #latex_appendices = []
237 | latex_elements = {
238 |   'classoptions': ',oneside',
239 |   'babel': '\\usepackage[english]{babel}',
240 |   # Get completely rid of index
241 |   'printindex': '',
242 | }
243 | 
244 | # If false, no module index is generated.
245 | latex_use_modindex = False
246 | latex_domain_indices = False
247 | 
248 | # Show the page numbers in the references
249 | latex_show_pagerefs = True
250 | 
251 | # Show URLs in footnotes
252 | latex_show_urls = 'footnote'
253 | 
254 | trim_doctests_flags = True
255 | 
256 | # Intersphinx mapping to the scikit-learn docs
257 | intersphinx_mapping = {'sklearn': ('http://scikit-learn.org/stable', None)}
258 | 


--------------------------------------------------------------------------------
/doc/data/sdss_colors/fetch_data.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import urllib2
 3 | import numpy as np
 4 | 
 5 | DTYPE_TRAIN = [('u-g', np.float32),
 6 |                ('g-r', np.float32),
 7 |                ('r-i', np.float32),
 8 |                ('i-z', np.float32),
 9 |                ('redshift', np.float32)]
10 | 
11 | DTYPE_TEST = [('u-g', np.float32),
12 |                ('g-r', np.float32),
13 |                ('r-i', np.float32),
14 |                ('i-z', np.float32),
15 |                ('label', np.int32)]
16 | 
17 | SDSS_COLORS_URL = "http://www.astro.washington.edu/users/vanderplas/pydata/"
18 | TRAIN_FILE = 'sdssdr6_colors_class_train.dat'
19 | TEST_FILE = 'sdssdr6_colors_class.200000.dat'
20 | 
21 | # data directory is password protected so the public can't access it    
22 | password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
23 | password_mgr.add_password(None, SDSS_COLORS_URL, 'pydata', 'astroML')
24 | handler = urllib2.HTTPBasicAuthHandler(password_mgr)
25 | opener = urllib2.build_opener(handler)
26 | 
27 | # download training data
28 | destination = TRAIN_FILE.rstrip('.dat') + '.npy'
29 | if not os.path.exists(destination):
30 |     url = SDSS_COLORS_URL + TRAIN_FILE
31 |     print "downloading data from", url
32 |     fhandle = opener.open(url)
33 |     np.save(destination, np.loadtxt(opener.open(url), dtype=DTYPE_TRAIN))
34 | 
35 | # download test data
36 | destination = TEST_FILE.rstrip('.dat') + '.npy'
37 | if not os.path.exists(destination):
38 |     url = SDSS_COLORS_URL + TEST_FILE
39 |     print "downloading data from", url
40 |     fhandle = opener.open(url)
41 |     np.save(destination, np.loadtxt(opener.open(url), dtype=DTYPE_TEST))
42 | 
43 | 


--------------------------------------------------------------------------------
/doc/data/sdss_colors/scatter_colors.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pylab as pl
 3 | 
 4 | data = np.load('sdssdr6_colors_class_train.npy')
 5 | 
 6 | # only plot 10000 points: otherwise it takes too much memory
 7 | np.random.shuffle(data)
 8 | data = data[:10000]
 9 | 
10 | redshift = data['redshift']
11 | 
12 | print "%i qsos" % np.sum(redshift > 0)
13 | print "%i stars" % np.sum(redshift == 0)
14 | 
15 | kwargs = dict(s=1, c=(redshift > 0), lw=0)
16 | 
17 | pl.figure(figsize=(6, 8))
18 | 
19 | pl.subplot(311).scatter(data['u-g'], data['g-r'], **kwargs)
20 | 
21 | pl.subplot(312).scatter(data['g-r'], data['r-i'], **kwargs)
22 | 
23 | pl.subplot(313).scatter(data['r-i'], data['i-z'], **kwargs)
24 | 
25 | pl.show()
26 | 


--------------------------------------------------------------------------------
/doc/data/sdss_photoz/fetch_data.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This file fetches photometric observations associated with SDSS galaxy
 3 | spectra which have spectroscopically confirmed redshifts.  This directly
 4 | queries the SDSS database for the information, and thus can take a few
 5 | minutes to run.
 6 | """
 7 | 
 8 | import os
 9 | import urllib, urllib2
10 | import numpy as np
11 | 
12 | # Here's how the data can be downloaded directly from the SDSS server.
13 | # This route is limited to N = 50000, so we've done this separately
14 | def fetch_data_sql(N = 50000):
15 |     URL = 'http://cas.sdss.org/public/en/tools/search/x_sql.asp'
16 |     archive_file = 'sdss_galaxy_colors.npy'
17 | 
18 |     dtype = [('mags', '5float32'),
19 |              ('specClass', 'int8'),
20 |              ('z', 'float32'),
21 |              ('zerr', 'float32')]
22 | 
23 |     def sql_query(sql_str, url=URL, format='csv'):
24 |         """Execute SQL query"""
25 |         # remove comments from string
26 |         sql_str = ' \n'.join(map(lambda x: x.split('--')[0],
27 |                                  sql_str.split('\n')))
28 |         params = urllib.urlencode(dict(cmd=sql_str, format=format))
29 |         return urllib.urlopen(url + '?%s' % params)
30 | 
31 |     query_text = ('\n'.join(
32 |             ("SELECT TOP %i" % N,
33 |              "   modelMag_u, modelMag_g, modelMag_r, modelMag_i, modelMag_z, specClass, z, zErr",
34 |              "FROM SpecPhoto",
35 |              "WHERE ",
36 |              "   modelMag_u BETWEEN 0 AND 19.6",
37 |              "   AND modelMag_g BETWEEN 0 AND 20",
38 |              "   AND zerr BETWEEN 0 and 0.03",
39 |              "   AND specClass > 1 -- not UNKNOWN or STAR",
40 |              "   AND specClass <> 5 -- not SKY",
41 |              "   AND specClass <> 6 -- not STAR_LATE")))
42 | 
43 | 
44 |     if not os.path.exists(archive_file):
45 |         print "querying for %i objects" % N
46 |         print query_text
47 |         output = sql_query(query_text)
48 |         print "finished.  Processing & saving data"
49 |         try:
50 |             data = np.loadtxt(output, delimiter=',', skiprows=1, dtype=DTYPE)
51 |         except:
52 |             raise ValueError(output.read())
53 |         np.save(archive_file, data)
54 |     else:
55 |         print "data already on disk"
56 | 
57 | 
58 | DATA_URL = ('http://www.astro.washington.edu/users/'
59 |             'vanderplas/pydata/sdss_photoz.npy')
60 | LOCAL_FILE = 'sdss_photoz.npy'
61 | 
62 | # data directory is password protected so the public can't access it    
63 | password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
64 | password_mgr.add_password(None, DATA_URL, 'pydata', 'astroML')
65 | handler = urllib2.HTTPBasicAuthHandler(password_mgr)
66 | opener = urllib2.build_opener(handler)
67 | 
68 | # download training data
69 | if not os.path.exists(LOCAL_FILE):
70 |     print "downloading data from", DATA_URL
71 |     fhandle = opener.open(DATA_URL)
72 |     open(LOCAL_FILE, 'wb').write(fhandle.read())
73 | 


--------------------------------------------------------------------------------
/doc/data/sdss_spectra/fetch_data.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import urllib2
 3 | import numpy as np
 4 | 
 5 | DATA_URL = ('http://www.astro.washington.edu/users/'
 6 |             'vanderplas/pydata/spec4000_corrected.npz')
 7 | LOCAL_FILE = 'spec4000_corrected.npz'
 8 | 
 9 | # data directory is password protected so the public can't access it    
10 | password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
11 | password_mgr.add_password(None, DATA_URL, 'pydata', 'astroML')
12 | handler = urllib2.HTTPBasicAuthHandler(password_mgr)
13 | opener = urllib2.build_opener(handler)
14 | 
15 | # download training data
16 | if not os.path.exists(LOCAL_FILE):
17 |     print "downloading data from", DATA_URL
18 |     fhandle = opener.open(DATA_URL)
19 |     open(LOCAL_FILE, 'wb').write(fhandle.read())
20 | 


--------------------------------------------------------------------------------
/doc/images/blank_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astroML/sklearn_tutorial/0909361122d5b96379007516b55c6248afa86cfc/doc/images/blank_image.png


--------------------------------------------------------------------------------
/doc/includes/big_toc_css.rst:
--------------------------------------------------------------------------------
 1 | ..  
 2 |     File to ..include in a document with a big table of content, to give
 3 |     it 'style'
 4 | 
 5 | .. raw:: html
 6 | 
 7 |   <style type="text/css">
 8 |     div.bodywrapper blockquote {
 9 |         margin: 0 ;
10 |     }
11 | 
12 |     div.toctree-wrapper ul {
13 | 	margin-top: 0 ;
14 | 	margin-bottom: 0 ;
15 | 	padding-left: 10px ;
16 |     }
17 | 
18 |     li.toctree-l1 {
19 |         padding: 0 0 0.5em 0 ;
20 |         list-style-type: none;
21 |         font-size: 150% ;
22 | 	font-weight: bold;
23 |         }
24 | 
25 |     li.toctree-l1 ul {
26 | 	padding-left: 40px ;
27 |     }
28 | 
29 |     li.toctree-l2 {
30 |         font-size: 70% ;
31 |         list-style-type: square;
32 | 	font-weight: normal;
33 |         }
34 | 
35 |     li.toctree-l3 {
36 |         font-size: 85% ;
37 |         list-style-type: circle;
38 | 	font-weight: normal;
39 |         }
40 |  
41 |   </style>
42 | 
43 | 
44 | 
45 | 


--------------------------------------------------------------------------------
/doc/includes/bigger_toc_css.rst:
--------------------------------------------------------------------------------
 1 | ..  
 2 |     File to ..include in a document with a very big table of content, to 
 3 |     give it 'style'
 4 | 
 5 | .. raw:: html
 6 | 
 7 |   <style type="text/css">
 8 |     div.bodywrapper blockquote {
 9 |         margin: 0 ;
10 |     }
11 | 
12 |     div.toctree-wrapper ul {
13 | 	margin: 0 ;
14 | 	padding-left: 0px ;
15 |     }
16 | 
17 |     li.toctree-l1 {
18 |         padding: 0 ;
19 |         list-style-type: none;
20 |         font-size: 150% ;
21 | 	font-family: Arial, sans-serif;
22 | 	background-color: #BED4EB;
23 | 	font-weight: normal;
24 | 	color: #212224;
25 | 	margin-left : 0;
26 | 	font-weight: bold;
27 |         }
28 | 
29 |     li.toctree-l1 a {
30 |         padding: 0 0 0 10px ;
31 |     }
32 |  
33 |     li.toctree-l2 {
34 |         padding: 0.25em 0 0.25em 0 ;
35 |         list-style-type: none;
36 | 	background-color: #FFFFFF;
37 |         font-size: 90% ;
38 | 	font-weight: bold;
39 |         }
40 | 
41 |     li.toctree-l2 ul {
42 | 	padding-left: 40px ;
43 |     }
44 | 
45 |     li.toctree-l3 {
46 |         font-size: 70% ;
47 |         list-style-type: none;
48 | 	font-weight: normal;
49 |         }
50 | 
51 |     li.toctree-l4 {
52 |         font-size: 85% ;
53 |         list-style-type: none;
54 | 	font-weight: normal;
55 |         }
56 |  
57 |   </style>
58 | 
59 | 
60 | 
61 | 


--------------------------------------------------------------------------------
/doc/index.rst:
--------------------------------------------------------------------------------
  1 | 
  2 | ..
  3 |     We are putting the title as a raw HTML so that it doesn't appear in
  4 |     the contents
  5 |     
  6 | .. raw:: html
  7 | 
  8 |     <h1>Tutorial: Machine Learning for Astronomy with Scikit-learn</h1>
  9 |     <style type="text/css">
 10 |     p {
 11 |         margin: 7px 0 7px 0 ;
 12 |     }
 13 |     span.linkdescr a {
 14 |         color:  #3E4349 ;
 15 |     }
 16 |     div.banner img {
 17 |         vertical-align: middle;
 18 |     }
 19 |     </style>
 20 | 
 21 | ..  
 22 |    Here we are building a banner: a javascript selects randomly 4 images in 
 23 |    the list
 24 | 
 25 | .. only:: html
 26 | 
 27 |     .. |banner1| image:: auto_examples/images/plot_sdss_filters_1.png
 28 |        :height: 120
 29 |        :target: auto_examples/plot_sdss_filters.html
 30 | 
 31 |     .. |banner2| image:: auto_examples/images/plot_sdss_filters_2.png
 32 |        :height: 120
 33 |        :target: auto_examples/plot_sdss_filters.html
 34 | 
 35 |     .. |banner3| image:: auto_examples/images/plot_sdss_images_1.png
 36 |        :height: 90
 37 |        :target: auto_examples/plot_sdss_images.html
 38 | 
 39 |     .. |banner4| image:: auto_examples/images/plot_ML_flow_chart_1.png
 40 |        :height: 120
 41 |        :target: auto_examples/plot_ML_flow_chart.html
 42 | 
 43 |     .. |banner5| image:: auto_examples/images/plot_sdss_photoz_1.png
 44 |        :height: 120
 45 |        :target: auto_examples/plot_sdss_photoz.html
 46 | 
 47 |     .. |banner6| image:: auto_examples/images/plot_sdss_specPCA_1.png
 48 |        :height: 120
 49 |        :target: auto_examples/plot_sdss_specPCA.html
 50 | 
 51 |     .. |center-div| raw:: html
 52 | 
 53 |         <div style="text-align: center; vertical-align: middle; margin: -7px 0 -10px 0;" id="banner" class="banner">
 54 | 
 55 |     .. |end-div| raw:: html
 56 | 
 57 |         </div>
 58 | 
 59 |         <SCRIPT>
 60 |         // Function to select 4 imgs in random order from a div
 61 |         function shuffle(e) {       // pass the divs to the function
 62 |           var replace = $('<div>');
 63 |           var size = 4;
 64 |           var num_choices = e.size();
 65 | 
 66 |           while (size >= 1 && num_choices >= 1) {
 67 |             var rand = Math.floor(Math.random() * num_choices);
 68 |             var temp = e.get(rand);      // grab a random div from our set
 69 |             replace.append(temp);        // add the selected div to our new set
 70 |             e = e.not(temp); // remove our selected div from the main set
 71 |             size--;
 72 |             num_choices--;
 73 |           }
 74 |           $('#banner').html(replace.html() ); // update our container div 
 75 |                                               // with the new, randomized divs
 76 |         }
 77 |         shuffle ($('#banner a.external'));
 78 |         </SCRIPT>
 79 | 
 80 |     |center-div| |banner1| |banner2| |banner3| |banner4| |banner5| |banner6| |end-div|
 81 | 
 82 | .. only:: html
 83 | 
 84 | .. only:: html
 85 | 
 86 |  .. sidebar:: Download 
 87 |     
 88 |     * Source code: `github <https://github.com/astroML/sklearn_tutorial>`_
 89 | 
 90 |     * PDF of tutorial: :download:`sklearn_tutorial.pdf`
 91 |         
 92 |     * Tarball of exercises and notebooks: :download:`exercises.tgz`
 93 | 
 94 | 
 95 | .. sectionauthor:: Jake Vanderplas <vanderplas@astro.washington.edu>
 96 | 
 97 | 
 98 | .. topic:: AstroML
 99 |    
100 |    For more information on machine learning for Astronomy, see the
101 |    `astroML <http://astroML.github.com>`_ code and examples.
102 | 
103 | .. topic:: Machine Learning for Astronomy with scikit-learn
104 | 
105 |    This tutorial offers a brief introduction to the fields of machine
106 |    learning and statistical data analysis, and their application to
107 |    several problems in the field of astronomy.  These learning tasks
108 |    are enabled by the tools available in the open-source package
109 |    `scikit-learn`_.
110 | 
111 |    `scikit-learn`_ is a Python module integrating classic machine
112 |    learning algorithms in the tightly-knit world of scientific Python
113 |    packages (`numpy`_, `scipy`_, `matplotlib`_).  It aims to provide
114 |    simple and efficient solutions to learning problems that are accessible
115 |    to everybody and reusable in various contexts:
116 |    **machine-learning as a versatile tool for science and engineering**.
117 | 
118 |    Many of the examples and exercises in this tutorial require the
119 |    `ipython notebook`_, a tool which provides an intuitive web-based
120 |    interactive environment for scientific python.  Some of the material
121 |    in the notebooks is duplicated in the following pages, but ipython
122 |    notebook is required for some parts. For information on how to download
123 |    the associated notebooks, see the :ref:`sklearn_tutorial_setup` page.
124 | 
125 | .. _`scikit-learn`: http://www.scikit-learn.org
126 | .. _`numpy`: http://numpy.scipy.org
127 | .. _`scipy`: http://www.scipy.org
128 | .. _`matplotlib`: http://matplotlib.sourceforge.net
129 | .. _`ipython notebook`: http://ipython.org/ipython-doc/stable/interactive/htmlnotebook.html
130 | 
131 | .. include:: includes/big_toc_css.rst
132 | 
133 | .. note:: This document is meant to be used with **scikit-learn version
134 |    0.11+**.  Find the latest version `here <scikit-learn>`_.
135 | 
136 | .. toctree::
137 |    :numbered:
138 |    :maxdepth: 2
139 | 
140 |    setup
141 |    general_concepts
142 |    practical
143 |    classification
144 |    regression
145 |    dimensionality_reduction
146 |    exercises
147 |    auto_examples/index
148 | 
149 | .. toctree::
150 |    :hidden:
151 | 
152 |    AUTHORS
153 | 
154 | ..  
155 |  FIXME: I need the link below to make sure the banner gets copied to the
156 |  target directory.
157 | 
158 | 
159 | .. only:: html
160 | 
161 |  .. raw:: html
162 |  
163 |    <div style='visibility: hidden ; height=0'>
164 | 
165 |  .. raw:: html
166 |  
167 |    </div>
168 | 
169 | 


--------------------------------------------------------------------------------
/doc/logos/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astroML/sklearn_tutorial/0909361122d5b96379007516b55c6248afa86cfc/doc/logos/favicon.ico


--------------------------------------------------------------------------------
/doc/logos/identity.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astroML/sklearn_tutorial/0909361122d5b96379007516b55c6248afa86cfc/doc/logos/identity.pdf


--------------------------------------------------------------------------------
/doc/logos/scikit-learn-logo-small.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astroML/sklearn_tutorial/0909361122d5b96379007516b55c6248afa86cfc/doc/logos/scikit-learn-logo-small.png


--------------------------------------------------------------------------------
/doc/logos/scikit-learn-logo-thumb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astroML/sklearn_tutorial/0909361122d5b96379007516b55c6248afa86cfc/doc/logos/scikit-learn-logo-thumb.png


--------------------------------------------------------------------------------
/doc/logos/scikit-learn-logo.bmp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astroML/sklearn_tutorial/0909361122d5b96379007516b55c6248afa86cfc/doc/logos/scikit-learn-logo.bmp


--------------------------------------------------------------------------------
/doc/logos/scikit-learn-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astroML/sklearn_tutorial/0909361122d5b96379007516b55c6248afa86cfc/doc/logos/scikit-learn-logo.png


--------------------------------------------------------------------------------
/doc/logos/scikit-learn-logo.svg:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="utf-8"?>
 2 | <!-- Generator: Adobe Illustrator 14.0.0, SVG Export Plug-In . SVG Version: 6.00 Build 43363)  -->
 3 | <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
 4 | <svg version="1.1" id="Layer_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px"
 5 | 	 width="792px" height="612px" viewBox="0 0 792 612" enable-background="new 0 0 792 612" xml:space="preserve">
 6 | <g>
 7 | 	<path fill="#F89939" d="M333.32,347.348c33.869-33.867,39.498-83.146,12.572-110.07c-26.922-26.921-76.199-21.293-110.066,12.572
 8 | 		c-33.867,33.866-24.07,98.568-12.57,110.07C232.549,369.213,299.455,381.213,333.32,347.348z"/>
 9 | 	<path fill="#3499CD" d="M194.35,298.411c-19.648-19.648-48.242-22.919-63.867-7.295c-15.621,15.622-12.355,44.22,7.297,63.865
10 | 		c19.652,19.654,57.195,13.969,63.863,7.295C207.039,356.889,214.004,318.061,194.35,298.411z"/>
11 | </g>
12 | <g>
13 | 	<g>
14 | 		<path fill="#010101" d="M262.143,339.047c-3.471,3.195-6.516,5.553-9.133,7.068c-2.617,1.52-5.113,2.279-7.488,2.279
15 | 			c-2.732,0-4.936-1.059-6.607-3.178c-1.674-2.121-2.508-4.965-2.508-8.543c0-5.361,1.162-11.797,3.486-19.301
16 | 			c2.32-7.51,5.145-14.43,8.463-20.761l9.729-3.602c0.305-0.102,0.537-0.154,0.691-0.154c0.738,0,1.348,0.544,1.816,1.627
17 | 			c0.473,1.088,0.711,2.55,0.711,4.388c0,5.209-1.199,10.252-3.602,15.129c-2.402,4.879-6.154,10.086-11.26,15.627
18 | 			c-0.205,2.656-0.307,4.48-0.307,5.477c0,2.223,0.408,3.982,1.225,5.285c0.818,1.305,1.902,1.953,3.256,1.953
19 | 			c1.381,0,2.848-0.494,4.406-1.49c1.555-0.998,3.93-3.064,7.121-6.207V339.047z M247.475,324.074
20 | 			c3.242-3.605,5.875-7.648,7.891-12.121c2.016-4.475,3.023-8.324,3.023-11.549c0-0.94-0.139-1.704-0.418-2.278
21 | 			c-0.281-0.575-0.641-0.864-1.074-0.864c-0.941,0-2.316,2.352-4.117,7.057C250.979,309.023,249.211,315.609,247.475,324.074z"/>
22 | 		<path fill="#010101" d="M290.795,339.047c-3.242,3.195-6.152,5.553-8.732,7.068c-2.58,1.52-5.424,2.279-8.541,2.279
23 | 			c-3.473,0-6.275-1.111-8.41-3.33c-2.131-2.225-3.195-5.146-3.195-8.773c0-5.412,1.875-10.309,5.633-14.688
24 | 			c3.75-4.381,7.914-6.57,12.484-6.57c2.375,0,4.275,0.615,5.707,1.84c1.43,1.227,2.145,2.834,2.145,4.826
25 | 			c0,5.287-5.617,9.574-16.852,12.869c1.02,4.977,3.688,7.469,8.004,7.469c1.686,0,3.293-0.453,4.824-1.357
26 | 			c1.535-0.908,3.844-2.922,6.934-6.035V339.047z M270.725,331.963c6.535-1.84,9.805-5.234,9.805-10.188
27 | 			c0-2.451-0.895-3.676-2.68-3.676c-1.686,0-3.293,1.281-4.824,3.85C271.49,324.514,270.725,327.85,270.725,331.963z"/>
28 | 		<path fill="#010101" d="M331.701,339.047c-4.086,3.881-7.01,6.412-8.77,7.588c-1.762,1.174-3.447,1.76-5.057,1.76
29 | 			c-4.035,0-5.936-3.561-5.707-10.686c-2.553,3.65-4.91,6.344-7.068,8.084c-2.156,1.736-4.383,2.602-6.684,2.602
30 | 			c-2.244,0-4.152-1.051-5.725-3.158s-2.354-4.691-2.354-7.758c0-3.828,1.051-7.48,3.156-10.955
31 | 			c2.109-3.473,4.809-6.279,8.102-8.424s6.207-3.219,8.732-3.219c3.193,0,5.428,1.469,6.705,4.404l7.828-4.326h2.148l-3.381,11.221
32 | 			c-1.736,5.645-2.607,9.514-2.607,11.607c0,2.195,0.777,3.293,2.336,3.293c0.992,0,2.09-0.529,3.291-1.59s2.883-2.676,5.053-4.846
33 | 			V339.047z M303.664,341.156c2.553,0,4.959-2.176,7.223-6.529c2.26-4.355,3.389-8.373,3.389-12.049c0-1.428-0.322-2.547-0.957-3.35
34 | 			c-0.641-0.807-1.496-1.207-2.566-1.207c-2.555,0-4.977,2.17-7.258,6.512c-2.285,4.342-3.43,8.338-3.43,11.986
35 | 			c0,1.381,0.34,2.498,1.016,3.354S302.615,341.156,303.664,341.156z"/>
36 | 		<path fill="#010101" d="M360.314,339.047c-6.41,6.281-11.352,9.424-14.824,9.424c-1.559,0-2.875-0.658-3.945-1.969
37 | 			c-1.07-1.316-1.609-2.945-1.609-4.887c0-3.6,1.93-8.424,5.785-14.477c-1.891,0.971-3.957,1.645-6.205,2.029
38 | 			c-1.66,3.064-4.266,6.359-7.814,9.879h-0.879v-3.443c1.99-2.068,3.791-4.291,5.4-6.666c-2.199-0.971-3.295-2.414-3.295-4.326
39 | 			c0-1.969,0.668-4.068,2.012-6.305c1.34-2.232,3.184-3.348,5.535-3.348c1.992,0,2.986,1.018,2.986,3.062
40 | 			c0,1.609-0.574,3.906-1.725,6.895c4.238-0.461,7.941-3.701,11.109-9.729l3.484-0.154l-3.562,9.805
41 | 			c-1.48,4.137-2.438,6.955-2.871,8.447s-0.652,2.816-0.652,3.963c0,1.074,0.25,1.932,0.746,2.566
42 | 			c0.498,0.643,1.17,0.959,2.012,0.959c0.918,0,1.801-0.314,2.643-0.936c0.842-0.631,2.732-2.359,5.67-5.193V339.047z"/>
43 | 		<path fill="#010101" d="M397.928,339.047c-5.898,6.234-10.957,9.348-15.168,9.348c-1.711,0-3.09-0.6-4.137-1.801
44 | 			c-1.049-1.199-1.572-2.807-1.572-4.824c0-2.732,1.125-6.908,3.373-12.523c1.199-3.014,1.801-4.932,1.801-5.746
45 | 			c0-0.818-0.322-1.227-0.957-1.227c-0.357,0-0.832,0.18-1.418,0.535c-0.539,0.357-1.164,0.859-1.879,1.496
46 | 			c-0.637,0.586-1.354,1.301-2.145,2.141c-0.691,0.721-1.432,1.537-2.219,2.453l-2.148,2.492c-0.943,1.148-1.531,2.359-1.76,3.637
47 | 			c-0.385,2.17-0.639,4.164-0.768,5.979c-0.078,1.35-0.115,3.174-0.115,5.477l-8.465,1.988c-0.279-3.447-0.422-6.014-0.422-7.697
48 | 			c0-4.111,0.479-8.006,1.438-11.682c0.957-3.68,2.494-7.814,4.615-12.412l9.344-1.799c-1.965,5.287-3.254,9.447-3.867,12.484
49 | 			c4.188-4.672,7.508-7.906,9.969-9.709c2.457-1.801,4.645-2.697,6.557-2.697c1.299,0,2.385,0.49,3.25,1.471
50 | 			c0.869,0.982,1.301,2.215,1.301,3.689c0,2.449-1.098,6.484-3.291,12.104c-1.508,3.854-2.262,6.355-2.262,7.51
51 | 			c0,1.537,0.627,2.305,1.881,2.305c1.867,0,4.891-2.465,9.064-7.393V339.047z"/>
52 | 	</g>
53 | </g>
54 | <rect x="273.943" y="285.613" fill="none" width="225.242" height="83.948"/>
55 | <text transform="matrix(1 0 0 1 273.9414 302.2061)" fill="#FFFFFF" font-family="'Helvetica'" font-size="23.0795">scikits</text>
56 | <rect x="345.568" y="357.979" fill="none" width="327.432" height="42.963"/>
57 | <text transform="matrix(1 0 0 1 345.5684 375.5176)" fill="#010101" font-family="'Verdana'" font-size="23.0795">machine learning in Python</text>
58 | </svg>
59 | 


--------------------------------------------------------------------------------
/doc/make.bat:
--------------------------------------------------------------------------------
  1 | @ECHO OFF
  2 | 
  3 | REM Command file for Sphinx documentation
  4 | 
  5 | set SPHINXBUILD=sphinx-build
  6 | set BUILDDIR=_build
  7 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
  8 | if NOT "%PAPER%" == "" (
  9 | 	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
 10 | )
 11 | 
 12 | if "%1" == "" goto help
 13 | 
 14 | if "%1" == "help" (
 15 | 	:help
 16 | 	echo.Please use `make ^<target^>` where ^<target^> is one of
 17 | 	echo.  html      to make standalone HTML files
 18 | 	echo.  dirhtml   to make HTML files named index.html in directories
 19 | 	echo.  pickle    to make pickle files
 20 | 	echo.  json      to make JSON files
 21 | 	echo.  htmlhelp  to make HTML files and a HTML help project
 22 | 	echo.  qthelp    to make HTML files and a qthelp project
 23 | 	echo.  latex     to make LaTeX files, you can set PAPER=a4 or PAPER=letter
 24 | 	echo.  changes   to make an overview over all changed/added/deprecated items
 25 | 	echo.  linkcheck to check all external links for integrity
 26 | 	echo.  doctest   to run all doctests embedded in the documentation if enabled
 27 | 	goto end
 28 | )
 29 | 
 30 | if "%1" == "clean" (
 31 | 	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
 32 | 	del /q /s %BUILDDIR%\*
 33 | 	goto end
 34 | )
 35 | 
 36 | if "%1" == "html" (
 37 | 	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
 38 | 	echo.
 39 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
 40 | 	goto end
 41 | )
 42 | 
 43 | if "%1" == "dirhtml" (
 44 | 	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
 45 | 	echo.
 46 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
 47 | 	goto end
 48 | )
 49 | 
 50 | if "%1" == "pickle" (
 51 | 	%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
 52 | 	echo.
 53 | 	echo.Build finished; now you can process the pickle files.
 54 | 	goto end
 55 | )
 56 | 
 57 | if "%1" == "json" (
 58 | 	%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
 59 | 	echo.
 60 | 	echo.Build finished; now you can process the JSON files.
 61 | 	goto end
 62 | )
 63 | 
 64 | if "%1" == "htmlhelp" (
 65 | 	%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
 66 | 	echo.
 67 | 	echo.Build finished; now you can run HTML Help Workshop with the ^
 68 | .hhp project file in %BUILDDIR%/htmlhelp.
 69 | 	goto end
 70 | )
 71 | 
 72 | if "%1" == "qthelp" (
 73 | 	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
 74 | 	echo.
 75 | 	echo.Build finished; now you can run "qcollectiongenerator" with the ^
 76 | .qhcp project file in %BUILDDIR%/qthelp, like this:
 77 | 	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\nisl.qhcp
 78 | 	echo.To view the help file:
 79 | 	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\nisl.ghc
 80 | 	goto end
 81 | )
 82 | 
 83 | if "%1" == "latex" (
 84 | 	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
 85 | 	echo.
 86 | 	echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
 87 | 	goto end
 88 | )
 89 | 
 90 | if "%1" == "changes" (
 91 | 	%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
 92 | 	echo.
 93 | 	echo.The overview file is in %BUILDDIR%/changes.
 94 | 	goto end
 95 | )
 96 | 
 97 | if "%1" == "linkcheck" (
 98 | 	%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
 99 | 	echo.
100 | 	echo.Link check complete; look for any errors in the above output ^
101 | or in %BUILDDIR%/linkcheck/output.txt.
102 | 	goto end
103 | )
104 | 
105 | if "%1" == "doctest" (
106 | 	%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
107 | 	echo.
108 | 	echo.Testing of doctests in the sources finished, look at the ^
109 | results in %BUILDDIR%/doctest/output.txt.
110 | 	goto end
111 | )
112 | 
113 | :end
114 | 


--------------------------------------------------------------------------------
/doc/notebooks/.gitignore:
--------------------------------------------------------------------------------
1 | *.v2.ipynb


--------------------------------------------------------------------------------
/doc/notebooks/01_datasets.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "metadata": {
  3 |   "name": "01_datasets"
  4 |  },
  5 |  "nbformat": 3,
  6 |  "nbformat_minor": 0,
  7 |  "worksheets": [
  8 |   {
  9 |    "cells": [
 10 |     {
 11 |      "cell_type": "heading",
 12 |      "level": 1,
 13 |      "metadata": {},
 14 |      "source": [
 15 |       "Loading Datasets with scikit-learn"
 16 |      ]
 17 |     },
 18 |     {
 19 |      "cell_type": "markdown",
 20 |      "metadata": {},
 21 |      "source": [
 22 |       "For the intro portion of this tutorial, we'll be loading several dataset examples.  Scikit-learn has methods to access several datasets: we'll explore two of these here."
 23 |      ]
 24 |     },
 25 |     {
 26 |      "cell_type": "heading",
 27 |      "level": 2,
 28 |      "metadata": {},
 29 |      "source": [
 30 |       "Loading Iris Data"
 31 |      ]
 32 |     },
 33 |     {
 34 |      "cell_type": "markdown",
 35 |      "metadata": {},
 36 |      "source": [
 37 |       "The machine learning community often uses a simple flowers database where each row in the database (or CSV file) is a set of measurements of an individual iris flower. Each sample in this dataset is described by 4 features and can belong to one of the target classes:\n",
 38 |       "\n",
 39 |       "- Features in the Iris dataset:\n",
 40 |       "\n",
 41 |       "  1. sepal length in cm\n",
 42 |       "  2. sepal width in cm\n",
 43 |       "  3. petal length in cm\n",
 44 |       "  4. petal width in cm\n",
 45 |       "\n",
 46 |       "- Target classes to predict:\n",
 47 |       "\n",
 48 |       "  1. Iris Setosa\n",
 49 |       "  2. Iris Versicolour\n",
 50 |       "  3. Iris Virginica"
 51 |      ]
 52 |     },
 53 |     {
 54 |      "cell_type": "markdown",
 55 |      "metadata": {},
 56 |      "source": [
 57 |       "``scikit-learn`` embeds a copy of the iris CSV file along with a helper function to load it into numpy arrays:"
 58 |      ]
 59 |     },
 60 |     {
 61 |      "cell_type": "code",
 62 |      "collapsed": false,
 63 |      "input": [
 64 |       "from sklearn.datasets import load_iris\n",
 65 |       "iris = load_iris()"
 66 |      ],
 67 |      "language": "python",
 68 |      "metadata": {},
 69 |      "outputs": []
 70 |     },
 71 |     {
 72 |      "cell_type": "markdown",
 73 |      "metadata": {},
 74 |      "source": [
 75 |       "The features of each sample flower are stored in the ``data`` attribute of the dataset:"
 76 |      ]
 77 |     },
 78 |     {
 79 |      "cell_type": "code",
 80 |      "collapsed": false,
 81 |      "input": [
 82 |       "n_samples, n_features = iris.data.shape\n",
 83 |       "print n_samples\n",
 84 |       "print n_features\n",
 85 |       "print iris.data[0]"
 86 |      ],
 87 |      "language": "python",
 88 |      "metadata": {},
 89 |      "outputs": []
 90 |     },
 91 |     {
 92 |      "cell_type": "markdown",
 93 |      "metadata": {},
 94 |      "source": [
 95 |       "The information about the class of each sample is stored in the ``target`` attribute of the dataset:"
 96 |      ]
 97 |     },
 98 |     {
 99 |      "cell_type": "code",
100 |      "collapsed": false,
101 |      "input": [
102 |       "len(iris.data) == len(iris.target)"
103 |      ],
104 |      "language": "python",
105 |      "metadata": {},
106 |      "outputs": []
107 |     },
108 |     {
109 |      "cell_type": "code",
110 |      "collapsed": false,
111 |      "input": [
112 |       "iris.target"
113 |      ],
114 |      "language": "python",
115 |      "metadata": {},
116 |      "outputs": []
117 |     },
118 |     {
119 |      "cell_type": "markdown",
120 |      "metadata": {},
121 |      "source": [
122 |       "The names of the classes are stored in the last attribute, namely ``target_names``:"
123 |      ]
124 |     },
125 |     {
126 |      "cell_type": "code",
127 |      "collapsed": false,
128 |      "input": [
129 |       "list(iris.target_names)"
130 |      ],
131 |      "language": "python",
132 |      "metadata": {},
133 |      "outputs": []
134 |     },
135 |     {
136 |      "cell_type": "markdown",
137 |      "metadata": {},
138 |      "source": [
139 |       "The data downloaded from the iris dataset is stored locally, within a subdirectory of your home directory.  You can use the following to determine where it is:"
140 |      ]
141 |     },
142 |     {
143 |      "cell_type": "code",
144 |      "collapsed": false,
145 |      "input": [
146 |       "from sklearn.datasets import get_data_home\n",
147 |       "get_data_home()"
148 |      ],
149 |      "language": "python",
150 |      "metadata": {},
151 |      "outputs": []
152 |     },
153 |     {
154 |      "cell_type": "markdown",
155 |      "metadata": {},
156 |      "source": [
157 |       "Take a moment now to examine this directory and see that the iris data is stored there.  You may also be curious about other datasets which are available.  These can be found in ``sklearn.datasets``."
158 |      ]
159 |     },
160 |     {
161 |      "cell_type": "code",
162 |      "collapsed": false,
163 |      "input": [
164 |       "from sklearn import datasets"
165 |      ],
166 |      "language": "python",
167 |      "metadata": {},
168 |      "outputs": []
169 |     },
170 |     {
171 |      "cell_type": "markdown",
172 |      "metadata": {},
173 |      "source": [
174 |       "You can see which datasets are available by using ipython's tab-completion feature.  Simply type\n",
175 |       "\n",
176 |       "   ``datasets.fetch_``\n",
177 |       "\n",
178 |       "or\n",
179 |       "\n",
180 |       "   ``datasets.load_``\n",
181 |       "\n",
182 |       "and then press the tab key.  This will give you a drop-down menu which lists all the datasets that can be fetched."
183 |      ]
184 |     },
185 |     {
186 |      "cell_type": "code",
187 |      "collapsed": false,
188 |      "input": [],
189 |      "language": "python",
190 |      "metadata": {},
191 |      "outputs": []
192 |     },
193 |     {
194 |      "cell_type": "markdown",
195 |      "metadata": {},
196 |      "source": [
197 |       "Be warned: many of these datasets are quite large!  If you start a download and you want to kill it, you can use ipython's \"kernel interrupt\" feature, available in the menu or using the shortcut ``Ctrl-m i``.\n",
198 |       "\n",
199 |       "(You can press ``Ctrl-m h`` for a list of all ``ipython`` keyboard shortcuts)."
200 |      ]
201 |     },
202 |     {
203 |      "cell_type": "heading",
204 |      "level": 2,
205 |      "metadata": {},
206 |      "source": [
207 |       "Loading Digits Data"
208 |      ]
209 |     },
210 |     {
211 |      "cell_type": "markdown",
212 |      "metadata": {},
213 |      "source": [
214 |       "Now we'll take a look at another dataset, one where we have to put a bit more thought into how to represent the data."
215 |      ]
216 |     },
217 |     {
218 |      "cell_type": "code",
219 |      "collapsed": false,
220 |      "input": [
221 |       "from sklearn.datasets import load_digits\n",
222 |       "digits = load_digits()\n",
223 |       "\n",
224 |       "n_samples, n_features = digits.data.shape\n",
225 |       "print (n_samples, n_features)"
226 |      ],
227 |      "language": "python",
228 |      "metadata": {},
229 |      "outputs": []
230 |     },
231 |     {
232 |      "cell_type": "markdown",
233 |      "metadata": {},
234 |      "source": [
235 |       "Let's take a look at the data.  As with the iris data, we can access the information as follows:"
236 |      ]
237 |     },
238 |     {
239 |      "cell_type": "code",
240 |      "collapsed": false,
241 |      "input": [
242 |       "print digits.data[0]\n",
243 |       "print digits.target"
244 |      ],
245 |      "language": "python",
246 |      "metadata": {},
247 |      "outputs": []
248 |     },
249 |     {
250 |      "cell_type": "markdown",
251 |      "metadata": {},
252 |      "source": [
253 |       "Each sample has 64 features, representing a hand-written digit.  We can plot the images these features represent to gain more insight.\n",
254 |       "\n",
255 |       "We want to plot figures using pylab: we'll use the following command to make sure the figures appear in-line (this only works within ipython notebook):\n"
256 |      ]
257 |     },
258 |     {
259 |      "cell_type": "code",
260 |      "collapsed": false,
261 |      "input": [
262 |       "%pylab inline"
263 |      ],
264 |      "language": "python",
265 |      "metadata": {},
266 |      "outputs": []
267 |     },
268 |     {
269 |      "cell_type": "markdown",
270 |      "metadata": {},
271 |      "source": [
272 |       "We can access the digits data in the same way as the iris data above.  Let's plot a sample of the digits"
273 |      ]
274 |     },
275 |     {
276 |      "cell_type": "code",
277 |      "collapsed": false,
278 |      "input": [
279 |       "import pylab as pl\n",
280 |       "\n",
281 |       "# set up the figure\n",
282 |       "fig = pl.figure(figsize=(8, 8))  # figure size in inches\n",
283 |       "fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)\n",
284 |       "\n",
285 |       "# plot the digits: each image is 8x8 pixels\n",
286 |       "for i in range(100):\n",
287 |       "    ax = fig.add_subplot(10, 10, i + 1, xticks=[], yticks=[])\n",
288 |       "    ax.imshow(digits.data[i].reshape((8, 8)), cmap=pl.cm.binary)\n",
289 |       "    \n",
290 |       "    # label the image with the target value\n",
291 |       "    ax.text(0, 7, str(digits.target[i]), bbox=dict(facecolor='white', edgecolor='none', pad=1))"
292 |      ],
293 |      "language": "python",
294 |      "metadata": {},
295 |      "outputs": []
296 |     },
297 |     {
298 |      "cell_type": "markdown",
299 |      "metadata": {},
300 |      "source": [
301 |       "Notice that we are representing each two-dimensional array of pixels as a single vector.  This **data representation** is a very important aspect of machine learning.  All of the algorithms in scikit-learn accept data in a matrix format, of size ``[n_samples`` $\\times$ ``n_features]``.\n",
302 |       "\n",
303 |       "With the digits data, we saw above that ``n_samples = 1797``, and ``n_features = 64``: one integer-valued feature for each pixel."
304 |      ]
305 |     }
306 |    ],
307 |    "metadata": {}
308 |   }
309 |  ]
310 | }


--------------------------------------------------------------------------------
/doc/notebooks/03_iris_dimensionality.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "metadata": {
  3 |   "name": "03_iris_dimensionality"
  4 |  },
  5 |  "nbformat": 3,
  6 |  "nbformat_minor": 0,
  7 |  "worksheets": [
  8 |   {
  9 |    "cells": [
 10 |     {
 11 |      "cell_type": "heading",
 12 |      "level": 1,
 13 |      "metadata": {},
 14 |      "source": [
 15 |       "Dimensionality Reduction and Visualization"
 16 |      ]
 17 |     },
 18 |     {
 19 |      "cell_type": "markdown",
 20 |      "metadata": {},
 21 |      "source": [
 22 |       "Dimensionality reduction is the task of deriving a set of new\n",
 23 |       "artificial features that is smaller than the original feature\n",
 24 |       "set while retaining most of the variance of the original data.\n",
 25 |       "Here we'll use a common but powerful dimensionality reduction\n",
 26 |       "technique called Principal Component Analysis (PCA).\n",
 27 |       "We'll perform PCA on the iris dataset that we saw before:"
 28 |      ]
 29 |     },
 30 |     {
 31 |      "cell_type": "code",
 32 |      "collapsed": false,
 33 |      "input": [
 34 |       "from sklearn.datasets import load_iris\n",
 35 |       "iris = load_iris()\n",
 36 |       "X = iris.data\n",
 37 |       "y = iris.target"
 38 |      ],
 39 |      "language": "python",
 40 |      "metadata": {},
 41 |      "outputs": []
 42 |     },
 43 |     {
 44 |      "cell_type": "markdown",
 45 |      "metadata": {},
 46 |      "source": [
 47 |       "PCA is performed using linear combinations of the original features\n",
 48 |       "using a truncated Singular Value Decomposition of the matrix X so\n",
 49 |       "as to project the data onto a base of the top singular vectors.\n",
 50 |       "If the number of retained components is 2 or 3, PCA can be used\n",
 51 |       "to visualize the dataset:"
 52 |      ]
 53 |     },
 54 |     {
 55 |      "cell_type": "code",
 56 |      "collapsed": false,
 57 |      "input": [
 58 |       "from sklearn.decomposition import PCA\n",
 59 |       "pca = PCA(n_components=2, whiten=True).fit(X)"
 60 |      ],
 61 |      "language": "python",
 62 |      "metadata": {},
 63 |      "outputs": []
 64 |     },
 65 |     {
 66 |      "cell_type": "markdown",
 67 |      "metadata": {},
 68 |      "source": [
 69 |       "Once fitted, the pca model exposes the singular vectors in the components_ attribute:"
 70 |      ]
 71 |     },
 72 |     {
 73 |      "cell_type": "code",
 74 |      "collapsed": false,
 75 |      "input": [
 76 |       "pca.components_                           "
 77 |      ],
 78 |      "language": "python",
 79 |      "metadata": {},
 80 |      "outputs": []
 81 |     },
 82 |     {
 83 |      "cell_type": "code",
 84 |      "collapsed": false,
 85 |      "input": [
 86 |       "pca.explained_variance_ratio_"
 87 |      ],
 88 |      "language": "python",
 89 |      "metadata": {},
 90 |      "outputs": []
 91 |     },
 92 |     {
 93 |      "cell_type": "code",
 94 |      "collapsed": false,
 95 |      "input": [
 96 |       "pca.explained_variance_ratio_.sum()"
 97 |      ],
 98 |      "language": "python",
 99 |      "metadata": {},
100 |      "outputs": []
101 |     },
102 |     {
103 |      "cell_type": "markdown",
104 |      "metadata": {},
105 |      "source": [
106 |       "Let us project the iris dataset along those first two dimensions:"
107 |      ]
108 |     },
109 |     {
110 |      "cell_type": "code",
111 |      "collapsed": false,
112 |      "input": [
113 |       "X_pca = pca.transform(X)"
114 |      ],
115 |      "language": "python",
116 |      "metadata": {},
117 |      "outputs": []
118 |     },
119 |     {
120 |      "cell_type": "markdown",
121 |      "metadata": {},
122 |      "source": [
123 |       "The dataset has been \u201cnormalized\u201d, which means that the data\n",
124 |       "is now centered on both components with unit variance:"
125 |      ]
126 |     },
127 |     {
128 |      "cell_type": "code",
129 |      "collapsed": false,
130 |      "input": [
131 |       "import numpy as np\n",
132 |       "np.round(X_pca.mean(axis=0), decimals=5)"
133 |      ],
134 |      "language": "python",
135 |      "metadata": {},
136 |      "outputs": []
137 |     },
138 |     {
139 |      "cell_type": "code",
140 |      "collapsed": false,
141 |      "input": [
142 |       "np.round(X_pca.std(axis=0), decimals=5)"
143 |      ],
144 |      "language": "python",
145 |      "metadata": {},
146 |      "outputs": []
147 |     },
148 |     {
149 |      "cell_type": "markdown",
150 |      "metadata": {},
151 |      "source": [
152 |       "Furthermore the samples components do no longer carry any linear correlation:"
153 |      ]
154 |     },
155 |     {
156 |      "cell_type": "code",
157 |      "collapsed": false,
158 |      "input": [
159 |       "np.round(np.corrcoef(X_pca.T), decimals=5)"
160 |      ],
161 |      "language": "python",
162 |      "metadata": {},
163 |      "outputs": []
164 |     },
165 |     {
166 |      "cell_type": "markdown",
167 |      "metadata": {},
168 |      "source": [
169 |       "We can visualize the projection using pylab, but first\n",
170 |       "let's make sure our ipython notebook is in pylab inline mode"
171 |      ]
172 |     },
173 |     {
174 |      "cell_type": "code",
175 |      "collapsed": false,
176 |      "input": [
177 |       "%pylab inline"
178 |      ],
179 |      "language": "python",
180 |      "metadata": {},
181 |      "outputs": []
182 |     },
183 |     {
184 |      "cell_type": "markdown",
185 |      "metadata": {},
186 |      "source": [
187 |       "Now we can visualize the results using the following utility function:"
188 |      ]
189 |     },
190 |     {
191 |      "cell_type": "code",
192 |      "collapsed": false,
193 |      "input": [
194 |       "import pylab as pl\n",
195 |       "from itertools import cycle\n",
196 |       "\n",
197 |       "def plot_PCA_2D(data, target, target_names):\n",
198 |       "    colors = cycle('rgbcmykw')\n",
199 |       "    target_ids = range(len(target_names))\n",
200 |       "    pl.figure()\n",
201 |       "    for i, c, label in zip(target_ids, colors, target_names):\n",
202 |       "        pl.scatter(data[target == i, 0], data[target == i, 1],\n",
203 |       "                   c=c, label=label)\n",
204 |       "    pl.legend()"
205 |      ],
206 |      "language": "python",
207 |      "metadata": {},
208 |      "outputs": []
209 |     },
210 |     {
211 |      "cell_type": "markdown",
212 |      "metadata": {},
213 |      "source": [
214 |       "Now calling this function for our data, we see the plot:"
215 |      ]
216 |     },
217 |     {
218 |      "cell_type": "code",
219 |      "collapsed": false,
220 |      "input": [
221 |       "plot_PCA_2D(X_pca, iris.target, iris.target_names)"
222 |      ],
223 |      "language": "python",
224 |      "metadata": {},
225 |      "outputs": []
226 |     },
227 |     {
228 |      "cell_type": "markdown",
229 |      "metadata": {},
230 |      "source": [
231 |       "Note that this projection was determined *without* any information about the\n",
232 |       "labels (represented by the colors): this is the sense in which the learning\n",
233 |       "is unsupervised.  Nevertheless, we see that the projection gives us insight\n",
234 |       "into the distribution of the different flowers in parameter space: notably,\n",
235 |       "*iris setosa* is much more distinct than the other two species."
236 |      ]
237 |     },
238 |     {
239 |      "cell_type": "markdown",
240 |      "metadata": {},
241 |      "source": [
242 |       "Note also that the default implementation of PCA computes the SVD of the full\n",
243 |       "data matrix, which is not scalable when both ``n_samples`` and\n",
244 |       "``n_features`` are big (more that a few thousands).\n",
245 |       "If you are interested in a number of components that is much\n",
246 |       "smaller than both ``n_samples`` and ``n_features``, consider using\n",
247 |       ":class:`sklearn.decomposition.RandomizedPCA` instead."
248 |      ]
249 |     },
250 |     {
251 |      "cell_type": "heading",
252 |      "level": 3,
253 |      "metadata": {},
254 |      "source": [
255 |       "Exercise:"
256 |      ]
257 |     },
258 |     {
259 |      "cell_type": "markdown",
260 |      "metadata": {},
261 |      "source": [
262 |       "Repeat the above dimensionality reduction with\n",
263 |       "``sklearn.decomposition.RandomizedPCA``.\n",
264 |       "\n",
265 |       "You can re-use the ``plot_PCA_2D`` function from above.\n",
266 |       "Are the results similar to those from standard PCA?"
267 |      ]
268 |     },
269 |     {
270 |      "cell_type": "code",
271 |      "collapsed": false,
272 |      "input": [
273 |       "from sklearn.decomposition import RandomizedPCA\n",
274 |       "#apply randomized PCA to the iris data as above, and plot the result."
275 |      ],
276 |      "language": "python",
277 |      "metadata": {},
278 |      "outputs": []
279 |     }
280 |    ],
281 |    "metadata": {}
282 |   }
283 |  ]
284 | }


--------------------------------------------------------------------------------
/doc/notebooks/04_iris_clustering.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "metadata": {
  3 |   "name": "04_iris_clustering"
  4 |  },
  5 |  "nbformat": 3,
  6 |  "nbformat_minor": 0,
  7 |  "worksheets": [
  8 |   {
  9 |    "cells": [
 10 |     {
 11 |      "cell_type": "heading",
 12 |      "level": 1,
 13 |      "metadata": {},
 14 |      "source": [
 15 |       "Clustering of Iris Data"
 16 |      ]
 17 |     },
 18 |     {
 19 |      "cell_type": "markdown",
 20 |      "metadata": {},
 21 |      "source": [
 22 |       "Clustering is the task of gathering samples into groups of similar\n",
 23 |       "samples according to some predefined similarity or dissimilarity\n",
 24 |       "measure (such as the Euclidean distance).\n",
 25 |       "\n",
 26 |       "Let's re-use the results of the 2D PCA of the iris dataset in order to\n",
 27 |       "explore clustering.  First we need to repeat some of the code from the\n",
 28 |       "previous notebook"
 29 |      ]
 30 |     },
 31 |     {
 32 |      "cell_type": "code",
 33 |      "collapsed": false,
 34 |      "input": [
 35 |       "# make sure ipython inline mode is activated\n",
 36 |       "%pylab inline"
 37 |      ],
 38 |      "language": "python",
 39 |      "metadata": {},
 40 |      "outputs": []
 41 |     },
 42 |     {
 43 |      "cell_type": "code",
 44 |      "collapsed": false,
 45 |      "input": [
 46 |       "# all of this is taken from the notebook '03_iris_dimensionality.ipynb' \n",
 47 |       "from sklearn.datasets import load_iris\n",
 48 |       "from sklearn.decomposition import PCA\n",
 49 |       "import pylab as pl\n",
 50 |       "from itertools import cycle\n",
 51 |       "\n",
 52 |       "iris = load_iris()\n",
 53 |       "X = iris.data\n",
 54 |       "y = iris.target\n",
 55 |       "\n",
 56 |       "pca = PCA(n_components=2, whiten=True).fit(X)\n",
 57 |       "X_pca = pca.transform(X)\n",
 58 |       "\n",
 59 |       "def plot_2D(data, target, target_names):\n",
 60 |       "    colors = cycle('rgbcmykw')\n",
 61 |       "    target_ids = range(len(target_names))\n",
 62 |       "    pl.figure()\n",
 63 |       "    for i, c, label in zip(target_ids, colors, target_names):\n",
 64 |       "        pl.scatter(data[target == i, 0], data[target == i, 1],\n",
 65 |       "                   c=c, label=label)\n",
 66 |       "    pl.legend()"
 67 |      ],
 68 |      "language": "python",
 69 |      "metadata": {},
 70 |      "outputs": []
 71 |     },
 72 |     {
 73 |      "cell_type": "markdown",
 74 |      "metadata": {},
 75 |      "source": [
 76 |       "Now we will use one of the simplest clustering algorithms, K-means.\n",
 77 |       "This is an iterative algorithm which searches for three cluster\n",
 78 |       "centers such that the distance from each point to its cluster is\n",
 79 |       "minimizied."
 80 |      ]
 81 |     },
 82 |     {
 83 |      "cell_type": "code",
 84 |      "collapsed": false,
 85 |      "input": [
 86 |       "from sklearn.cluster import KMeans\n",
 87 |       "from numpy.random import RandomState\n",
 88 |       "rng = RandomState(42)\n",
 89 |       "\n",
 90 |       "kmeans = KMeans(n_clusters=3, random_state=rng).fit(X_pca)"
 91 |      ],
 92 |      "language": "python",
 93 |      "metadata": {},
 94 |      "outputs": []
 95 |     },
 96 |     {
 97 |      "cell_type": "code",
 98 |      "collapsed": false,
 99 |      "input": [
100 |       "import numpy as np\n",
101 |       "np.round(kmeans.cluster_centers_, decimals=2)"
102 |      ],
103 |      "language": "python",
104 |      "metadata": {},
105 |      "outputs": []
106 |     },
107 |     {
108 |      "cell_type": "code",
109 |      "collapsed": false,
110 |      "input": [
111 |       "kmeans.labels_[:10]"
112 |      ],
113 |      "language": "python",
114 |      "metadata": {},
115 |      "outputs": []
116 |     },
117 |     {
118 |      "cell_type": "code",
119 |      "collapsed": false,
120 |      "input": [
121 |       "kmeans.labels_[-10:]"
122 |      ],
123 |      "language": "python",
124 |      "metadata": {},
125 |      "outputs": []
126 |     },
127 |     {
128 |      "cell_type": "markdown",
129 |      "metadata": {},
130 |      "source": [
131 |       "The K-means algorithm has been used to infer cluster labels for the\n",
132 |       "points.  Let's call the ``plot_2D`` function again, but color the points\n",
133 |       "based on the cluster labels rather than the iris species."
134 |      ]
135 |     },
136 |     {
137 |      "cell_type": "code",
138 |      "collapsed": false,
139 |      "input": [
140 |       "plot_2D(X_pca, kmeans.labels_, [\"c0\", \"c1\", \"c2\"])\n",
141 |       "\n",
142 |       "plot_2D(X_pca, iris.target, iris.target_names)"
143 |      ],
144 |      "language": "python",
145 |      "metadata": {},
146 |      "outputs": []
147 |     },
148 |     {
149 |      "cell_type": "heading",
150 |      "level": 3,
151 |      "metadata": {},
152 |      "source": [
153 |       "Exercise"
154 |      ]
155 |     },
156 |     {
157 |      "cell_type": "markdown",
158 |      "metadata": {},
159 |      "source": [
160 |       "Perform the K-Means cluster search again, but this time learn the\n",
161 |       "clusters using the full data matrix ``X``, rather than the projected\n",
162 |       "matrix ``X_pca``.  Does this change the results?  Do these labels\n",
163 |       "look closer to the true labels?"
164 |      ]
165 |     },
166 |     {
167 |      "cell_type": "code",
168 |      "collapsed": false,
169 |      "input": [],
170 |      "language": "python",
171 |      "metadata": {},
172 |      "outputs": []
173 |     }
174 |    ],
175 |    "metadata": {}
176 |   }
177 |  ]
178 | }


--------------------------------------------------------------------------------
/doc/notebooks/05_iris_crossval.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "metadata": {
  3 |   "name": "05_iris_crossval"
  4 |  },
  5 |  "nbformat": 3,
  6 |  "nbformat_minor": 0,
  7 |  "worksheets": [
  8 |   {
  9 |    "cells": [
 10 |     {
 11 |      "cell_type": "heading",
 12 |      "level": 1,
 13 |      "metadata": {},
 14 |      "source": [
 15 |       "Cross-Validation on the Iris Dataset"
 16 |      ]
 17 |     },
 18 |     {
 19 |      "cell_type": "markdown",
 20 |      "metadata": {},
 21 |      "source": [
 22 |       "Here is an example on you to split the data on the iris dataset."
 23 |      ]
 24 |     },
 25 |     {
 26 |      "cell_type": "markdown",
 27 |      "metadata": {},
 28 |      "source": [
 29 |       "Let's re-use the results of the 2D PCA of the iris dataset\n",
 30 |       "in order to explore clustering. First we need to repeat\n",
 31 |       "some of the code from the previous notebook:"
 32 |      ]
 33 |     },
 34 |     {
 35 |      "cell_type": "code",
 36 |      "collapsed": false,
 37 |      "input": [
 38 |       "# all of this is taken from the notebook '04_iris_clustering.ipynb'\n",
 39 |       "import numpy as np\n",
 40 |       "from sklearn.datasets import load_iris\n",
 41 |       "\n",
 42 |       "iris = load_iris()\n",
 43 |       "X = iris.data\n",
 44 |       "y = iris.target\n",
 45 |       "\n",
 46 |       "n_samples, n_features = iris.data.shape\n",
 47 |       "print n_samples"
 48 |      ],
 49 |      "language": "python",
 50 |      "metadata": {},
 51 |      "outputs": []
 52 |     },
 53 |     {
 54 |      "cell_type": "markdown",
 55 |      "metadata": {},
 56 |      "source": [
 57 |       "First we need to shuffle the order of the samples and the\n",
 58 |       "target to ensure that all classes are well represented on\n",
 59 |       "both sides of the split:"
 60 |      ]
 61 |     },
 62 |     {
 63 |      "cell_type": "code",
 64 |      "collapsed": false,
 65 |      "input": [
 66 |       "indices = np.arange(n_samples)\n",
 67 |       "indices[:10]"
 68 |      ],
 69 |      "language": "python",
 70 |      "metadata": {},
 71 |      "outputs": []
 72 |     },
 73 |     {
 74 |      "cell_type": "code",
 75 |      "collapsed": false,
 76 |      "input": [
 77 |       "np.random.RandomState(42).shuffle(indices)\n",
 78 |       "indices[:10]"
 79 |      ],
 80 |      "language": "python",
 81 |      "metadata": {},
 82 |      "outputs": []
 83 |     },
 84 |     {
 85 |      "cell_type": "code",
 86 |      "collapsed": false,
 87 |      "input": [
 88 |       "X = iris.data[indices]\n",
 89 |       "y = iris.target[indices]"
 90 |      ],
 91 |      "language": "python",
 92 |      "metadata": {},
 93 |      "outputs": []
 94 |     },
 95 |     {
 96 |      "cell_type": "markdown",
 97 |      "metadata": {},
 98 |      "source": [
 99 |       "We can now split the data using a 2/3 - 1/3 ratio:"
100 |      ]
101 |     },
102 |     {
103 |      "cell_type": "code",
104 |      "collapsed": false,
105 |      "input": [
106 |       "split = (n_samples * 2) / 3\n",
107 |       "\n",
108 |       "X_train, X_test = X[:split], X[split:]\n",
109 |       "y_train, y_test = y[:split], y[split:]\n",
110 |       "\n",
111 |       "X_train.shape"
112 |      ],
113 |      "language": "python",
114 |      "metadata": {},
115 |      "outputs": []
116 |     },
117 |     {
118 |      "cell_type": "code",
119 |      "collapsed": false,
120 |      "input": [
121 |       "X_test.shape"
122 |      ],
123 |      "language": "python",
124 |      "metadata": {},
125 |      "outputs": []
126 |     },
127 |     {
128 |      "cell_type": "code",
129 |      "collapsed": false,
130 |      "input": [
131 |       "y_train.shape"
132 |      ],
133 |      "language": "python",
134 |      "metadata": {},
135 |      "outputs": []
136 |     },
137 |     {
138 |      "cell_type": "code",
139 |      "collapsed": false,
140 |      "input": [
141 |       "y_test.shape"
142 |      ],
143 |      "language": "python",
144 |      "metadata": {},
145 |      "outputs": []
146 |     },
147 |     {
148 |      "cell_type": "markdown",
149 |      "metadata": {},
150 |      "source": [
151 |       "We can now re-train a new linear classifier on the training set only:"
152 |      ]
153 |     },
154 |     {
155 |      "cell_type": "code",
156 |      "collapsed": false,
157 |      "input": [
158 |       "from sklearn.svm import LinearSVC\n",
159 |       "clf = LinearSVC().fit(X_train, y_train)"
160 |      ],
161 |      "language": "python",
162 |      "metadata": {},
163 |      "outputs": []
164 |     },
165 |     {
166 |      "cell_type": "markdown",
167 |      "metadata": {},
168 |      "source": [
169 |       "To evaluate its quality we can compute the average number\n",
170 |       "of correct classifications on the test set:"
171 |      ]
172 |     },
173 |     {
174 |      "cell_type": "code",
175 |      "collapsed": false,
176 |      "input": [
177 |       "np.mean(clf.predict(X_test) == y_test)"
178 |      ],
179 |      "language": "python",
180 |      "metadata": {},
181 |      "outputs": []
182 |     },
183 |     {
184 |      "cell_type": "markdown",
185 |      "metadata": {},
186 |      "source": [
187 |       "This shows that the model has a predictive accurracy of 100%\n",
188 |       "which means that the classification model was perfectly capable\n",
189 |       "of generalizing what was learned from the training set to the\n",
190 |       "test set: this is rarely so easy on real life datasets as we\n",
191 |       "will see in the later sections."
192 |      ]
193 |     }
194 |    ],
195 |    "metadata": {}
196 |   }
197 |  ]
198 | }


--------------------------------------------------------------------------------
/doc/notebooks/08_regression_example.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "metadata": {
  3 |   "name": "08_regression_example"
  4 |  },
  5 |  "nbformat": 3,
  6 |  "nbformat_minor": 0,
  7 |  "worksheets": [
  8 |   {
  9 |    "cells": [
 10 |     {
 11 |      "cell_type": "heading",
 12 |      "level": 1,
 13 |      "metadata": {},
 14 |      "source": [
 15 |       "Regression Example"
 16 |      ]
 17 |     },
 18 |     {
 19 |      "cell_type": "markdown",
 20 |      "metadata": {},
 21 |      "source": [
 22 |       "As with the previous example, this notebook assumes that the current\n",
 23 |       "working directory is in the scikit-learn tutorial directory where\n",
 24 |       "the notebook is stored. In the folder\n",
 25 |       "\n",
 26 |       "     ../data/sdss_photoz\n",
 27 |       "\n",
 28 |       "there is a script fetch_data.py which will download the colors of 400,000+ galaxies from the Sloan Digital Sky Survey. This script also includes a python implementation of the SQL query used to construct this data. This template can be modified to download more features if desired. Before executing the example below, run fetch_data.py to download the colors and redshifts.\n",
 29 |       "\n",
 30 |       "If you're using a different directory structure, then the DATA_HOME variable in the following script should be set accordingly."
 31 |      ]
 32 |     },
 33 |     {
 34 |      "cell_type": "code",
 35 |      "collapsed": false,
 36 |      "input": [
 37 |       "import os\n",
 38 |       "DATA_HOME = os.path.abspath('../data/sdss_photoz/')"
 39 |      ],
 40 |      "language": "python",
 41 |      "metadata": {},
 42 |      "outputs": []
 43 |     },
 44 |     {
 45 |      "cell_type": "markdown",
 46 |      "metadata": {},
 47 |      "source": [
 48 |       "First we will load this data, shuffle it in preparation for later, and arrange the colors in an array of shape (n_samples, n_features):"
 49 |      ]
 50 |     },
 51 |     {
 52 |      "cell_type": "code",
 53 |      "collapsed": false,
 54 |      "input": [
 55 |       "import numpy as np\n",
 56 |       "data = np.load(os.path.join(DATA_HOME,'sdss_photoz.npy'))"
 57 |      ],
 58 |      "language": "python",
 59 |      "metadata": {},
 60 |      "outputs": []
 61 |     },
 62 |     {
 63 |      "cell_type": "markdown",
 64 |      "metadata": {},
 65 |      "source": [
 66 |       "The data is in a record array, as in the classification example"
 67 |      ]
 68 |     },
 69 |     {
 70 |      "cell_type": "code",
 71 |      "collapsed": false,
 72 |      "input": [
 73 |       "print data.dtype.names"
 74 |      ],
 75 |      "language": "python",
 76 |      "metadata": {},
 77 |      "outputs": []
 78 |     },
 79 |     {
 80 |      "cell_type": "markdown",
 81 |      "metadata": {},
 82 |      "source": [
 83 |       "Now we'll set up our data matrix ``X`` and redshift ``z``"
 84 |      ]
 85 |     },
 86 |     {
 87 |      "cell_type": "code",
 88 |      "collapsed": false,
 89 |      "input": [
 90 |       "N = len(data)\n",
 91 |       "X = np.zeros((N, 4))\n",
 92 |       "X[:, 0] = data['u'] - data['g']\n",
 93 |       "X[:, 1] = data['g'] - data['r']\n",
 94 |       "X[:, 2] = data['r'] - data['i']\n",
 95 |       "X[:, 3] = data['i'] - data['z']\n",
 96 |       "z = data['redshift']"
 97 |      ],
 98 |      "language": "python",
 99 |      "metadata": {},
100 |      "outputs": []
101 |     },
102 |     {
103 |      "cell_type": "markdown",
104 |      "metadata": {},
105 |      "source": [
106 |       "Next we\u2019ll split the data into two samples: a training sample and a test sample which we\u2019ll use to evaluate our training:"
107 |      ]
108 |     },
109 |     {
110 |      "cell_type": "code",
111 |      "collapsed": false,
112 |      "input": [
113 |       "Ntrain = 3 * N / 4\n",
114 |       "Xtrain = X[:Ntrain]\n",
115 |       "ztrain = z[:Ntrain]\n",
116 |       "Xtest = X[Ntrain:]\n",
117 |       "ztest = z[Ntrain:]"
118 |      ],
119 |      "language": "python",
120 |      "metadata": {},
121 |      "outputs": []
122 |     },
123 |     {
124 |      "cell_type": "markdown",
125 |      "metadata": {},
126 |      "source": [
127 |       "Now we\u2019ll use the scikit-learn ``DecisionTreeRegressor`` method\n",
128 |       "to train a model and predict redshifts for the test set based\n",
129 |       "on a 20-level decision tree:"
130 |      ]
131 |     },
132 |     {
133 |      "cell_type": "code",
134 |      "collapsed": false,
135 |      "input": [
136 |       "from sklearn.tree import DecisionTreeRegressor\n",
137 |       "clf = DecisionTreeRegressor(max_depth=20)\n",
138 |       "clf.fit(Xtrain, ztrain)\n",
139 |       "zpred = clf.predict(Xtest)"
140 |      ],
141 |      "language": "python",
142 |      "metadata": {},
143 |      "outputs": []
144 |     },
145 |     {
146 |      "cell_type": "markdown",
147 |      "metadata": {},
148 |      "source": [
149 |       "To judge the efficacy of prediction, we can compute the\n",
150 |       "root-mean-square (RMS) difference between the true and predicted values:"
151 |      ]
152 |     },
153 |     {
154 |      "cell_type": "code",
155 |      "collapsed": false,
156 |      "input": [
157 |       "rms = np.sqrt(np.mean((ztest - zpred) ** 2))\n",
158 |       "print rms"
159 |      ],
160 |      "language": "python",
161 |      "metadata": {},
162 |      "outputs": []
163 |     },
164 |     {
165 |      "cell_type": "markdown",
166 |      "metadata": {},
167 |      "source": [
168 |       "Our RMS error is about 0.22. This is pretty good for such an unsophisticated\n",
169 |       "learning algorithm, but better algorithms can improve on this. The biggest\n",
170 |       "issue here are the catastrophic errors, where the predicted redshift is\n",
171 |       "extremely far from the prediction:"
172 |      ]
173 |     },
174 |     {
175 |      "cell_type": "code",
176 |      "collapsed": false,
177 |      "input": [
178 |       "print \"Number of test samples:       \", len(ztest)\n",
179 |       "print \"Number of catastrophic errors:\", np.sum(abs(ztest - zpred) > 1)"
180 |      ],
181 |      "language": "python",
182 |      "metadata": {},
183 |      "outputs": []
184 |     },
185 |     {
186 |      "cell_type": "markdown",
187 |      "metadata": {},
188 |      "source": [
189 |       "About 1.5% of objects have redshift estimates which are off by greater than 1.\n",
190 |       "This sort of error in redshift determination is very problematic for\n",
191 |       "high-precision cosmological studies. This can be seen in a scatter plot of\n",
192 |       "the predicted redshift versus the true redshift for the test data:"
193 |      ]
194 |     },
195 |     {
196 |      "cell_type": "code",
197 |      "collapsed": false,
198 |      "input": [
199 |       "%pylab inline\n",
200 |       "import pylab as pl"
201 |      ],
202 |      "language": "python",
203 |      "metadata": {},
204 |      "outputs": []
205 |     },
206 |     {
207 |      "cell_type": "code",
208 |      "collapsed": false,
209 |      "input": [
210 |       "ax = pl.axes()\n",
211 |       "\n",
212 |       "pl.scatter(ztest, zpred, c='k', lw=0, s=4)\n",
213 |       "axis_lim = np.array([0, 2.5])\n",
214 |       "\n",
215 |       "# plot the true redshift\n",
216 |       "pl.plot(axis_lim, axis_lim, '--k')\n",
217 |       "\n",
218 |       "# plot +/- the rms\n",
219 |       "pl.plot(axis_lim, axis_lim + rms, '--r')  \n",
220 |       "pl.plot(axis_lim, axis_lim - rms, '--r')\n",
221 |       "pl.xlim(axis_lim)\n",
222 |       "pl.ylim(axis_lim)\n",
223 |       "\n",
224 |       "pl.title('Photo-z: Decision Tree Regression')\n",
225 |       "pl.xlabel(r'$\\mathrm{z_{true}}$', fontsize=14)\n",
226 |       "pl.ylabel(r'$\\mathrm{z_{phot}}$', fontsize=14)"
227 |      ],
228 |      "language": "python",
229 |      "metadata": {},
230 |      "outputs": []
231 |     },
232 |     {
233 |      "cell_type": "markdown",
234 |      "metadata": {},
235 |      "source": [
236 |       "The true and predicted redshifts of 102,798 SDSS galaxies, using a simple decision tree regressor. Notice the presece of catastrophic outliers: those galaxies whose predicted redshifts are extremely far from the true value.\n",
237 |       "\n",
238 |       "Later, in Exercise #2, we will attempt to improve on this by optimizing the parameters of the decision tree.\n",
239 |       "\n",
240 |       "In practice, the solutions to the photometric redshift problem can benefit from approaches that use physical intuition as well as machine learning tools. For example, some solutions involve the use of libraries of synthetic galaxy spectra which are known to be representative of the true galaxy distribution. This extra information can be used either directly, in a physically motivated analysis, or can be used to generate a larger suite of artificial training instances for a pure machine learning approach."
241 |      ]
242 |     }
243 |    ],
244 |    "metadata": {}
245 |   }
246 |  ]
247 | }


--------------------------------------------------------------------------------
/doc/notebooks/nbconvert.py:
--------------------------------------------------------------------------------
 1 | """Simple utility script for semi-gracefully downgrading v3 notebooks to v2"""
 2 | 
 3 | import io
 4 | import os
 5 | import sys
 6 | 
 7 | from IPython.nbformat import current
 8 | 
 9 | def heading_to_md(cell):
10 |     """turn heading cell into corresponding markdown"""
11 |     cell.cell_type = "markdown"
12 |     level = cell.pop('level', 1)
13 |     cell.source = '#'*level + ' ' + cell.source
14 | 
15 | def raw_to_md(cell):
16 |     """let raw passthrough as markdown"""
17 |     cell.cell_type = "markdown"
18 | 
19 | def downgrade(nb):
20 |     """downgrade a v3 notebook to v2"""
21 |     if nb.nbformat != 3:
22 |         return nb
23 |     nb.nbformat = 2
24 |     for ws in nb.worksheets:
25 |         for cell in ws.cells:
26 |             if cell.cell_type == 'heading':
27 |                 heading_to_md(cell)
28 |             elif cell.cell_type == 'raw':
29 |                 raw_to_md(cell)
30 |     return nb
31 | 
32 | def downgrade_ipynb(fname):
33 |     base, ext = os.path.splitext(fname)
34 |     newname = base+'.v2'+ext
35 |     print "downgrading %s -> %s" % (fname, newname)
36 |     with io.open(fname, 'r', encoding='utf8') as f:
37 |         nb = current.read(f, 'json')
38 |     nb = downgrade(nb)
39 |     with open(newname, 'w') as f:
40 |         current.write(nb, f, 'json')
41 | 
42 | if __name__ == '__main__':
43 |     map(downgrade_ipynb, sys.argv[1:])
44 | 


--------------------------------------------------------------------------------
/doc/notebooks/soln/01-01.py:
--------------------------------------------------------------------------------
1 | # 01-01.py
2 | clf_0 = gmm.GMM(1, 'diag')
3 | i0 = (y_train == 0)
4 | clf_0.fit(X_train[i0])
5 | 
6 | clf_1 = gmm.GMM(1, 'diag')
7 | i1 = (y_train == 1)
8 | clf_1.fit(X_train[i1])
9 | 


--------------------------------------------------------------------------------
/doc/notebooks/soln/01-02.py:
--------------------------------------------------------------------------------
1 | # 01-02.py
2 | num0 = i0.sum()
3 | num1 = i1.sum()
4 | 
5 | prior0 = num0 / float(Ntrain)
6 | prior1 = num1 / float(Ntrain)
7 | 


--------------------------------------------------------------------------------
/doc/notebooks/soln/01-03.py:
--------------------------------------------------------------------------------
1 | # 01-03.py
2 | logL = np.zeros((2, Ncrossval))
3 | logL[0] = clf_0.score(X_crossval) + np.log(prior0)
4 | logL[1] = clf_1.score(X_crossval) + np.log(prior1)
5 | 


--------------------------------------------------------------------------------
/doc/notebooks/soln/01-04.py:
--------------------------------------------------------------------------------
 1 | # 01-04.py
 2 | def GMMBayes(X_test, n_components, covariance_type):
 3 |     clf_0 = gmm.GMM(n_components, covariance_type, random_state=0)
 4 |     i0 = (y_train == 0)
 5 |     clf_0.fit(X_train[i0])
 6 | 
 7 |     clf_1 = gmm.GMM(n_components, covariance_type, random_state=0)
 8 |     i1 = (y_train == 1)
 9 |     clf_1.fit(X_train[i1])
10 | 
11 |     logL = np.zeros((2, X_test.shape[0]))
12 |     logL[0] = clf_0.score(X_test) + np.log(prior0)
13 |     logL[1] = clf_1.score(X_test) + np.log(prior1)
14 | 
15 |     y_pred = np.argmax(logL, 0)
16 | 
17 |     return y_pred
18 | 


--------------------------------------------------------------------------------
/doc/notebooks/soln/01-05.py:
--------------------------------------------------------------------------------
1 | # 01-05.py
2 | y_pred_gmm = GMMBayes(X_test, 5, 'full')
3 | y_pred_gnb = gnb.predict(X_test)
4 | 


--------------------------------------------------------------------------------
/doc/notebooks/soln/02-01.py:
--------------------------------------------------------------------------------
 1 | # 02-01.py
 2 | for i, max_depth in enumerate(max_depth_array):
 3 |     # print progress update
 4 |     print '%i / %i' % (max_depth, max_depth_array[-1])
 5 | 
 6 |     clf = DecisionTreeRegressor(max_depth=max_depth)
 7 |     clf.fit(X_train, y_train)
 8 | 
 9 |     y_train_pred = clf.predict(X_train)
10 |     y_cv_pred = clf.predict(X_cv)
11 | 
12 |     train_error[i] = compute_rms_error(y_train_pred, y_train)
13 |     cv_error[i] = compute_rms_error(y_cv_pred, y_cv)
14 | 


--------------------------------------------------------------------------------
/doc/notebooks/soln/02-02.py:
--------------------------------------------------------------------------------
 1 | #02-02.py
 2 | for i, n_samples in enumerate(n_samples_array):
 3 |     # print progress update
 4 |     print ' %i / %i' % (n_samples, Ntrain)
 5 | 
 6 |     clf = DecisionTreeRegressor(max_depth=max_depth)
 7 |     clf.fit(X_train[:n_samples], y_train[:n_samples])
 8 | 
 9 |     y_train_pred = clf.predict(X_train[:n_samples])
10 |     y_cv_pred = clf.predict(X_cv)
11 | 
12 |     train_error_2[i] = compute_rms_error(y_train_pred,
13 |                                        y_train[:n_samples])
14 |     cv_error_2[i] = compute_rms_error(y_cv_pred, y_cv)
15 | 


--------------------------------------------------------------------------------
/doc/notebooks/soln/02-03a.py:
--------------------------------------------------------------------------------
 1 | #02-03a.py
 2 | 
 3 | #------------------------------------------------------------
 4 | # first compute and plot the outlier fraction as a function
 5 | # of max_depth
 6 | max_depth_array = np.arange(1, 21)
 7 | train_error = np.zeros(len(max_depth_array))
 8 | cv_error = np.zeros(len(max_depth_array))
 9 | 
10 | for i, max_depth in enumerate(max_depth_array):
11 |     # print progress update
12 |     print '%i / %i' % (max_depth, max_depth_array[-1])
13 | 
14 |     clf = DecisionTreeRegressor(max_depth=max_depth)
15 |     clf.fit(X_train, y_train)
16 | 
17 |     y_train_pred = clf.predict(X_train)
18 |     y_cv_pred = clf.predict(X_cv)
19 | 
20 |     train_error[i] = compute_outlier_fraction(y_train_pred, y_train)
21 |     cv_error[i] = compute_outlier_fraction(y_cv_pred, y_cv)
22 | 
23 | pl.figure()
24 | pl.plot(max_depth_array, cv_error, label='cross-val error')
25 | pl.plot(max_depth_array, train_error, label='training error')
26 | 
27 | pl.legend(loc=0)
28 | pl.xlabel('max depth')
29 | pl.ylabel('error')
30 | 
31 | # select the value of max_depth which led to the best results
32 | max_depth = max_depth_array[np.argmin(cv_error)]
33 | print "max_depth = %i" % max_depth
34 | 


--------------------------------------------------------------------------------
/doc/notebooks/soln/02-03b.py:
--------------------------------------------------------------------------------
 1 | #02-03b.py
 2 | 
 3 | #------------------------------------------------------------
 4 | # compute and plot the outlier fraction
 5 | # as a function of number of samples
 6 | n_samples_array = np.linspace(50, Ntrain, 20).astype(int)
 7 | train_error_2 = np.zeros(n_samples_array.shape)
 8 | cv_error_2 = np.zeros(n_samples_array.shape)
 9 | 
10 | for i, n_samples in enumerate(n_samples_array):
11 |     # print progress update
12 |     print ' %i / %i' % (n_samples, Ntrain)
13 | 
14 |     clf = DecisionTreeRegressor(max_depth=max_depth)
15 |     clf.fit(X_train[:n_samples], y_train[:n_samples])
16 | 
17 |     y_train_pred = clf.predict(X_train[:n_samples])
18 |     y_cv_pred = clf.predict(X_cv)
19 | 
20 |     train_error_2[i] = compute_outlier_fraction(y_train_pred,
21 |                                        y_train[:n_samples])
22 |     cv_error_2[i] = compute_outlier_fraction(y_cv_pred, y_cv)
23 |     
24 | pl.figure()
25 | pl.plot(n_samples_array, cv_error_2, label='cross-val error')
26 | pl.plot(n_samples_array, train_error_2, label='training error')
27 | 
28 | pl.legend(loc=0)
29 | pl.xlabel('number of samples')
30 | pl.ylabel('error')
31 | 
32 | pl.title('max_depth = %s' % max_depth)
33 | 


--------------------------------------------------------------------------------
/doc/notebooks/soln/03-01.py:
--------------------------------------------------------------------------------
 1 | #03-01.py
 2 | X, y = preprocess(data, shuffle=False, n_samples=1000, normalization=None)
 3 | 
 4 | from sklearn.manifold import LocallyLinearEmbedding
 5 | lle = LocallyLinearEmbedding(n_neighbors=15,
 6 |                              n_components=3, method='standard')
 7 | X_proj = lle.fit_transform(X)
 8 | 
 9 | three_component_plot(X_proj[:, 0], X_proj[:, 1], X_proj[:, 2], y, labels, trim_outliers=True)
10 | 


--------------------------------------------------------------------------------
/doc/notebooks/soln/03-02.py:
--------------------------------------------------------------------------------
 1 | #03-02.py
 2 | X, y = preprocess(data, shuffle=False, n_samples=1000, normalization=None)
 3 | 
 4 | from sklearn.manifold import LocallyLinearEmbedding
 5 | lle = LocallyLinearEmbedding(n_neighbors=15,
 6 |                              n_components=3, method='modified')
 7 | X_proj = lle.fit_transform(X)
 8 | 
 9 | three_component_plot(X_proj[:, 0], X_proj[:, 1], X_proj[:, 2], y, labels, trim_outliers=True)
10 | 


--------------------------------------------------------------------------------
/doc/notebooks/soln/03-03.py:
--------------------------------------------------------------------------------
1 | #03-03.py
2 | X, y = preprocess(data, shuffle=False, n_samples=1000, normalization=None)
3 | 
4 | from sklearn.manifold import Isomap
5 | iso = Isomap(n_neighbors=15, n_components=3)
6 | X_proj = iso.fit_transform(X)
7 | 
8 | three_component_plot(X_proj[:, 0], X_proj[:, 1], X_proj[:, 2], y, labels, trim_outliers=True)
9 | 


--------------------------------------------------------------------------------
/doc/regression.rst:
--------------------------------------------------------------------------------
  1 | .. _astronomy_regression:
  2 | 
  3 | =============================================
  4 | Regression: Photometric Redshifts of Galaxies
  5 | =============================================
  6 | 
  7 | Another important learning task in astronomy is the problem of determining
  8 | `redshifts <http://en.wikipedia.org/wiki/Redshift>`_ of distant galaxies.
  9 | In the current standard cosmological model, the
 10 | universe began nearly 14 billion years ago, in an explosive event commonly
 11 | known as the Big Bang.  Since then, the very fabric of space has been
 12 | `expanding <http://en.wikipedia.org/wiki/Expansion_of_space>`_,
 13 | so that distant galaxies appear to be moving away from us at
 14 | very high speeds.  The uniformity of this expansion means that there is
 15 | a relationship between the distance to a galaxy, and the speed that it
 16 | appears to be receeding from us (this relationship is known as 
 17 | `Hubble's Law <http://en.wikipedia.org/wiki/Hubble_expansion>`_, named
 18 | after Edwin Hubble).  This recession speed leads to a shift
 19 | in the frequency of photons, very similar to the more familiar
 20 | `doppler shift <http://en.wikipedia.org/wiki/Doppler_shift>`_ that
 21 | causes the pitch of a siren to change as an emergency vehicle passes by.
 22 | If a galaxy or star were
 23 | moving toward us, its light would be shifted to higher frequencies, or
 24 | `blue-shifted`.  Because the universe is expanding away from us, distant
 25 | galaxies appear to be `red-shifted`: their photons are shifted to lower
 26 | frequencies.
 27 | 
 28 | In cosmology, the redshift is measured with the parameter :math:`z`, defined
 29 | in terms of the observed wavelength :math:`\lambda_{obs}` and the emitted
 30 | wavelength :math:`\lambda_{em}`:
 31 | 
 32 | .. math::
 33 |    \lambda_{obs} = (1 + z)\lambda_{em}
 34 | 
 35 | When a spectrum can be obtained, determining the redshift is rather
 36 | straight-forward: if you can localize the spectral fingerprint of a common
 37 | element, such as hydrogen, then the redshift can be computed using simple
 38 | arithmetic.  But similarly to the case of Star/Quasar classification, the
 39 | task becomes much more difficult when only photometric observations are
 40 | available.
 41 | 
 42 | Because of the spectrum shift, an identical source at different redshifts
 43 | will have a different color through each pair of filters.  See the following
 44 | figure:
 45 | 
 46 | .. figure:: auto_examples/images/plot_sdss_filters_2.png
 47 |    :target: auto_examples/plot_sdss_filters.html
 48 |    :align: center
 49 |    :scale: 80%
 50 | 
 51 |    The spectrum of the star Vega (:math:`\alpha`-Lyr) at three different
 52 |    redshifts.  The SDSS ugriz filters are shown in gray for reference.
 53 | 
 54 | At redshift :math:`z=0.0`, the spectrum is bright in the `u` and `g` filters,
 55 | but dim in the `i` and `z` filters.  At redshift :math:`z=0.8`, the opposite
 56 | is the case.  This suggests the possibility of determining redshift from
 57 | photometry alone.  The situation is complicated by the fact that each
 58 | individual source has unique spectral characteristics, but nevertheless,
 59 | these `photometric redshifts` are often used in astronomical applications.
 60 | 
 61 | Motivation: Dark Energy, Dark Matter, and the Fate of the Universe
 62 | ------------------------------------------------------------------
 63 | The photometric redshift problem is very important.  Future astronomical
 64 | surveys hope to image trillions of very faint galaxies, and use this data
 65 | to inform our view of the universe as a whole: its history, its geometry,
 66 | and its fate.  Obtaining an accurate estimate of the redshift to each of these
 67 | galaxies is a pivotal part of this task.  Because these surveys will image
 68 | so many extremely faint galaxies, there is no possibility of obtaining a
 69 | spectrum for each one.  Thus sophisticated photometric redshift codes will
 70 | be required to advance our understanding of the Universe, including more
 71 | precisely understanding the nature of the dark energy that is currently
 72 | accelerating the cosmic expansion.
 73 | 
 74 | A Simple Method: Decision Tree Regression
 75 | -----------------------------------------
 76 | 
 77 | .. currentmodule:: sklearn.tree
 78 | 
 79 | .. note::
 80 | 
 81 |    The information in this section is available in an interactive notebook
 82 |    :download:`08_regression_example.ipynb <notebooks/08_regression_example.ipynb>`,
 83 |    which can be viewed using `iPython notebook`_.    An online static view can
 84 |    be seen `here <http://nbviewer.ipython.org/url/astroml.github.com/sklearn_tutorial/_downloads/08_regression_example.ipynb>`_.
 85 | 
 86 | Here we'll take an extremely simple approach to the photometric redshift
 87 | problem, using a decision tree.
 88 | In the folder ``$TUTORIAL_HOME/data/sdss_photoz``, there is a script
 89 | ``fetch_data.py`` which will download the colors of 400,000+ galaxies from
 90 | the Sloan Digital Sky Survey.  This script also includes a python
 91 | implementation of the SQL query used to construct this data.  This template
 92 | can be modified to download more features if desired.
 93 | Before executing the example below, run ``fetch_data.py``
 94 | to download the colors and redshifts.
 95 | 
 96 | First we will load this data, shuffle it in preparation for later, and arrange
 97 | the colors in an array of shape ``(n_samples, n_features)``::
 98 | 
 99 |    >>> import numpy as np
100 |    >>> data = np.load('data/sdss_photoz/sdss_photoz.npy')
101 |    >>> N = len(data)
102 |    >>> X = np.zeros((N, 4))
103 |    >>> X[:, 0] = data['u'] - data['g']
104 |    >>> X[:, 1] = data['g'] - data['r']
105 |    >>> X[:, 2] = data['r'] - data['i']
106 |    >>> X[:, 3] = data['i'] - data['z']
107 |    >>> z = data['redshift']
108 | 
109 | Next we'll split the data into two samples: a training sample and a test
110 | sample which we'll use to evaluate our training::
111 | 
112 |    >>> Ntrain = 3 * N / 4
113 |    >>> Xtrain = X[:Ntrain]
114 |    >>> ztrain = z[:Ntrain]
115 |    >>> Xtest = X[Ntrain:]
116 |    >>> ztest = z[Ntrain:]
117 | 
118 | Now we'll use the scikit-learn :class:`DecisionTreeRegressor` method to
119 | train a model and predict redshifts for the test set based on a
120 | 20-level decision tree::
121 | 
122 |    >>> from sklearn.tree import DecisionTreeRegressor
123 |    >>> clf = DecisionTreeRegressor(
124 |    >>> clf.fit(Xtrain, ztrain)
125 |    >>> zpred = clf.predict(Xtest)
126 |    
127 | To judge the efficacy of prediction, we can compute the root-mean-square
128 | difference between the true and predicted values::
129 | 
130 |    >>> rms = np.sqrt(np.mean((ztest - zpred) ** 2))
131 |    >>> print rms
132 |    0.221409442926
133 | 
134 | Our RMS error is about 0.22.  This is pretty good for such an unsophisticated
135 | learning algorithm, but better algorithms can improve on this.  The biggest
136 | issue here are the `catastrophic errors`, where the predicted redshift is
137 | extremely far from the prediction::
138 | 
139 |    >>> print len(ztest)
140 |    102798
141 |    >>> print np.sum(abs(ztest - zpred) > 1)
142 |    1538
143 | 
144 | About 1.5% of objects have redshift estimates which are off by greater than
145 | 1.  This sort of error in redshift determination is very problematic for
146 | high-precision cosmological studies.  This can be seen in a scatter plot of
147 | the predicted redshift versus the true redshift for the test data:
148 | 
149 | .. figure:: auto_examples/images/plot_sdss_photoz_1.png
150 |    :target: auto_examples/plot_sdss_photoz.html
151 |    :align: center
152 |    :scale: 80%
153 | 
154 |    The true and predicted redshifts of 102,798 SDSS galaxies, using a simple
155 |    decision tree regressor.  Notice the presece of catastrophic outliers:
156 |    those galaxies whose predicted redshifts are extremely far from the true
157 |    value.
158 | 
159 | Later, in :ref:`Exercise #2 <astro_exercise_2>`, we will attempt
160 | to improve on this by optimizing the parameters of the decision tree.
161 | 
162 | In practice, the solutions to the photometric redshift problem can benefit from
163 | approaches that use physical intuition as well as machine learning tools.
164 | For example, some solutions involve the use of libraries of synthetic
165 | galaxy spectra
166 | which are known to be representative of the true galaxy distribution.  This
167 | extra information can be used either directly, in a physically motivated
168 | analysis, or can be used to generate a larger suite of artificial training
169 | instances for a pure machine learning approach.
170 | 
171 | .. _`iPython notebook`: http://ipython.org/ipython-doc/stable/interactive/htmlnotebook.html
172 | 


--------------------------------------------------------------------------------
/doc/scikitlearn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astroML/sklearn_tutorial/0909361122d5b96379007516b55c6248afa86cfc/doc/scikitlearn.png


--------------------------------------------------------------------------------
/doc/setup.rst:
--------------------------------------------------------------------------------
  1 | .. _sklearn_tutorial_setup:
  2 | 
  3 | ===============================
  4 | Tutorial Setup and Installation
  5 | ===============================
  6 | 
  7 | .. topic:: Objectives
  8 | 
  9 |    At the end of this section, you will
 10 |   
 11 |    1. Have scikit-learn and all the prerequisites and dependencies for
 12 |       this tutorial installed on your machine.
 13 |    2. Download the source files and data required for this tutorial
 14 | 
 15 | Python Prerequisites
 16 | --------------------
 17 | 
 18 | This tutorial is based on scikit-learn, which has the following dependencies:
 19 | 
 20 | - `numpy <http://numpy.scipy.org>`_ : this is a python module which has powerful
 21 |   tools for the creation and manipulation of arrays.  It is the foundation of
 22 |   most scientific computing packages in python
 23 | 
 24 | - `scipy <http://www.scipy.org>`_ : this is a python module which builds on
 25 |   numpy and provides fast implementations of many basic scientific algorithms.
 26 | 
 27 | - `matplotlib <http://matplotlib.sourceforge.net/>`_ : this is a powerful
 28 |   package for generating plots, figures, and diagrams.  Our main form of
 29 |   visual interaction with data and results depends on matplotlib.
 30 | 
 31 | We will also make extensive use of `iPython <http://ipython.org>`_, an
 32 | interactive python interpreter.  In particular, much of the interactive
 33 | material requires `ipython notebook`_ functionality,
 34 | which was introduced in ipython version 0.12.
 35 | 
 36 | Installing scikit-learn and Dependencies
 37 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 38 | 
 39 | Please refer to the `install page <http://www.scikit-learn.org/stable/install.html#installing-an-official-release>`_ for
 40 | per-system instructions on installing scikit-learn.  In addition to
 41 | ``numpy``, ``scipy``, and ``scikit-learn``, this tutorial will assume that
 42 | you have ``matplotlib`` and ``ipython`` installed as well.
 43 | 
 44 |   * Under **Debian or Ubuntu Linux** you should use::
 45 | 
 46 |       % sudo apt-get install build-essential python-dev python-numpy \
 47 |         python-numpy-dev python-scipy libatlas-dev g++ python-matplotlib \
 48 |         ipython
 49 | 
 50 |     If you are under Ubuntu 12.04+, then you can use
 51 |       
 52 |       % sudo apt-get install ipython-notebook
 53 | 
 54 |     to install ipython notebook with all dependencies.
 55 | 
 56 |   * Under **MacOSX** you should probably use a scientific python distribution
 57 |     such as `Scipy Superpack`_
 58 | 
 59 |   * Under **Windows** the `Python(x,y)`_ is probably your best bet to get a
 60 |     working numpy / scipy environment up and running.
 61 | 
 62 |   * Power-users may wish to install bleeding edge versions of these
 63 |     packages from the source.  The source can be downloaded using
 64 |     ``git`` from the packages' respective `GitHub`_ repositories.
 65 | 
 66 | Alternatively under Windows and MaxOSX you can use the EPD_ (Enthought
 67 | Python Distribution) which is a (non-open source) packaging of the
 68 | scientific python stack.
 69 | 
 70 | .. note::
 71 | 
 72 |    that to use `ipython notebook`_, you must install ``ipython`` version
 73 |    0.12 and several other dependencies.  Refer to the ipython documentation
 74 |    for details.
 75 | 
 76 | .. _`Scipy Superpack`: http://fonnesbeck.github.com/ScipySuperpack/
 77 | .. _`Python(x,y)`: http://www.pythonxy.com/
 78 | .. _EPD: https://www.enthought.com/products/epd.php
 79 | .. _GitHub: http://www.github.com
 80 | .. _`ipython notebook`: http://ipython.org/ipython-doc/stable/interactive/htmlnotebook.html
 81 | 
 82 | 
 83 | Tutorial Files
 84 | --------------
 85 | The source code for the example files in the following pages is best
 86 | accessed through cloning the scikit-learn repository using
 87 | `git <http://git-scm.com/>`_.  Once ``git`` is installed the
 88 | command to accomplish this is::
 89 | 
 90 |     % git clone https://github.com/astroML/sklearn_tutorial
 91 | 
 92 | This creates a directory called ``sklearn_tutorial`` and copies all
 93 | the source files of this tutorial.  Most of the relevant files are
 94 | in the ``sklearn_tutorial/doc`` sub-directory.
 95 | In what follows, this directory will be named ``$TUTORIAL_HOME``. It
 96 | should contain the following folders:
 97 | 
 98 |   * ``data`` - folder to put the datasets used during the tutorial
 99 | 
100 |   * ``skeletons`` - sample incomplete scripts for the exercices
101 |     (these should be used only if ``ipython notebook`` is unavailable)
102 | 
103 |   * ``solutions`` - solutions of the exercices
104 |     (these should be used only if ``ipython notebook`` is unavailable)
105 | 
106 |   * ``notebooks`` - ipython notebooks which provide an interactive interface
107 |     to parts of this tutorial.  These contain material which is not in the
108 |     skeletons and solutions.
109 | 
110 | If you are not going to use ipython notebook to run the examples, you
111 | can copy the skeletons into a new folder named ``workspace``
112 | where you will edit your own files for the exercices while keeping
113 | the original skeletons intact::
114 | 
115 |     % cp -r skeletons workspace
116 | 
117 | 
118 | Download the datasets
119 | ---------------------
120 | 
121 | Machine Learning algorithms need data. Go to each ``$TUTORIAL_HOME/data/``
122 | sub-folder and run the ``fetch_data.py`` script from there (after
123 | having read them first).  This will download a dataset to the current
124 | directory.  This tutorial has three such datasets; they will be used
125 | in the examples and exercises later on.
126 | 
127 | To get all three datasets, run the following::
128 | 
129 |     % cd $TUTORIAL_HOME/data/sdss_colors
130 |     % python fetch_data.py
131 | 
132 |     % cd $TUTORIAL_HOME/data/sdss_photoz
133 |     % python fetch_data.py
134 | 
135 |     % cd $TUTORIAL_HOME/data/sdss_spectra
136 |     % python fetch_data.py
137 | 


--------------------------------------------------------------------------------
/doc/skeletons/exercise_01.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Astronomy Tutorial: exercise 1
  3 | 
  4 | Classification of photometric sources
  5 | 
  6 | usage: python exercise_01.py datadir
  7 | 
  8 |   - datadir is $TUTORIAL_DIR/data/sdss_colors
  9 |     This directory should contain the files:
 10 |        - sdssdr6_colors_class_train.npy
 11 |        - sdssdr6_colors_class.200000.npy
 12 | 
 13 | Description:
 14 | In the tutorial, we used a Naive Bayes Classifier to separate Quasars
 15 | And Stars.  In this exercise, we will extend this classification scheme
 16 | using Gaussian Mixture Models.
 17 | 
 18 | The Gaussian Naive Bayes method starts by fitting an N-dimensional gaussian
 19 | distribution to each class of data.  When a test point is evaluated, the
 20 | relative log-likelihood from each distribution is used to predict the most
 21 | likely value.  We're going to extend this by fitting a sum of gaussians to
 22 | each distribution.
 23 | 
 24 | There are several places in this file with code to be filled-in as part of
 25 | the exercise.  Each of these is labeled TODO below.
 26 | """
 27 | import os, sys
 28 | import numpy as np
 29 | import pylab as pl
 30 | from sklearn.mixture import gmm
 31 | from sklearn import metrics
 32 | 
 33 | try:
 34 |     datadir = sys.argv[1]
 35 | except:
 36 |     print __doc__
 37 |     sys.exit()
 38 | 
 39 | #----------------------------------------------------------------------
 40 | # Load data files
 41 | train_data = np.load(os.path.join(datadir,
 42 |                                   'sdssdr6_colors_class_train.npy'))
 43 | test_data = np.load(os.path.join(datadir,
 44 |                                  'sdssdr6_colors_class.200000.npy'))
 45 | 
 46 | # set the number of training points: using all points leads to a very
 47 | # long running time.  We'll start with 10000 training points.  This
 48 | # can be increased if desired.
 49 | Ntrain = 10000
 50 | #Ntrain = len(train_data)
 51 | 
 52 | np.random.seed(0)
 53 | np.random.shuffle(train_data)
 54 | train_data = train_data[:Ntrain]
 55 | 
 56 | #----------------------------------------------------------------------
 57 | # Split training data into training and cross-validation sets
 58 | N_crossval = Ntrain / 5
 59 | train_data = train_data[:-N_crossval]
 60 | crossval_data = train_data[-N_crossval:]
 61 | 
 62 | #----------------------------------------------------------------------
 63 | # Set up data
 64 | #
 65 | X_train = np.zeros((train_data.size, 4), dtype=float)
 66 | X_train[:, 0] = train_data['u-g']
 67 | X_train[:, 1] = train_data['g-r']
 68 | X_train[:, 2] = train_data['r-i']
 69 | X_train[:, 3] = train_data['i-z']
 70 | y_train = (train_data['redshift'] > 0).astype(int)
 71 | Ntrain = len(y_train)
 72 | 
 73 | X_crossval = np.zeros((crossval_data.size, 4), dtype=float)
 74 | X_crossval[:, 0] = crossval_data['u-g']
 75 | X_crossval[:, 1] = crossval_data['g-r']
 76 | X_crossval[:, 2] = crossval_data['r-i']
 77 | X_crossval[:, 3] = crossval_data['i-z']
 78 | y_crossval = (crossval_data['redshift'] > 0).astype(int)
 79 | Ncrossval = len(y_crossval)
 80 | 
 81 | #======================================================================
 82 | # Recreating Gaussian Naive Bayes
 83 | #
 84 | #   Here we will use Gaussian Mixture Models to duplicate our Gaussian
 85 | #   Naive Bayes results from earlier.  You'll create two sklearn.gmm.GMM()
 86 | #   classifier instances, named `clf_0` and `clf_1`.  Each should be
 87 | #   initialized with a single component, and diagonal covariance.
 88 | #   (hint: look at the doc string for sklearn.gmm.GMM to see how to set
 89 | #   this up).  The results should be compared to Gaussian Naive Bayes
 90 | #   to check if they're correct.
 91 | #
 92 | #   Objects to create:
 93 | #    - clf_0 : trained on the portion of the training data with y == 0
 94 | #    - clf_1 : trained on the portion of the training data with y == 1
 95 | 
 96 | # TODO:  compute clf_0, clf_1
 97 | 
 98 | # next we must construct the prior.  The prior is the fraction of training
 99 | # points of each type.
100 | # 
101 | # variables to compute:
102 | #  - prior0 : fraction of training points with y == 0
103 | #  - prior1 : fraction of training points with y == 1
104 | 
105 | # TODO:  compute prior0, prior1
106 | 
107 | # Now we use the prior and the classifiation to compute the log-likelihoods
108 | #  of the cross-validation points.  The log likelihood is given by
109 | #
110 | #    logL(x) = clf.score(x) + log(prior)
111 | #
112 | #  You can use the function np.log() to compute the logarithm of the prior.
113 | #  variables to compute:
114 | #    logL : array, shape = (2, Ncrossval)
115 | #            logL[0] is the log-likelihood for y == 0
116 | #            logL[1] is the log-likelihood for y == 1
117 | logL = None
118 | 
119 | # TODO:  compute logL
120 | 
121 | # the predicted value for each sample is the index with the largest
122 | # log-likelihood.
123 | y_pred = np.argmax(logL, 0)
124 | 
125 | # now we print the results.  We'll use the built-in classification
126 | # report function in sklearn.metrics.  This computes the precision,
127 | # recall, and f1-score for each class.
128 | 
129 | print "------------------------------------------------------------"
130 | print "One-component Gaussian Mixture:"
131 | print "  results for cross-validation set:"
132 | print metrics.classification_report(y_crossval, y_pred,
133 |                                     target_names=['stars', 'QSOs'])
134 | 
135 | 
136 | 
137 | #----------------------------------------------------------------------
138 | #  Run Gaussian Naive Bayes to double-check that our results are correct.
139 | #  Because of rounding errors, it will not be exact, but the results should
140 | #  be very close.
141 | from sklearn.naive_bayes import GaussianNB
142 | gnb = GaussianNB()
143 | gnb.fit(X_train, y_train)
144 | y_pred = gnb.predict(X_crossval)
145 | 
146 | print "------------------------------------------------------------"
147 | print "Gaussian Naive Bayes"
148 | print "  results for cross-validation set:"
149 | print "  (results should be within ~0.01 of above results)"
150 | print metrics.classification_report(y_crossval, y_pred,
151 |                                     target_names=['stars', 'QSOs'])
152 | 
153 | #======================================================================
154 | #  Parameter optimization:
155 | #
156 | #   Now take some time to experiment with the covariance type and the
157 | #   number of components, to see if you can optimize the F1 score
158 | #
159 | #   Note that for a large number of components, the fit can take a long
160 | #   time, and will be dependent on the starting position.  Use the
161 | #   documentation string of GMM to determine the options for covariance.
162 | #
163 | #   It may be helpful to use only a subset of the training data while
164 | #   experimenting with these parameter values.  This is called
165 | #   "Meta-parameter optimization".  It can be accomplished automatically,
166 | #   but here we are doing it by hand for learning purposes.
167 | y_pred = None
168 | 
169 | # TODO:  compute y_pred for cross-validation data
170 | 
171 | print "------------------------------------------------------------"
172 | print "GMM with tweaked parameters:"
173 | print "  results for cross-validation set"
174 | print metrics.classification_report(y_crossval, y_pred,
175 |                                     target_names=['stars', 'QSOs'])
176 | 
177 | #----------------------------------------------------------------------
178 | # Test Data
179 | # once you have maximized the cross-validation, you can apply the estimator
180 | # to your test data, and check how it compares to the predicted results
181 | # from the researcher who compiled it.
182 | 
183 | X_test = np.zeros((test_data.size, 4), dtype=float)
184 | X_test[:, 0] = test_data['u-g']
185 | X_test[:, 1] = test_data['g-r']
186 | X_test[:, 2] = test_data['r-i']
187 | X_test[:, 3] = test_data['i-z']
188 | y_pred_literature = (test_data['label'] == 0).astype(int)
189 | Ntest = len(y_pred_literature)
190 | 
191 | # here you should compute y_pred for the test data, using the classifiers
192 | # clf_0 and clf_1 which you already trained above.
193 | 
194 | y_pred = None
195 | 
196 | # TODO:  compute y_pred for test data
197 | 
198 | print "------------------------------------------------------------"
199 | print "Comparison of current results with published results"
200 | print "  results for test set"
201 | print "    (treating published results as the 'true' result)"
202 | print metrics.classification_report(y_pred_literature, y_pred,
203 |                                     target_names=['stars', 'QSOs'])
204 | 


--------------------------------------------------------------------------------
/doc/skeletons/exercise_02.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Astronomy Tutorial: exercise 2
  3 | 
  4 | Photometric redshift determination
  5 | 
  6 | usage: python exercise_02.py datadir
  7 | 
  8 |   - datadir is $TUTORIAL_DIR/data/sdss_photoz
  9 |     This directory should contain the files:
 10 |        - sdss_photoz.npy
 11 | 
 12 | Here we will take a closer look at the photometric redshift problem discussed
 13 | in section 5 of the tutorial.  Using the decision tree classifier, we'll take
 14 | a look at the 4-color observations of just over 400,000 points.
 15 | 
 16 | The point of this exercise is to answer the question: how can we get the rms
 17 | error down to below 0.1?  Would it be a better use of telescope time to
 18 | observe more objects, or to observe additional features of the objects
 19 | in the data set?  We'll use the techniques discussed in section 3 of the
 20 | tutorial.
 21 | """
 22 | import os, sys
 23 | import numpy as np
 24 | import pylab as pl
 25 | 
 26 | from sklearn.tree import DecisionTreeRegressor
 27 | from sklearn import metrics
 28 | 
 29 | try:
 30 |     datadir = sys.argv[1]
 31 | except:
 32 |     print __doc__
 33 |     sys.exit()
 34 | 
 35 | def compute_rms_error(y_pred, y_true):
 36 |     """Compute the rms error between the arrays y_pred and y_true"""
 37 |     return np.sqrt(metrics.mean_squared_error(y_pred, y_true))
 38 | 
 39 | def compute_outlier_fraction(y_pred, y_true, cutoff=0.2):
 40 |     """Compute the outlier rate between the arrays y_pred and y_true"""
 41 |     return np.sum((abs(y_pred - y_true) > cutoff)) * 1. / len(y_pred)
 42 | 
 43 | #------------------------------------------------------------
 44 | # load data and compute colors
 45 | data = np.load(os.path.join(datadir, 'sdss_photoz.npy'))
 46 | 
 47 | # here we'll truncate the data to 50,000 points.  This will allow the code
 48 | # below to be run quickly while it's being written.  When you're satisfied
 49 | # that the code is ready to go, you can comment out this line.
 50 | data = data[:50000]
 51 | 
 52 | print '%i points' % data.shape[0]
 53 | u, g, r, i, z = [data[f] for f in 'ugriz']
 54 | 
 55 | X = np.zeros((len(data), 4))
 56 | X[:, 0] = u - g
 57 | X[:, 1] = g - r
 58 | X[:, 2] = r - i
 59 | X[:, 3] = i - z
 60 | 
 61 | y = data['redshift']
 62 | 
 63 | #------------------------------------------------------------
 64 | # divide into training, cross-validation, and test samples
 65 | Ntot = len(y)
 66 | 
 67 | Ncv = Ntot / 5
 68 | Ntest = Ntot / 5
 69 | Ntrain = Ntot - Ncv - Ntest
 70 | 
 71 | X_train = X[:Ntrain]
 72 | y_train = y[:Ntrain]
 73 | 
 74 | X_cv = X[Ntrain:Ntrain + Ncv]
 75 | y_cv = y[Ntrain:Ntrain + Ncv]
 76 | 
 77 | X_test = X[Ntrain + Ncv:]
 78 | y_test = y[Ntrain + Ncv:]
 79 | 
 80 | #------------------------------------------------------------
 81 | # plot the Decision Tree error as a function of max_depth parameter
 82 | #
 83 | #  This is the first main part of the exercise.  This is photometric
 84 | #  redshift determination using DecisionTreeRegressor.  Here you'll plot
 85 | #  the training error and cross-validation error as a function of the
 86 | #  meta-parameter 'max_depth'.
 87 | #
 88 | #  You will create three arrays: max_depth_array, train_error, and cv_error.
 89 | #  Use at least 10 different values of max_depth, and compute the training
 90 | #  and cross-validation error associated with each of them.
 91 | #
 92 | #  note that the error can be computed with the function compute_rms_error()
 93 | 
 94 | max_depth_array = []
 95 | train_error = []
 96 | cv_error = []
 97 | 
 98 | # TODO:  compute the arrays max_depth_array, train_error, and cv_error
 99 | 
100 | pl.figure()
101 | pl.plot(max_depth_array, cv_error, label='cross-val error')
102 | pl.plot(max_depth_array, train_error, label='training error')
103 | 
104 | pl.legend()
105 | pl.xlabel('max depth')
106 | pl.ylabel('error')
107 | 
108 | # select the value of max_depth which led to the best results
109 | max_depth = max_depth_array[np.argmin(cv_error)]
110 | print "max_depth = %i" % max_depth
111 | 
112 | #------------------------------------------------------------
113 | # plot the Decision Tree error as a function of number of samples
114 | #
115 | #  This is the second main part of the exercise.  Here you'll plot the
116 | #  training error and cross-validation error as a function of the
117 | #  number of training samples.
118 | #
119 | #  You will create three arrays: n_samples_array, train_error, and cv_error.
120 | #  Use at least 40 different values of n_samples, and compute the training
121 | #  and cross-validation error associated with each of them.
122 | #
123 | #  Make sure that when computing the training error for each number of
124 | #  samples, you use the same samples that the model was trained on.
125 | 
126 | n_samples_array = []
127 | train_error = []
128 | cv_error = []
129 | 
130 | # TODO:  compute the arrays n_samples_array, train_error, and cv_error
131 | 
132 | pl.figure()
133 | pl.plot(n_samples_array, cv_error, label='cross-val error')
134 | pl.plot(n_samples_array, train_error, label='training error')
135 | 
136 | pl.legend()
137 | pl.xlabel('number of samples')
138 | pl.ylabel('error')
139 | 
140 | #----------------------------------------------------------------------
141 | # Use the whole dataset:
142 | #  If you have been running your code on only a part of the dataset,
143 | #  now that you have it working, you can run it on the full dataset
144 | #  (note: this will take a long time to execute!)  You can do this by
145 | #  commenting out the line
146 | #     data = data[:50000]
147 | #  above.  How does this change the results?
148 | 
149 | 
150 | #------------------------------------------------------------
151 | # Catastrophic Outliers
152 | #  Though the rms error is one useful measure of the performance of an
153 | #  algorithm, astronomers are often more interested in reducing the
154 | #  'catastrophic outlier' rate.  Catastrophic outliers are points which
155 | #  are given redshifts very far from the true value.  For accuracy of
156 | #  cosmological results, this is often more important than the overall
157 | #  rms error.
158 | #
159 | #  Here, you can re-implement te above tasks, plotting the catastrophic
160 | #  outlier rate as a function of the max_depth parameter, and as a function
161 | #  of the number of training points.  This can be accomplished either by
162 | #  copying and pasting the above code here, or by modifying the above code.
163 | #
164 | #  To compute the catastrophic error rate, you can use the function
165 | #  compute_outlier_fraction()
166 | 
167 | # TODO:  repeat the above two plots using catastrophic error rate
168 | 
169 | #----------------------------------------------------------------------
170 | # Analyze the results
171 | #
172 | #  Compare your results to the discussion of bias and variance in section
173 | #  3.  How do you think these results could be improved?  Is it better to
174 | #  spend telescope time increasing the size of the training set, or would
175 | #  it be better to measure more features of the objects we already have?
176 | #  Does this recommendation change if the astronomer is interested in
177 | #  minimizing the number of catastrophic outliers rather than the rms error?
178 | 
179 | pl.show()
180 | 


--------------------------------------------------------------------------------
/doc/skeletons/exercise_03.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Astronomy Tutorial: exercise 3
  3 | 
  4 | Dimensionality reduction of stellar spectra
  5 | 
  6 | Usage: python exercise_03.py datadir [-m method] [-k n_neigbors]
  7 |                                      [-n norm_type] [-N n_samples]
  8 |                                      [-s]
  9 | 
 10 |   - datadir is $TUTORIAL_DIR/data/sdss_photoz
 11 |     This directory should contain the file sdss_photoz.npy
 12 | 
 13 |   - method is one of [pca | lle | mlle | isomap].  If not specified,
 14 |     PCA will be performed
 15 |     
 16 |   - n_neighbors is an integer number of neighbors to use with manifold methods
 17 | 
 18 |   - norm_type is one of [none | l1 | l2].  It specifies how the data should
 19 |     be normalized.
 20 | 
 21 |   - n_samples is the number of samples used for the projection.  Only 1000
 22 |     of the 4000 samples are used by default.
 23 | 
 24 |   - specifying -s shuffles the data.  This can help test for stability of
 25 |     the reconstruction.
 26 | 
 27 | Description:
 28 | In this tutorial, we explore manifold learning techniques to visualize 4000
 29 | SDSS spectral data.  This is a much more exploratory exercise than the previous
 30 | two.  The goal is to determine how to best visualize this high-dimensional
 31 | space.  You will implement PCA, LLE, Modified LLE, and Isomap, for various
 32 | data normalizations.  The goal is to find the best visualization of the
 33 | data, where "best" in this case is a qualitative measure of how well the
 34 | different classes of points are separated in the projected space.
 35 | 
 36 | To make experimentation more streamlined
 37 | 
 38 | There are several places in this file with code to be filled-in as part of
 39 | the exercise.  Each of these is labeled TODO below.
 40 | """
 41 | 
 42 | import os, sys
 43 | import numpy as np
 44 | 
 45 | import pylab as pl
 46 | from matplotlib import ticker
 47 | 
 48 | from sklearn import preprocessing
 49 | from sklearn.decomposition import RandomizedPCA
 50 | from sklearn.manifold import LocallyLinearEmbedding, Isomap
 51 | 
 52 | #----------------------------------------------------------------------
 53 | # set up command-line option parser
 54 | from optparse import OptionParser
 55 | parser = OptionParser(usage=__doc__,
 56 |                       version="%prog 1.0")
 57 | parser.add_option("-m", "--method",
 58 |                   dest="method",
 59 |                   default='pca',
 60 |                   help="Specify method to use: [pca | lle | mlle | isomap]")
 61 | 
 62 | parser.add_option("-k", "--neighbors",
 63 |                   dest="n_neighbors",
 64 |                   type="int",
 65 |                   default=15,
 66 |                   help='Specify number of neighbors for manifold learning')
 67 | 
 68 | parser.add_option("-N", "--normalization",
 69 |                   dest="normalization",
 70 |                   default="none",
 71 |                   help="Specify normalization: [none | l1 | l2]")
 72 | 
 73 | parser.add_option("-n", "--n_samples",
 74 |                   dest="n_samples",
 75 |                   type="int",
 76 |                   default=1000,
 77 |                   help="Specify number of samples to use, up to 4000 (default 1000)")
 78 | 
 79 | parser.add_option("-s", "--shuffle",
 80 |                   dest="shuffle",
 81 |                   action="store_true",
 82 |                   default=False,
 83 |                   help="shuffle the data")
 84 | 
 85 | 
 86 | options, args = parser.parse_args()
 87 | 
 88 | if len(args) == 0:
 89 |     parser.error("Must specify a data directory")
 90 | elif len(args) > 1:
 91 |     parser.error("Must specify a single data directory")
 92 | 
 93 | datadir = args[0]
 94 | 
 95 | print "data directory: %s" % datadir
 96 | print " method = %s" % options.method
 97 | print " n_neighbors = %i" % options.n_neighbors
 98 | print " normalization = %s" % options.normalization
 99 | print " n_samples: %i" % options.n_samples
100 | print " shuffle: %s" % options.shuffle
101 | 
102 | 
103 | def three_component_plot(c1, c2, c3, color, labels):
104 |     pl.figure(figsize=(8,8))
105 |     kwargs = dict(s=4, lw=0, c=color, vmin=2, vmax=6)
106 |     ax1 = pl.subplot(221)
107 |     pl.scatter(c1, c2, **kwargs)
108 |     pl.ylabel('component 2')
109 | 
110 |     ax2 = pl.subplot(223, sharex=ax1)
111 |     pl.scatter(c1, c3, **kwargs)
112 |     pl.xlabel('component 1')
113 |     pl.ylabel('component 3')
114 | 
115 |     ax3 = pl.subplot(224, sharey=ax2)
116 |     pl.scatter(c2, c3, **kwargs)
117 |     pl.xlabel('component 2')
118 | 
119 |     for ax in (ax1, ax2, ax3):
120 |         ax.xaxis.set_major_formatter(ticker.NullFormatter())
121 |         ax.yaxis.set_major_formatter(ticker.NullFormatter())
122 | 
123 |     pl.subplots_adjust(hspace=0.05, wspace=0.05)
124 | 
125 |     format = ticker.FuncFormatter(lambda i, *args: labels[i])
126 |     pl.colorbar(ticks = range(2, 7), format=format,
127 |                 cax = pl.axes((0.52, 0.51, 0.02, 0.39)))
128 |     pl.clim(1.5, 6.5)
129 | 
130 | 
131 | #----------------------------------------------------------------------
132 | # Load data files
133 | data = np.load(os.path.join(datadir, 'spec4000_corrected.npz'))
134 | 
135 | X = data['X']
136 | y = data['y']
137 | labels = data['labels']
138 | 
139 | if options.shuffle:
140 |     i = np.arange(y.shape[0], dtype=int)
141 |     np.random.shuffle(i)
142 |     X = X[i]
143 |     y = y[i]
144 | 
145 | #----------------------------------------------------------------------
146 | # truncate the data for experimentation
147 | #
148 | #  There are 4000 points, which can take a long time to run.  By default,
149 | #  it is truncated to 1000 samples.  This can be changed using the -n
150 | #  command-line argument.
151 | 
152 | X = X[:options.n_samples]
153 | y = y[:options.n_samples]
154 | 
155 | #----------------------------------------------------------------------
156 | # Normalization: 
157 | #  
158 | #  The results of the dimensionality reduction can depend heavily on the
159 | #  data normalization.  These can be commented or un-commented to try
160 | #  l1 or l2 normalization.
161 | 
162 | if options.normalization.lower() == 'none':
163 |     pass
164 | elif options.normalization.lower() == 'l2':
165 |     X = preprocessing.normalize(X, 'l2')
166 | elif options.normalization.lower() == 'l1':
167 |     X = preprocessing.normalize(X, 'l1')
168 | else:
169 |     raise ValueError("Unrecognized normalization: '%s'" % options.normalization)
170 | 
171 | #======================================================================
172 | # TODO: compute X_proj for each method.
173 | #   In each of the below cases, you should compute a projection of the
174 | #   data and store that projection in the matrix X_proj.
175 | #   X_proj should have the same number of rows as X, and should have
176 | #   at least 3 features.
177 | 
178 | X_proj = None
179 | 
180 | if options.method == 'pca':
181 |     print "Performing PCA"
182 |     # TODO:  compute a RandomizedPCA projection of X with n_components >= 3
183 | 
184 | elif options.method == 'lle':
185 |     print "Performing LLE"
186 |     # TODO:  compute LLE on X with method='standard', and out_dim >= 3
187 | 
188 | 
189 | elif options.method == 'mlle':
190 |     print "Performing MLLE"
191 |     # TODO:  compute LLE on X with method='modified' and out_dim >= 3
192 | 
193 | elif options.method == 'isomap':
194 |     print "Performing Isomap"
195 |     # TODO:  compute Isomap on X with out_dim >= 3
196 | 
197 | else:
198 |     raise ValueError("Unrecognized method: '%s'" % options.method)
199 | 
200 | three_component_plot(X_proj[:, 0], X_proj[:, 1], X_proj[:, 2], y, labels)
201 | pl.show()
202 | 
203 | 


--------------------------------------------------------------------------------
/doc/solutions/exercise_01.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Astronomy Tutorial: exercise 1
  3 | 
  4 | Classification of photometric sources
  5 | 
  6 | usage: python exercise_01.py datadir
  7 | 
  8 |   - datadir is $TUTORIAL_DIR/data/sdss_colors
  9 |     This directory should contain the files:
 10 |        - sdssdr6_colors_class_train.npy
 11 |        - sdssdr6_colors_class.200000.npy
 12 | 
 13 | Description:
 14 | In the tutorial, we used a Naive Bayes Classifier to separate Quasars
 15 | And Stars.  In this exercise, we will extend this classification scheme
 16 | using Gaussian Mixture Models.
 17 | 
 18 | The Gaussian Naive Bayes method starts by fitting an N-dimensional gaussian
 19 | distribution to each class of data.  When a test point is evaluated, the
 20 | relative log-likelihood from each distribution is used to predict the most
 21 | likely value.  We're going to extend this by fitting a sum of gaussians to
 22 | each distribution.
 23 | 
 24 | There are several places in this file with code to be filled-in as part of
 25 | the exercise.  Each of these is labeled TODO below.
 26 | """
 27 | import os, sys
 28 | import numpy as np
 29 | import pylab as pl
 30 | from sklearn.mixture import gmm
 31 | from sklearn import metrics
 32 | 
 33 | try:
 34 |     datadir = sys.argv[1]
 35 | except:
 36 |     print __doc__
 37 |     sys.exit()
 38 | 
 39 | #----------------------------------------------------------------------
 40 | # Load data files
 41 | train_data = np.load(os.path.join(datadir,
 42 |                                   'sdssdr6_colors_class_train.npy'))
 43 | test_data = np.load(os.path.join(datadir,
 44 |                                  'sdssdr6_colors_class.200000.npy'))
 45 | 
 46 | # set the number of training points: using all points leads to a very
 47 | # long running time.  We'll start with 10000 training points.  This
 48 | # can be increased if desired.
 49 | Ntrain = 10000
 50 | #Ntrain = len(train_data)
 51 | 
 52 | np.random.seed(0)
 53 | np.random.shuffle(train_data)
 54 | train_data = train_data[:Ntrain]
 55 | 
 56 | #----------------------------------------------------------------------
 57 | # Split training data into training and cross-validation sets
 58 | N_crossval = Ntrain / 5
 59 | train_data = train_data[:-N_crossval]
 60 | crossval_data = train_data[-N_crossval:]
 61 | 
 62 | #----------------------------------------------------------------------
 63 | # Set up data
 64 | #
 65 | X_train = np.zeros((train_data.size, 4), dtype=float)
 66 | X_train[:, 0] = train_data['u-g']
 67 | X_train[:, 1] = train_data['g-r']
 68 | X_train[:, 2] = train_data['r-i']
 69 | X_train[:, 3] = train_data['i-z']
 70 | y_train = (train_data['redshift'] > 0).astype(int)
 71 | Ntrain = len(y_train)
 72 | 
 73 | X_crossval = np.zeros((crossval_data.size, 4), dtype=float)
 74 | X_crossval[:, 0] = crossval_data['u-g']
 75 | X_crossval[:, 1] = crossval_data['g-r']
 76 | X_crossval[:, 2] = crossval_data['r-i']
 77 | X_crossval[:, 3] = crossval_data['i-z']
 78 | y_crossval = (crossval_data['redshift'] > 0).astype(int)
 79 | Ncrossval = len(y_crossval)
 80 | 
 81 | #======================================================================
 82 | # Recreating Gaussian Naive Bayes
 83 | #
 84 | #   Here we will use Gaussian Mixture Models to duplicate our Gaussian
 85 | #   Naive Bayes results from earlier.  You'll create two sklearn.gmm.GMM()
 86 | #   classifier instances, named `clf_0` and `clf_1`.  Each should be
 87 | #   initialized with a single component, and diagonal covariance.
 88 | #   (hint: look at the doc string for sklearn.gmm.GMM to see how to set
 89 | #   this up).  The results should be compared to Gaussian Naive Bayes
 90 | #   to check if they're correct.
 91 | #
 92 | #   Objects to create:
 93 | #    - clf_0 : trained on the portion of the training data with y == 0
 94 | #    - clf_1 : trained on the portion of the training data with y == 1
 95 | 
 96 | #{{{ compute clf_0, clf_1
 97 | clf_0 = gmm.GMM(1, 'diag')
 98 | i0 = (y_train == 0)
 99 | clf_0.fit(X_train[i0])
100 | 
101 | clf_1 = gmm.GMM(1, 'diag')
102 | i1 = (y_train == 1)
103 | clf_1.fit(X_train[i1])
104 | #}}}
105 | 
106 | # next we must construct the prior.  The prior is the fraction of training
107 | # points of each type.
108 | # 
109 | # variables to compute:
110 | #  - prior0 : fraction of training points with y == 0
111 | #  - prior1 : fraction of training points with y == 1
112 | 
113 | #{{{ compute prior0, prior1
114 | num0 = i0.sum()
115 | num1 = i1.sum()
116 | 
117 | prior0 = num0 / float(Ntrain)
118 | prior1 = num1 / float(Ntrain)
119 | #}}}
120 | 
121 | # Now we use the prior and the classifiation to compute the log-likelihoods
122 | #  of the cross-validation points.  The log likelihood is given by
123 | #
124 | #    logL(x) = clf.score(x) + log(prior)
125 | #
126 | #  You can use the function np.log() to compute the logarithm of the prior.
127 | #  variables to compute:
128 | #    logL : array, shape = (2, Ncrossval)
129 | #            logL[0] is the log-likelihood for y == 0
130 | #            logL[1] is the log-likelihood for y == 1
131 | logL = None
132 | 
133 | #{{{ compute logL
134 | logL = np.zeros((2, Ncrossval))
135 | logL[0] = clf_0.score(X_crossval) + np.log(prior0)
136 | logL[1] = clf_1.score(X_crossval) + np.log(prior1)
137 | #}}}
138 | 
139 | # the predicted value for each sample is the index with the largest
140 | # log-likelihood.
141 | y_pred = np.argmax(logL, 0)
142 | 
143 | # now we print the results.  We'll use the built-in classification
144 | # report function in sklearn.metrics.  This computes the precision,
145 | # recall, and f1-score for each class.
146 | 
147 | print "------------------------------------------------------------"
148 | print "One-component Gaussian Mixture:"
149 | print "  results for cross-validation set:"
150 | print metrics.classification_report(y_crossval, y_pred,
151 |                                     target_names=['stars', 'QSOs'])
152 | 
153 | 
154 | 
155 | #----------------------------------------------------------------------
156 | #  Run Gaussian Naive Bayes to double-check that our results are correct.
157 | #  Because of rounding errors, it will not be exact, but the results should
158 | #  be very close.
159 | from sklearn.naive_bayes import GaussianNB
160 | gnb = GaussianNB()
161 | gnb.fit(X_train, y_train)
162 | y_pred = gnb.predict(X_crossval)
163 | 
164 | print "------------------------------------------------------------"
165 | print "Gaussian Naive Bayes"
166 | print "  results for cross-validation set:"
167 | print "  (results should be within ~0.01 of above results)"
168 | print metrics.classification_report(y_crossval, y_pred,
169 |                                     target_names=['stars', 'QSOs'])
170 | 
171 | #======================================================================
172 | #  Parameter optimization:
173 | #
174 | #   Now take some time to experiment with the covariance type and the
175 | #   number of components, to see if you can optimize the F1 score
176 | #
177 | #   Note that for a large number of components, the fit can take a long
178 | #   time, and will be dependent on the starting position.  Use the
179 | #   documentation string of GMM to determine the options for covariance.
180 | #
181 | #   It may be helpful to use only a subset of the training data while
182 | #   experimenting with these parameter values.  This is called
183 | #   "Meta-parameter optimization".  It can be accomplished automatically,
184 | #   but here we are doing it by hand for learning purposes.
185 | y_pred = None
186 | 
187 | #{{{ compute y_pred for cross-validation data
188 | clf_0 = gmm.GMM(5, 'full', random_state=0)
189 | i0 = (y_train == 0)
190 | clf_0.fit(X_train[i0])
191 | 
192 | clf_1 = gmm.GMM(5, 'full', random_state=0)
193 | i1 = (y_train == 1)
194 | clf_1.fit(X_train[i1])
195 | 
196 | logL = np.zeros((2, Ncrossval))
197 | logL[0] = clf_0.score(X_crossval) + np.log(prior0)
198 | logL[1] = clf_1.score(X_crossval) + np.log(prior1)
199 | 
200 | y_pred = np.argmax(logL, 0)
201 | #}}}
202 | 
203 | print "------------------------------------------------------------"
204 | print "GMM with tweaked parameters:"
205 | print "  results for cross-validation set"
206 | print metrics.classification_report(y_crossval, y_pred,
207 |                                     target_names=['stars', 'QSOs'])
208 | 
209 | #----------------------------------------------------------------------
210 | # Test Data
211 | # once you have maximized the cross-validation, you can apply the estimator
212 | # to your test data, and check how it compares to the predicted results
213 | # from the researcher who compiled it.
214 | 
215 | X_test = np.zeros((test_data.size, 4), dtype=float)
216 | X_test[:, 0] = test_data['u-g']
217 | X_test[:, 1] = test_data['g-r']
218 | X_test[:, 2] = test_data['r-i']
219 | X_test[:, 3] = test_data['i-z']
220 | y_pred_literature = (test_data['label'] == 0).astype(int)
221 | Ntest = len(y_pred_literature)
222 | 
223 | # here you should compute y_pred for the test data, using the classifiers
224 | # clf_0 and clf_1 which you already trained above.
225 | 
226 | y_pred = None
227 | 
228 | #{{{ compute y_pred for test data
229 | logL = np.zeros((2, Ntest))
230 | logL[0] = clf_0.score(X_test) + np.log(prior0)
231 | logL[1] = clf_1.score(X_test) + np.log(prior1)
232 | y_pred = np.argmax(logL, 0)
233 | #}}}
234 | 
235 | print "------------------------------------------------------------"
236 | print "Comparison of current results with published results"
237 | print "  results for test set"
238 | print "    (treating published results as the 'true' result)"
239 | print metrics.classification_report(y_pred_literature, y_pred,
240 |                                     target_names=['stars', 'QSOs'])
241 | 


--------------------------------------------------------------------------------
/doc/solutions/exercise_03.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Astronomy Tutorial: exercise 3
  3 | 
  4 | Dimensionality reduction of stellar spectra
  5 | 
  6 | Usage: python exercise_03.py datadir [-m method] [-k n_neigbors]
  7 |                                      [-n norm_type] [-N n_samples]
  8 |                                      [-s]
  9 | 
 10 |   - datadir is $TUTORIAL_DIR/data/sdss_photoz
 11 |     This directory should contain the file sdss_photoz.npy
 12 | 
 13 |   - method is one of [pca | lle | mlle | isomap].  If not specified,
 14 |     PCA will be performed
 15 |     
 16 |   - n_neighbors is an integer number of neighbors to use with manifold methods
 17 | 
 18 |   - norm_type is one of [none | l1 | l2].  It specifies how the data should
 19 |     be normalized.
 20 | 
 21 |   - n_samples is the number of samples used for the projection.  Only 1000
 22 |     of the 4000 samples are used by default.
 23 | 
 24 |   - specifying -s shuffles the data.  This can help test for stability of
 25 |     the reconstruction.
 26 | 
 27 | Description:
 28 | In this tutorial, we explore manifold learning techniques to visualize 4000
 29 | SDSS spectral data.  This is a much more exploratory exercise than the previous
 30 | two.  The goal is to determine how to best visualize this high-dimensional
 31 | space.  You will implement PCA, LLE, Modified LLE, and Isomap, for various
 32 | data normalizations.  The goal is to find the best visualization of the
 33 | data, where "best" in this case is a qualitative measure of how well the
 34 | different classes of points are separated in the projected space.
 35 | 
 36 | To make experimentation more streamlined
 37 | 
 38 | There are several places in this file with code to be filled-in as part of
 39 | the exercise.  Each of these is labeled TODO below.
 40 | """
 41 | 
 42 | import os, sys
 43 | import numpy as np
 44 | 
 45 | import pylab as pl
 46 | from matplotlib import ticker
 47 | 
 48 | from sklearn import preprocessing
 49 | from sklearn.decomposition import RandomizedPCA
 50 | from sklearn.manifold import LocallyLinearEmbedding, Isomap
 51 | 
 52 | #----------------------------------------------------------------------
 53 | # set up command-line option parser
 54 | from optparse import OptionParser
 55 | parser = OptionParser(usage=__doc__,
 56 |                       version="%prog 1.0")
 57 | parser.add_option("-m", "--method",
 58 |                   dest="method",
 59 |                   default='pca',
 60 |                   help="Specify method to use: [pca | lle | mlle | isomap]")
 61 | 
 62 | parser.add_option("-k", "--neighbors",
 63 |                   dest="n_neighbors",
 64 |                   type="int",
 65 |                   default=15,
 66 |                   help='Specify number of neighbors for manifold learning')
 67 | 
 68 | parser.add_option("-N", "--normalization",
 69 |                   dest="normalization",
 70 |                   default="none",
 71 |                   help="Specify normalization: [none | l1 | l2]")
 72 | 
 73 | parser.add_option("-n", "--n_samples",
 74 |                   dest="n_samples",
 75 |                   type="int",
 76 |                   default=1000,
 77 |                   help="Specify number of samples to use, up to 4000 (default 1000)")
 78 | 
 79 | parser.add_option("-s", "--shuffle",
 80 |                   dest="shuffle",
 81 |                   action="store_true",
 82 |                   default=False,
 83 |                   help="shuffle the data")
 84 | 
 85 | 
 86 | options, args = parser.parse_args()
 87 | 
 88 | if len(args) == 0:
 89 |     parser.error("Must specify a data directory")
 90 | elif len(args) > 1:
 91 |     parser.error("Must specify a single data directory")
 92 | 
 93 | datadir = args[0]
 94 | 
 95 | print "data directory: %s" % datadir
 96 | print " method = %s" % options.method
 97 | print " n_neighbors = %i" % options.n_neighbors
 98 | print " normalization = %s" % options.normalization
 99 | print " n_samples: %i" % options.n_samples
100 | print " shuffle: %s" % options.shuffle
101 | 
102 | 
103 | def three_component_plot(c1, c2, c3, color, labels):
104 |     pl.figure(figsize=(8,8))
105 |     kwargs = dict(s=4, lw=0, c=color, vmin=2, vmax=6)
106 |     ax1 = pl.subplot(221)
107 |     pl.scatter(c1, c2, **kwargs)
108 |     pl.ylabel('component 2')
109 | 
110 |     ax2 = pl.subplot(223, sharex=ax1)
111 |     pl.scatter(c1, c3, **kwargs)
112 |     pl.xlabel('component 1')
113 |     pl.ylabel('component 3')
114 | 
115 |     ax3 = pl.subplot(224, sharey=ax2)
116 |     pl.scatter(c2, c3, **kwargs)
117 |     pl.xlabel('component 2')
118 | 
119 |     for ax in (ax1, ax2, ax3):
120 |         ax.xaxis.set_major_formatter(ticker.NullFormatter())
121 |         ax.yaxis.set_major_formatter(ticker.NullFormatter())
122 | 
123 |     pl.subplots_adjust(hspace=0.05, wspace=0.05)
124 | 
125 |     format = ticker.FuncFormatter(lambda i, *args: labels[i])
126 |     pl.colorbar(ticks = range(2, 7), format=format,
127 |                 cax = pl.axes((0.52, 0.51, 0.02, 0.39)))
128 |     pl.clim(1.5, 6.5)
129 | 
130 | 
131 | #----------------------------------------------------------------------
132 | # Load data files
133 | data = np.load(os.path.join(datadir, 'spec4000_corrected.npz'))
134 | 
135 | X = data['X']
136 | y = data['y']
137 | labels = data['labels']
138 | 
139 | if options.shuffle:
140 |     i = np.arange(y.shape[0], dtype=int)
141 |     np.random.shuffle(i)
142 |     X = X[i]
143 |     y = y[i]
144 | 
145 | #----------------------------------------------------------------------
146 | # truncate the data for experimentation
147 | #
148 | #  There are 4000 points, which can take a long time to run.  By default,
149 | #  it is truncated to 1000 samples.  This can be changed using the -n
150 | #  command-line argument.
151 | 
152 | X = X[:options.n_samples]
153 | y = y[:options.n_samples]
154 | 
155 | #----------------------------------------------------------------------
156 | # Normalization: 
157 | #  
158 | #  The results of the dimensionality reduction can depend heavily on the
159 | #  data normalization.  These can be commented or un-commented to try
160 | #  l1 or l2 normalization.
161 | 
162 | if options.normalization.lower() == 'none':
163 |     pass
164 | elif options.normalization.lower() == 'l2':
165 |     X = preprocessing.normalize(X, 'l2')
166 | elif options.normalization.lower() == 'l1':
167 |     X = preprocessing.normalize(X, 'l1')
168 | else:
169 |     raise ValueError("Unrecognized normalization: '%s'" % options.normalization)
170 | 
171 | #======================================================================
172 | # TODO: compute X_proj for each method.
173 | #   In each of the below cases, you should compute a projection of the
174 | #   data and store that projection in the matrix X_proj.
175 | #   X_proj should have the same number of rows as X, and should have
176 | #   at least 3 features.
177 | 
178 | X_proj = None
179 | 
180 | if options.method == 'pca':
181 |     print "Performing PCA"
182 |     #{{{ compute a RandomizedPCA projection of X with n_components >= 3
183 |     rpca = RandomizedPCA(n_components=3, random_state=0)
184 |     X_proj = rpca.fit_transform(X)
185 |     #}}}
186 | 
187 | elif options.method == 'lle':
188 |     print "Performing LLE"
189 |     #{{{ compute LLE on X with method='standard', and out_dim >= 3
190 |     lle = LocallyLinearEmbedding(n_neighbors=options.n_neighbors,
191 |                                  out_dim=3, method='standard')
192 |     X_proj = lle.fit_transform(X)
193 |     #}}}
194 | 
195 | 
196 | elif options.method == 'mlle':
197 |     print "Performing MLLE"
198 |     #{{{ compute LLE on X with method='modified' and out_dim >= 3
199 |     lle = LocallyLinearEmbedding(n_neighbors=options.n_neighbors,
200 |                                  out_dim=3, method='modified')
201 |     X_proj = lle.fit_transform(X)
202 |     #}}}
203 | 
204 | elif options.method == 'isomap':
205 |     print "Performing Isomap"
206 |     #{{{ compute Isomap on X with out_dim >= 3
207 |     iso = Isomap(n_neighbors=options.n_neighbors,
208 |                  out_dim=3)
209 |     X_proj = iso.fit_transform(X)
210 |     #}}}
211 | 
212 | else:
213 |     raise ValueError("Unrecognized method: '%s'" % options.method)
214 | 
215 | three_component_plot(X_proj[:, 0], X_proj[:, 1], X_proj[:, 2], y, labels)
216 | pl.show()
217 | 
218 | 


--------------------------------------------------------------------------------
/doc/solutions/generate_skeletons.py:
--------------------------------------------------------------------------------
 1 | """Generate skeletons from the example code"""
 2 | import os
 3 | 
 4 | exercise_dir = os.path.dirname(__file__)
 5 | if exercise_dir == '':
 6 |     exercise_dir = '.'
 7 | 
 8 | skeleton_dir = os.path.abspath(os.path.join(exercise_dir,'../skeletons/'))
 9 | if not os.path.exists(skeleton_dir):
10 |     os.makedirs(skeleton_dir)
11 | 
12 | L = os.listdir(exercise_dir)
13 | 
14 | for f in L:
15 |     if not f.endswith('.py'):
16 |         continue
17 |     
18 |     if f == os.path.basename(__file__):
19 |         continue
20 | 
21 |     print "parsing %s" % f
22 | 
23 |     input_file = open(os.path.join(exercise_dir, f))
24 |     output_file = open(os.path.join(skeleton_dir, f), 'w')
25 | 
26 |     in_exercise_region = False
27 |     
28 |     for line in input_file:
29 |         linestrip = line.strip()
30 |         if linestrip.startswith('#{{{'):
31 |             in_exercise_region = True
32 |             message = linestrip.lstrip('#{{{')
33 |             output_file.write(line.split('#')[0] + '# TODO: %s\n' % message)
34 |         elif in_exercise_region:
35 |             if '#}}}' in line:
36 |                 in_exercise_region = False
37 |         else:
38 |             output_file.write(line)
39 | 
40 |     output_file.close()
41 | 


--------------------------------------------------------------------------------
/doc/sphinxext/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | -------------------------------------------------------------------------------
 2 |     The files
 3 |     - numpydoc.py
 4 |     - autosummary.py
 5 |     - autosummary_generate.py
 6 |     - docscrape.py
 7 |     - docscrape_sphinx.py
 8 |     - phantom_import.py
 9 |     have the following license:
10 | 
11 | Copyright (C) 2008 Stefan van der Walt <stefan@mentat.za.net>, Pauli Virtanen <pav@iki.fi>
12 | 
13 | Redistribution and use in source and binary forms, with or without
14 | modification, are permitted provided that the following conditions are
15 | met:
16 | 
17 |  1. Redistributions of source code must retain the above copyright
18 |     notice, this list of conditions and the following disclaimer.
19 |  2. Redistributions in binary form must reproduce the above copyright
20 |     notice, this list of conditions and the following disclaimer in
21 |     the documentation and/or other materials provided with the
22 |     distribution.
23 | 
24 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
25 | IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
26 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
27 | DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
28 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
29 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
30 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
32 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
33 | IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 | POSSIBILITY OF SUCH DAMAGE.
35 | 
36 | -------------------------------------------------------------------------------
37 |     The files
38 |     - compiler_unparse.py
39 |     - comment_eater.py
40 |     - traitsdoc.py
41 |     have the following license:
42 | 
43 | This software is OSI Certified Open Source Software.
44 | OSI Certified is a certification mark of the Open Source Initiative.
45 | 
46 | Copyright (c) 2006, Enthought, Inc.
47 | All rights reserved.
48 | 
49 | Redistribution and use in source and binary forms, with or without
50 | modification, are permitted provided that the following conditions are met:
51 | 
52 |  * Redistributions of source code must retain the above copyright notice, this
53 |    list of conditions and the following disclaimer.
54 |  * Redistributions in binary form must reproduce the above copyright notice,
55 |    this list of conditions and the following disclaimer in the documentation
56 |    and/or other materials provided with the distribution.
57 |  * Neither the name of Enthought, Inc. nor the names of its contributors may
58 |    be used to endorse or promote products derived from this software without
59 |    specific prior written permission.
60 | 
61 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
62 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
63 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
64 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
65 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
66 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
67 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
68 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
69 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
70 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
71 | 
72 | 
73 | -------------------------------------------------------------------------------
74 |     The files
75 |     - only_directives.py
76 |     - plot_directive.py
77 |     originate from Matplotlib (http://matplotlib.sf.net/) which has
78 |     the following license:
79 | 
80 | Copyright (c) 2002-2008 John D. Hunter; All Rights Reserved.
81 | 
82 | 1. This LICENSE AGREEMENT is between John D. Hunter (“JDH”), and the Individual or Organization (“Licensee”) accessing and otherwise using matplotlib software in source or binary form and its associated documentation.
83 | 
84 | 2. Subject to the terms and conditions of this License Agreement, JDH hereby grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, analyze, test, perform and/or display publicly, prepare derivative works, distribute, and otherwise use matplotlib 0.98.3 alone or in any derivative version, provided, however, that JDH’s License Agreement and JDH’s notice of copyright, i.e., “Copyright (c) 2002-2008 John D. Hunter; All Rights Reserved” are retained in matplotlib 0.98.3 alone or in any derivative version prepared by Licensee.
85 | 
86 | 3. In the event Licensee prepares a derivative work that is based on or incorporates matplotlib 0.98.3 or any part thereof, and wants to make the derivative work available to others as provided herein, then Licensee hereby agrees to include in any such work a brief summary of the changes made to matplotlib 0.98.3.
87 | 
88 | 4. JDH is making matplotlib 0.98.3 available to Licensee on an “AS IS” basis. JDH MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, JDH MAKES NO AND DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF MATPLOTLIB 0.98.3 WILL NOT INFRINGE ANY THIRD PARTY RIGHTS.
89 | 
90 | 5. JDH SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF MATPLOTLIB 0.98.3 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING MATPLOTLIB 0.98.3, OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
91 | 
92 | 6. This License Agreement will automatically terminate upon a material breach of its terms and conditions.
93 | 
94 | 7. Nothing in this License Agreement shall be deemed to create any relationship of agency, partnership, or joint venture between JDH and Licensee. This License Agreement does not grant permission to use JDH trademarks or trade name in a trademark sense to endorse or promote products or services of Licensee, or any third party.
95 | 
96 | 8. By copying, installing or otherwise using matplotlib 0.98.3, Licensee agrees to be bound by the terms and conditions of this License Agreement.
97 | 
98 | 


--------------------------------------------------------------------------------
/doc/sphinxext/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include tests *.py
2 | include *.txt
3 | 


--------------------------------------------------------------------------------
/doc/sphinxext/README.txt:
--------------------------------------------------------------------------------
 1 | =====================================
 2 | numpydoc -- Numpy's Sphinx extensions
 3 | =====================================
 4 | 
 5 | Numpy's documentation uses several custom extensions to Sphinx.  These
 6 | are shipped in this ``numpydoc`` package, in case you want to make use
 7 | of them in third-party projects.
 8 | 
 9 | The following extensions are available:
10 | 
11 |   - ``numpydoc``: support for the Numpy docstring format in Sphinx, and add
12 |     the code description directives ``np-function``, ``np-cfunction``, etc.
13 |     that support the Numpy docstring syntax.
14 | 
15 |   - ``numpydoc.traitsdoc``: For gathering documentation about Traits attributes.
16 | 
17 |   - ``numpydoc.plot_directives``: Adaptation of Matplotlib's ``plot::``
18 |     directive. Note that this implementation may still undergo severe
19 |     changes or eventually be deprecated.
20 | 
21 |   - ``numpydoc.only_directives``: (DEPRECATED)
22 | 
23 |   - ``numpydoc.autosummary``: (DEPRECATED) An ``autosummary::`` directive.
24 |     Available in Sphinx 0.6.2 and (to-be) 1.0 as ``sphinx.ext.autosummary``,
25 |     and it the Sphinx 1.0 version is recommended over that included in
26 |     Numpydoc.
27 | 
28 | 
29 | numpydoc
30 | ========
31 | 
32 | Numpydoc inserts a hook into Sphinx's autodoc that converts docstrings
33 | following the Numpy/Scipy format to a form palatable to Sphinx.
34 | 
35 | Options
36 | -------
37 | 
38 | The following options can be set in conf.py:
39 | 
40 | - numpydoc_use_plots: bool
41 | 
42 |   Whether to produce ``plot::`` directives for Examples sections that
43 |   contain ``import matplotlib``.
44 | 
45 | - numpydoc_show_class_members: bool
46 | 
47 |   Whether to show all members of a class in the Methods and Attributes
48 |   sections automatically.
49 | 
50 | - numpydoc_edit_link: bool  (DEPRECATED -- edit your HTML template instead)
51 | 
52 |   Whether to insert an edit link after docstrings.
53 | 


--------------------------------------------------------------------------------
/doc/sphinxext/numpy_ext/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astroML/sklearn_tutorial/0909361122d5b96379007516b55c6248afa86cfc/doc/sphinxext/numpy_ext/__init__.py


--------------------------------------------------------------------------------
/doc/sphinxext/numpy_ext/docscrape_sphinx.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import inspect
  3 | import textwrap
  4 | import pydoc
  5 | import sphinx
  6 | from docscrape import NumpyDocString
  7 | from docscrape import FunctionDoc
  8 | from docscrape import ClassDoc
  9 | 
 10 | 
 11 | class SphinxDocString(NumpyDocString):
 12 |     def __init__(self, docstring, config=None):
 13 |         config = {} if config is None else config
 14 |         self.use_plots = config.get('use_plots', False)
 15 |         NumpyDocString.__init__(self, docstring, config=config)
 16 | 
 17 |     # string conversion routines
 18 |     def _str_header(self, name, symbol='`'):
 19 |         return ['.. rubric:: ' + name, '']
 20 | 
 21 |     def _str_field_list(self, name):
 22 |         return [':' + name + ':']
 23 | 
 24 |     def _str_indent(self, doc, indent=4):
 25 |         out = []
 26 |         for line in doc:
 27 |             out += [' ' * indent + line]
 28 |         return out
 29 | 
 30 |     def _str_signature(self):
 31 |         return ['']
 32 |         if self['Signature']:
 33 |             return ['``%s``' % self['Signature']] + ['']
 34 |         else:
 35 |             return ['']
 36 | 
 37 |     def _str_summary(self):
 38 |         return self['Summary'] + ['']
 39 | 
 40 |     def _str_extended_summary(self):
 41 |         return self['Extended Summary'] + ['']
 42 | 
 43 |     def _str_param_list(self, name):
 44 |         out = []
 45 |         if self[name]:
 46 |             out += self._str_field_list(name)
 47 |             out += ['']
 48 |             for param, param_type, desc in self[name]:
 49 |                 out += self._str_indent(['**%s** : %s' % (param.strip(),
 50 |                                                           param_type)])
 51 |                 out += ['']
 52 |                 out += self._str_indent(desc, 8)
 53 |                 out += ['']
 54 |         return out
 55 | 
 56 |     @property
 57 |     def _obj(self):
 58 |         if hasattr(self, '_cls'):
 59 |             return self._cls
 60 |         elif hasattr(self, '_f'):
 61 |             return self._f
 62 |         return None
 63 | 
 64 |     def _str_member_list(self, name):
 65 |         """
 66 |         Generate a member listing, autosummary:: table where possible,
 67 |         and a table where not.
 68 | 
 69 |         """
 70 |         out = []
 71 |         if self[name]:
 72 |             out += ['.. rubric:: %s' % name, '']
 73 |             prefix = getattr(self, '_name', '')
 74 | 
 75 |             if prefix:
 76 |                 prefix = '~%s.' % prefix
 77 | 
 78 |             autosum = []
 79 |             others = []
 80 |             for param, param_type, desc in self[name]:
 81 |                 param = param.strip()
 82 |                 if not self._obj or hasattr(self._obj, param):
 83 |                     autosum += ["   %s%s" % (prefix, param)]
 84 |                 else:
 85 |                     others.append((param, param_type, desc))
 86 | 
 87 |             if autosum:
 88 |                 # GAEL: Toctree commented out below because it creates
 89 |                 # hundreds of sphinx warnings
 90 |                 # out += ['.. autosummary::', '   :toctree:', '']
 91 |                 out += ['.. autosummary::', '']
 92 |                 out += autosum
 93 | 
 94 |             if others:
 95 |                 maxlen_0 = max([len(x[0]) for x in others])
 96 |                 maxlen_1 = max([len(x[1]) for x in others])
 97 |                 hdr = "=" * maxlen_0 + "  " + "=" * maxlen_1 + "  " + "=" * 10
 98 |                 fmt = '%%%ds  %%%ds  ' % (maxlen_0, maxlen_1)
 99 |                 n_indent = maxlen_0 + maxlen_1 + 4
100 |                 out += [hdr]
101 |                 for param, param_type, desc in others:
102 |                     out += [fmt % (param.strip(), param_type)]
103 |                     out += self._str_indent(desc, n_indent)
104 |                 out += [hdr]
105 |             out += ['']
106 |         return out
107 | 
108 |     def _str_section(self, name):
109 |         out = []
110 |         if self[name]:
111 |             out += self._str_header(name)
112 |             out += ['']
113 |             content = textwrap.dedent("\n".join(self[name])).split("\n")
114 |             out += content
115 |             out += ['']
116 |         return out
117 | 
118 |     def _str_see_also(self, func_role):
119 |         out = []
120 |         if self['See Also']:
121 |             see_also = super(SphinxDocString, self)._str_see_also(func_role)
122 |             out = ['.. seealso::', '']
123 |             out += self._str_indent(see_also[2:])
124 |         return out
125 | 
126 |     def _str_warnings(self):
127 |         out = []
128 |         if self['Warnings']:
129 |             out = ['.. warning::', '']
130 |             out += self._str_indent(self['Warnings'])
131 |         return out
132 | 
133 |     def _str_index(self):
134 |         idx = self['index']
135 |         out = []
136 |         if len(idx) == 0:
137 |             return out
138 | 
139 |         out += ['.. index:: %s' % idx.get('default', '')]
140 |         for section, references in idx.iteritems():
141 |             if section == 'default':
142 |                 continue
143 |             elif section == 'refguide':
144 |                 out += ['   single: %s' % (', '.join(references))]
145 |             else:
146 |                 out += ['   %s: %s' % (section, ','.join(references))]
147 |         return out
148 | 
149 |     def _str_references(self):
150 |         out = []
151 |         if self['References']:
152 |             out += self._str_header('References')
153 |             if isinstance(self['References'], str):
154 |                 self['References'] = [self['References']]
155 |             out.extend(self['References'])
156 |             out += ['']
157 |             # Latex collects all references to a separate bibliography,
158 |             # so we need to insert links to it
159 |             if sphinx.__version__ >= "0.6":
160 |                 out += ['.. only:: latex', '']
161 |             else:
162 |                 out += ['.. latexonly::', '']
163 |             items = []
164 |             for line in self['References']:
165 |                 m = re.match(r'.. \[([a-z0-9._-]+)\]', line, re.I)
166 |                 if m:
167 |                     items.append(m.group(1))
168 |             out += ['   ' + ", ".join(["[%s]_" % item for item in items]), '']
169 |         return out
170 | 
171 |     def _str_examples(self):
172 |         examples_str = "\n".join(self['Examples'])
173 | 
174 |         if (self.use_plots and 'import matplotlib' in examples_str
175 |                 and 'plot::' not in examples_str):
176 |             out = []
177 |             out += self._str_header('Examples')
178 |             out += ['.. plot::', '']
179 |             out += self._str_indent(self['Examples'])
180 |             out += ['']
181 |             return out
182 |         else:
183 |             return self._str_section('Examples')
184 | 
185 |     def __str__(self, indent=0, func_role="obj"):
186 |         out = []
187 |         out += self._str_signature()
188 |         out += self._str_index() + ['']
189 |         out += self._str_summary()
190 |         out += self._str_extended_summary()
191 |         for param_list in ('Parameters', 'Returns', 'Raises'):
192 |             out += self._str_param_list(param_list)
193 |         out += self._str_warnings()
194 |         out += self._str_see_also(func_role)
195 |         out += self._str_section('Notes')
196 |         out += self._str_references()
197 |         out += self._str_examples()
198 |         for param_list in ('Attributes', 'Methods'):
199 |             out += self._str_member_list(param_list)
200 |         out = self._str_indent(out, indent)
201 |         return '\n'.join(out)
202 | 
203 | 
204 | class SphinxFunctionDoc(SphinxDocString, FunctionDoc):
205 |     def __init__(self, obj, doc=None, config={}):
206 |         self.use_plots = config.get('use_plots', False)
207 |         FunctionDoc.__init__(self, obj, doc=doc, config=config)
208 | 
209 | 
210 | class SphinxClassDoc(SphinxDocString, ClassDoc):
211 |     def __init__(self, obj, doc=None, func_doc=None, config={}):
212 |         self.use_plots = config.get('use_plots', False)
213 |         ClassDoc.__init__(self, obj, doc=doc, func_doc=None, config=config)
214 | 
215 | 
216 | class SphinxObjDoc(SphinxDocString):
217 |     def __init__(self, obj, doc=None, config=None):
218 |         self._f = obj
219 |         SphinxDocString.__init__(self, doc, config=config)
220 | 
221 | 
222 | def get_doc_object(obj, what=None, doc=None, config={}):
223 |     if what is None:
224 |         if inspect.isclass(obj):
225 |             what = 'class'
226 |         elif inspect.ismodule(obj):
227 |             what = 'module'
228 |         elif callable(obj):
229 |             what = 'function'
230 |         else:
231 |             what = 'object'
232 |     if what == 'class':
233 |         return SphinxClassDoc(obj, func_doc=SphinxFunctionDoc, doc=doc,
234 |                               config=config)
235 |     elif what in ('function', 'method'):
236 |         return SphinxFunctionDoc(obj, doc=doc, config=config)
237 |     else:
238 |         if doc is None:
239 |             doc = pydoc.getdoc(obj)
240 |         return SphinxObjDoc(obj, doc, config=config)
241 | 


--------------------------------------------------------------------------------
/doc/sphinxext/numpy_ext/numpydoc.py:
--------------------------------------------------------------------------------
  1 | """
  2 | ========
  3 | numpydoc
  4 | ========
  5 | 
  6 | Sphinx extension that handles docstrings in the Numpy standard format. [1]
  7 | 
  8 | It will:
  9 | 
 10 | - Convert Parameters etc. sections to field lists.
 11 | - Convert See Also section to a See also entry.
 12 | - Renumber references.
 13 | - Extract the signature from the docstring, if it can't be determined
 14 |   otherwise.
 15 | 
 16 | .. [1] http://projects.scipy.org/numpy/wiki/CodingStyleGuidelines#docstring-standard
 17 | 
 18 | """
 19 | 
 20 | import os
 21 | import re
 22 | import pydoc
 23 | from docscrape_sphinx import get_doc_object
 24 | from docscrape_sphinx import SphinxDocString
 25 | from sphinx.util.compat import Directive
 26 | import inspect
 27 | 
 28 | 
 29 | def mangle_docstrings(app, what, name, obj, options, lines,
 30 |                       reference_offset=[0]):
 31 | 
 32 |     cfg = dict(use_plots=app.config.numpydoc_use_plots,
 33 |                show_class_members=app.config.numpydoc_show_class_members)
 34 | 
 35 |     if what == 'module':
 36 |         # Strip top title
 37 |         title_re = re.compile(ur'^\s*[#*=]{4,}\n[a-z0-9 -]+\n[#*=]{4,}\s*',
 38 |                               re.I | re.S)
 39 |         lines[:] = title_re.sub(u'', u"\n".join(lines)).split(u"\n")
 40 |     else:
 41 |         doc = get_doc_object(obj, what, u"\n".join(lines), config=cfg)
 42 |         lines[:] = unicode(doc).split(u"\n")
 43 | 
 44 |     if app.config.numpydoc_edit_link and hasattr(obj, '__name__') and \
 45 |            obj.__name__:
 46 |         if hasattr(obj, '__module__'):
 47 |             v = dict(full_name=u"%s.%s" % (obj.__module__, obj.__name__))
 48 |         else:
 49 |             v = dict(full_name=obj.__name__)
 50 |         lines += [u'', u'.. htmlonly::', '']
 51 |         lines += [u'    %s' % x for x in
 52 |                   (app.config.numpydoc_edit_link % v).split("\n")]
 53 | 
 54 |     # replace reference numbers so that there are no duplicates
 55 |     references = []
 56 |     for line in lines:
 57 |         line = line.strip()
 58 |         m = re.match(ur'^.. \[([a-z0-9_.-])\]', line, re.I)
 59 |         if m:
 60 |             references.append(m.group(1))
 61 | 
 62 |     # start renaming from the longest string, to avoid overwriting parts
 63 |     references.sort(key=lambda x: -len(x))
 64 |     if references:
 65 |         for i, line in enumerate(lines):
 66 |             for r in references:
 67 |                 if re.match(ur'^\d+$', r):
 68 |                     new_r = u"R%d" % (reference_offset[0] + int(r))
 69 |                 else:
 70 |                     new_r = u"%s%d" % (r, reference_offset[0])
 71 |                 lines[i] = lines[i].replace(u'[%s]_' % r,
 72 |                                             u'[%s]_' % new_r)
 73 |                 lines[i] = lines[i].replace(u'.. [%s]' % r,
 74 |                                             u'.. [%s]' % new_r)
 75 | 
 76 |     reference_offset[0] += len(references)
 77 | 
 78 | 
 79 | def mangle_signature(app, what, name, obj,
 80 |                      options, sig, retann):
 81 |     # Do not try to inspect classes that don't define `__init__`
 82 |     if (inspect.isclass(obj) and
 83 |         (not hasattr(obj, '__init__') or
 84 |         'initializes x; see ' in pydoc.getdoc(obj.__init__))):
 85 |         return '', ''
 86 | 
 87 |     if not (callable(obj) or hasattr(obj, '__argspec_is_invalid_')):
 88 |         return
 89 |     if not hasattr(obj, '__doc__'):
 90 |         return
 91 | 
 92 |     doc = SphinxDocString(pydoc.getdoc(obj))
 93 |     if doc['Signature']:
 94 |         sig = re.sub(u"^[^(]*", u"", doc['Signature'])
 95 |         return sig, u''
 96 | 
 97 | 
 98 | def setup(app, get_doc_object_=get_doc_object):
 99 |     global get_doc_object
100 |     get_doc_object = get_doc_object_
101 | 
102 |     app.connect('autodoc-process-docstring', mangle_docstrings)
103 |     app.connect('autodoc-process-signature', mangle_signature)
104 |     app.add_config_value('numpydoc_edit_link', None, False)
105 |     app.add_config_value('numpydoc_use_plots', None, False)
106 |     app.add_config_value('numpydoc_show_class_members', True, True)
107 | 
108 |     # Extra mangling domains
109 |     app.add_domain(NumpyPythonDomain)
110 |     app.add_domain(NumpyCDomain)
111 | 
112 | #-----------------------------------------------------------------------------
113 | # Docstring-mangling domains
114 | #-----------------------------------------------------------------------------
115 | 
116 | from docutils.statemachine import ViewList
117 | from sphinx.domains.c import CDomain
118 | from sphinx.domains.python import PythonDomain
119 | 
120 | 
121 | class ManglingDomainBase(object):
122 |     directive_mangling_map = {}
123 | 
124 |     def __init__(self, *a, **kw):
125 |         super(ManglingDomainBase, self).__init__(*a, **kw)
126 |         self.wrap_mangling_directives()
127 | 
128 |     def wrap_mangling_directives(self):
129 |         for name, objtype in self.directive_mangling_map.items():
130 |             self.directives[name] = wrap_mangling_directive(
131 |                 self.directives[name], objtype)
132 | 
133 | 
134 | class NumpyPythonDomain(ManglingDomainBase, PythonDomain):
135 |     name = 'np'
136 |     directive_mangling_map = {
137 |         'function': 'function',
138 |         'class': 'class',
139 |         'exception': 'class',
140 |         'method': 'function',
141 |         'classmethod': 'function',
142 |         'staticmethod': 'function',
143 |         'attribute': 'attribute',
144 |     }
145 | 
146 | 
147 | class NumpyCDomain(ManglingDomainBase, CDomain):
148 |     name = 'np-c'
149 |     directive_mangling_map = {
150 |         'function': 'function',
151 |         'member': 'attribute',
152 |         'macro': 'function',
153 |         'type': 'class',
154 |         'var': 'object',
155 |     }
156 | 
157 | 
158 | def wrap_mangling_directive(base_directive, objtype):
159 |     class directive(base_directive):
160 |         def run(self):
161 |             env = self.state.document.settings.env
162 | 
163 |             name = None
164 |             if self.arguments:
165 |                 m = re.match(r'^(.*\s+)?(.*?)(\(.*)?', self.arguments[0])
166 |                 name = m.group(2).strip()
167 | 
168 |             if not name:
169 |                 name = self.arguments[0]
170 | 
171 |             lines = list(self.content)
172 |             mangle_docstrings(env.app, objtype, name, None, None, lines)
173 |             self.content = ViewList(lines, self.content.parent)
174 | 
175 |             return base_directive.run(self)
176 | 
177 |     return directive
178 | 


--------------------------------------------------------------------------------
/doc/sphinxext/numpy_ext_old/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astroML/sklearn_tutorial/0909361122d5b96379007516b55c6248afa86cfc/doc/sphinxext/numpy_ext_old/__init__.py


--------------------------------------------------------------------------------
/doc/sphinxext/numpy_ext_old/docscrape_sphinx.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import inspect
  3 | import textwrap
  4 | import pydoc
  5 | 
  6 | from docscrape import NumpyDocString
  7 | from docscrape FunctionDoc
  8 | from docscrape ClassDoc
  9 | 
 10 | 
 11 | class SphinxDocString(NumpyDocString):
 12 |     # string conversion routines
 13 |     def _str_header(self, name, symbol='`'):
 14 |         return ['.. rubric:: ' + name, '']
 15 | 
 16 |     def _str_field_list(self, name):
 17 |         return [':' + name + ':']
 18 | 
 19 |     def _str_indent(self, doc, indent=4):
 20 |         out = []
 21 |         for line in doc:
 22 |             out += [' ' * indent + line]
 23 |         return out
 24 | 
 25 |     def _str_signature(self):
 26 |         return ['']
 27 |         if self['Signature']:
 28 |             return ['``%s``' % self['Signature']] + ['']
 29 |         else:
 30 |             return ['']
 31 | 
 32 |     def _str_summary(self):
 33 |         return self['Summary'] + ['']
 34 | 
 35 |     def _str_extended_summary(self):
 36 |         return self['Extended Summary'] + ['']
 37 | 
 38 |     def _str_param_list(self, name):
 39 |         out = []
 40 |         if self[name]:
 41 |             out += self._str_field_list(name)
 42 |             out += ['']
 43 |             for param, param_type, desc in self[name]:
 44 |                 out += self._str_indent(['**%s** : %s' % (param.strip(),
 45 |                                                           param_type)])
 46 |                 out += ['']
 47 |                 out += self._str_indent(desc, 8)
 48 |                 out += ['']
 49 |         return out
 50 | 
 51 |     def _str_section(self, name):
 52 |         out = []
 53 |         if self[name]:
 54 |             out += self._str_header(name)
 55 |             out += ['']
 56 |             content = textwrap.dedent("\n".join(self[name])).split("\n")
 57 |             out += content
 58 |             out += ['']
 59 |         return out
 60 | 
 61 |     def _str_see_also(self, func_role):
 62 |         out = []
 63 |         if self['See Also']:
 64 |             see_also = super(SphinxDocString, self)._str_see_also(func_role)
 65 |             out = ['.. seealso::', '']
 66 |             out += self._str_indent(see_also[2:])
 67 |         return out
 68 | 
 69 |     def _str_warnings(self):
 70 |         out = []
 71 |         if self['Warnings']:
 72 |             out = ['.. warning::', '']
 73 |             out += self._str_indent(self['Warnings'])
 74 |         return out
 75 | 
 76 |     def _str_index(self):
 77 |         idx = self['index']
 78 |         out = []
 79 |         if len(idx) == 0:
 80 |             return out
 81 | 
 82 |         out += ['.. index:: %s' % idx.get('default', '')]
 83 |         for section, references in idx.iteritems():
 84 |             if section == 'default':
 85 |                 continue
 86 |             elif section == 'refguide':
 87 |                 out += ['   single: %s' % (', '.join(references))]
 88 |             else:
 89 |                 out += ['   %s: %s' % (section, ','.join(references))]
 90 |         return out
 91 | 
 92 |     def _str_references(self):
 93 |         out = []
 94 |         if self['References']:
 95 |             out += self._str_header('References')
 96 |             if isinstance(self['References'], str):
 97 |                 self['References'] = [self['References']]
 98 |             out.extend(self['References'])
 99 |             out += ['']
100 |         return out
101 | 
102 |     def __str__(self, indent=0, func_role="obj"):
103 |         out = []
104 |         out += self._str_signature()
105 |         out += self._str_index() + ['']
106 |         out += self._str_summary()
107 |         out += self._str_extended_summary()
108 |         for param_list in ('Parameters', 'Attributes', 'Methods',
109 |                            'Returns', 'Raises'):
110 |             out += self._str_param_list(param_list)
111 |         out += self._str_warnings()
112 |         out += self._str_see_also(func_role)
113 |         out += self._str_section('Notes')
114 |         out += self._str_references()
115 |         out += self._str_section('Examples')
116 |         out = self._str_indent(out, indent)
117 |         return '\n'.join(out)
118 | 
119 | 
120 | class SphinxFunctionDoc(SphinxDocString, FunctionDoc):
121 |     pass
122 | 
123 | 
124 | class SphinxClassDoc(SphinxDocString, ClassDoc):
125 |     pass
126 | 
127 | 
128 | def get_doc_object(obj, what=None):
129 |     if what is None:
130 |         if inspect.isclass(obj):
131 |             what = 'class'
132 |         elif inspect.ismodule(obj):
133 |             what = 'module'
134 |         elif callable(obj):
135 |             what = 'function'
136 |         else:
137 |             what = 'object'
138 |     if what == 'class':
139 |         return SphinxClassDoc(obj, '', func_doc=SphinxFunctionDoc)
140 |     elif what in ('function', 'method'):
141 |         return SphinxFunctionDoc(obj, '')
142 |     else:
143 |         return SphinxDocString(pydoc.getdoc(obj))
144 | 


--------------------------------------------------------------------------------
/doc/sphinxext/numpy_ext_old/numpydoc.py:
--------------------------------------------------------------------------------
  1 | """
  2 | ========
  3 | numpydoc
  4 | ========
  5 | 
  6 | Sphinx extension that handles docstrings in the Numpy standard format. [1]
  7 | 
  8 | It will:
  9 | 
 10 | - Convert Parameters etc. sections to field lists.
 11 | - Convert See Also section to a See also entry.
 12 | - Renumber references.
 13 | - Extract the signature from the docstring, if it can't be determined otherwise.
 14 | 
 15 | .. [1] http://projects.scipy.org/scipy/numpy/wiki/CodingStyleGuidelines#docstring-standard
 16 | 
 17 | """
 18 | 
 19 | import os
 20 | import re
 21 | import pydoc
 22 | import inspect
 23 | 
 24 | from docscrape_sphinx import get_doc_object
 25 | from docscrape_sphinx import SphinxDocString
 26 | 
 27 | 
 28 | def mangle_docstrings(app, what, name, obj, options, lines,
 29 |                       reference_offset=[0]):
 30 |     if what == 'module':
 31 |         # Strip top title
 32 |         title_re = re.compile(r'^\s*[#*=]{4,}\n[a-z0-9 -]+\n[#*=]{4,}\s*',
 33 |                               re.I | re.S)
 34 |         lines[:] = title_re.sub('', "\n".join(lines)).split("\n")
 35 |     else:
 36 |         doc = get_doc_object(obj, what)
 37 |         lines[:] = str(doc).split("\n")
 38 | 
 39 |     if app.config.numpydoc_edit_link and hasattr(obj, '__name__') and \
 40 |            obj.__name__:
 41 |         v = dict(full_name=obj.__name__)
 42 |         lines += [''] + (app.config.numpydoc_edit_link % v).split("\n")
 43 | 
 44 |     # replace reference numbers so that there are no duplicates
 45 |     references = []
 46 |     for l in lines:
 47 |         l = l.strip()
 48 |         if l.startswith('.. ['):
 49 |             try:
 50 |                 references.append(int(l[len('.. ['):l.index(']')]))
 51 |             except ValueError:
 52 |                 print "WARNING: invalid reference in %s docstring" % name
 53 | 
 54 |     # Start renaming from the biggest number, otherwise we may
 55 |     # overwrite references.
 56 |     references.sort()
 57 |     if references:
 58 |         for i, line in enumerate(lines):
 59 |             for r in references:
 60 |                 new_r = reference_offset[0] + r
 61 |                 lines[i] = lines[i].replace('[%d]_' % r,
 62 |                                             '[%d]_' % new_r)
 63 |                 lines[i] = lines[i].replace('.. [%d]' % r,
 64 |                                             '.. [%d]' % new_r)
 65 | 
 66 |     reference_offset[0] += len(references)
 67 | 
 68 | 
 69 | def mangle_signature(app, what, name, obj, options, sig, retann):
 70 |     # Do not try to inspect classes that don't define `__init__`
 71 |     if (inspect.isclass(obj) and
 72 |         'initializes x; see ' in pydoc.getdoc(obj.__init__)):
 73 |         return '', ''
 74 | 
 75 |     if not (callable(obj) or hasattr(obj, '__argspec_is_invalid_')):
 76 |         return
 77 |     if not hasattr(obj, '__doc__'):
 78 |         return
 79 | 
 80 |     doc = SphinxDocString(pydoc.getdoc(obj))
 81 |     if doc['Signature']:
 82 |         sig = re.sub("^[^(]*", "", doc['Signature'])
 83 |         return sig, ''
 84 | 
 85 | 
 86 | def initialize(app):
 87 |     try:
 88 |         app.connect('autodoc-process-signature', mangle_signature)
 89 |     except:
 90 |         monkeypatch_sphinx_ext_autodoc()
 91 | 
 92 | 
 93 | def setup(app, get_doc_object_=get_doc_object):
 94 |     global get_doc_object
 95 |     get_doc_object = get_doc_object_
 96 | 
 97 |     app.connect('autodoc-process-docstring', mangle_docstrings)
 98 |     app.connect('builder-inited', initialize)
 99 |     app.add_config_value('numpydoc_edit_link', None, True)
100 | 
101 | #------------------------------------------------------------------------------
102 | # Monkeypatch sphinx.ext.autodoc to accept argspecless autodocs (Sphinx < 0.5)
103 | #------------------------------------------------------------------------------
104 | 
105 | 
106 | def monkeypatch_sphinx_ext_autodoc():
107 |     global _original_format_signature
108 |     import sphinx.ext.autodoc
109 | 
110 |     if sphinx.ext.autodoc.format_signature is our_format_signature:
111 |         return
112 | 
113 |     print "[numpydoc] Monkeypatching sphinx.ext.autodoc ..."
114 |     _original_format_signature = sphinx.ext.autodoc.format_signature
115 |     sphinx.ext.autodoc.format_signature = our_format_signature
116 | 
117 | 
118 | def our_format_signature(what, obj):
119 |     r = mangle_signature(None, what, None, obj, None, None, None)
120 |     if r is not None:
121 |         return r[0]
122 |     else:
123 |         return _original_format_signature(what, obj)
124 | 


--------------------------------------------------------------------------------
/doc/templates/class.rst:
--------------------------------------------------------------------------------
 1 | {{ fullname }}
 2 | {{ underline }}
 3 | 
 4 | .. currentmodule:: {{ module }}
 5 | 
 6 | .. autoclass:: {{ objname }}
 7 | 
 8 |    {% block methods %}
 9 |    .. automethod:: __init__
10 |    {% endblock %}
11 | 
12 | 
13 | 


--------------------------------------------------------------------------------
/doc/templates/function.rst:
--------------------------------------------------------------------------------
1 | {{ fullname }}
2 | {{ underline }}
3 | 
4 | .. currentmodule:: {{ module }}
5 | 
6 | .. autofunction:: {{ objname }}
7 | 
8 | 
9 | 


--------------------------------------------------------------------------------
/doc/themes/scikit-learn/layout.html:
--------------------------------------------------------------------------------
  1 | {#
  2 |     scikit-learn/layout.html
  3 |     ~~~~~~~~~~~~~~~~~
  4 | 
  5 |     Layout for scikit-learn, after a design made by Angel Soler
  6 |     (http://webylimonada.org)
  7 | 
  8 |     Update: Collapsable sidebar added - 13/03/2012 - Jaques Grobler
  9 |     Update: Next-page button added - 16/03/2012 - Jaques Grobler
 10 | 
 11 | 
 12 |     :copyright: Fabian Pedregosa
 13 |     :license: BSD
 14 | #}
 15 | {% extends "basic/layout.html" %}
 16 | 
 17 | {% if theme_collapsiblesidebar|tobool %}
 18 | {% set script_files = script_files + ['_static/sidebar.js'] %}
 19 | {% endif %}
 20 | 
 21 | {% block extrahead %}
 22 | 
 23 | <!-- Following code is for Google Analytics -->
 24 | <script type="text/javascript">
 25 | 
 26 |   var _gaq = _gaq || [];
 27 |   _gaq.push(['_setAccount', 'UA-35748160-1']);
 28 |   _gaq.push(['_trackPageview']);
 29 | 
 30 |   (function() {
 31 |     var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
 32 |     ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
 33 |     var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
 34 |   })();
 35 | 
 36 | </script>
 37 | {% endblock %}
 38 | 
 39 | {%- if pagename == 'index' %}
 40 |     {% set title = 'Machine Learning for Astronomy with Scikit-learn' %}
 41 | {%- endif %}
 42 | 
 43 | {% block header %}
 44 |    {%- if theme_oldversion == true %}
 45 |      <div class="warning-wrapper">
 46 |        <p>Warning: This documentation is
 47 |          for {{project}} <strong>version {{ release|e }}</strong>.
 48 |          &mdash; <a href="http://nisl.github.com">
 49 |            Latest stable version</a></p></p>
 50 |      </div>
 51 |    {%- endif %}
 52 | 
 53 |     <div class="header-wrapper">
 54 |       <div class="header">
 55 |         {%- if logo %}
 56 |           <p class="logo"><a href="{{ pathto(master_doc) }}">
 57 |             <img src="{{ pathto('_static/' + logo, 1) }}" alt="Logo"/>
 58 |           </a>
 59 |           </p>
 60 |         {%- endif %}
 61 | 
 62 |           {%- block navbar -%}
 63 |           <div class="navbar">
 64 |           <ul>
 65 |           
 66 | 	    <li><a href="{{pathto('general_concepts')}}">Machine learning 101</a></li>
 67 |             <li><a href="{{pathto('classification')}}">Classification</a></li>
 68 |             <li><a href="{{pathto('regression')}}">Regression</a></li>
 69 |             <li><a href="{{pathto('exercises')}}">Exercises</a></li>
 70 |             <li><a href="{{pathto('auto_examples/index')}}">Examples</a></li>
 71 |           
 72 | 	  
 73 |        </ul>
 74 | 
 75 | <!--
 76 | <div class="search_form">
 77 | 
 78 | <div id="cse" style="width: 100%;"></div>
 79 | <script src="http://www.google.com/jsapi" type="text/javascript"></script>
 80 | <script type="text/javascript">
 81 |   google.load('search', '1', {language : 'en'});
 82 |   google.setOnLoadCallback(function() {
 83 |     var customSearchControl = new google.search.CustomSearchControl('016639176250731907682:tjtqbvtvij0');
 84 |     customSearchControl.setResultSetSize(google.search.Search.FILTERED_CSE_RESULTSET);
 85 |     var options = new google.search.DrawOptions();
 86 |     options.setAutoComplete(true);
 87 |     customSearchControl.draw('cse', options);
 88 |   }, true);
 89 | </script>
 90 | 
 91 | </div>
 92 | -->
 93 |           </div> <!-- end navbar -->
 94 |           {%- endblock -%}
 95 | 
 96 | 
 97 |        </div>
 98 |     </div>
 99 | {% endblock %}
100 | 
101 | {% block content %}
102 |     <div class="content-wrapper">
103 | 
104 |       <div class="sphinxsidebar">
105 | 	<div class="sphinxsidebarwrapper">
106 | 	  {%- if pagename != 'index' %}
107 | 	  {%- if parents %}
108 | 	  <div class="rel">
109 | 	   {% else %}
110 | 	   <div class="rel rellarge">
111 | 	     {% endif %}
112 | 	<!-- XXX: when we have a 'module index' that appears in the link
113 | 	     bar, we will need to use the following ugly hack to avoid it
114 | 	     rellinks[1:]|reverse
115 | 	    -->
116 | 	{%- for rellink in rellinks|reverse %}
117 | 	<div class="rellink">
118 | 	<a href="{{ pathto(rellink[0]) }}" title="{{ rellink[1]|striptags|e }}"
119 | 	    {{ accesskey(rellink[2]) }}>{{ rellink[3]|capitalize }}
120 | 	    <br>
121 | 	    <span class="smallrellink">
122 | 	    {{ rellink[1]|striptags|truncate(16, killwords=True) }}
123 | 	    </span>
124 | 	    {%- if rellink[1]|striptags %}
125 | 	    <span class="hiddenrellink">
126 | 	    {{ rellink[1]|striptags}}
127 | 	    </span>
128 | 	    {% endif %}
129 | 	    </a>
130 | 	</div>
131 | 	{%- if not loop.last %}
132 | 	    <div class="spacer">
133 | 	    &nbsp;
134 | 	    </div>
135 | 	{% endif %}
136 | 	{%- endfor %}
137 | 	<!-- Ad a link to the 'up' page -->
138 | 	{%- if parents %}
139 | 	<div class="spacer">
140 | 	&nbsp;
141 | 	</div>
142 | 	<div class="rellink">
143 | 	<a href="{{ parents[-1].link|e }}" title="{{ parents[-1].title|striptags }}" >
144 | 	Up
145 | 	<br>
146 | 	<span class="smallrellink">
147 | 	{{ parents[-1].title|striptags|truncate(16, killwords=True) }}
148 | 	</span>
149 | 	{%- if parents[-1].title|striptags != '<no title>' %}
150 | 	<span class="hiddenrellink">
151 | 	{{ parents[-1].title|striptags }}
152 | 	</span>
153 | 	{% endif %}
154 | 	</a>
155 | 	</div>
156 |     {%- endif %}
157 |     </div>
158 |     {%- endif %}
159 | 
160 |     {%- if (pagename != 'index') %}
161 |     <h3>{{ _('This page') }}</h3>
162 | 	{{ toc }}
163 |     {% else %}
164 |     <h3>News</h3>
165 |     <p>scikit-learn 0.12 was released September 2012.  Find out more at
166 |       <a href="http://scikit-learn.org">http://scikit-learn.org</a>.</p>
167 |     <p>astroML 0.1 was released October 2012.  Find out more at
168 |       <a href="http://astroML.github.com">http://astroML.github.com</a>.</p>
169 | 
170 |     <h3>Video Links</h3>
171 |     <p><a href="http://pyvideo.org/video/972/tutorial-scikit-learn-machine-learning-python">PyData 2012</a>: 75-minute version of this tutorial</p>
172 |     <p><a href="http://www.youtube.com/watch?v=33L_EXLtJPE&feature=plcp">Scipy 2012</a>: a 3-hour version of this tutorial</p>
173 |     <p><a href="http://vimeo.com/53062607">PyData NYC 2012</a>: 45-minute
174 |       version of this tutorial</p>
175 | 
176 |     <h3>Licensing</h3>
177 |     <p>All material Open source: <strong>BSD license</strong> (3 clause).</p>
178 | 
179 |     <h3>About</h3>
180 |     <p><a href="{{pathto('AUTHORS')}}">Authors</a></p>
181 | 
182 |     {% endif %}
183 |     <h3>Giving credit</h3>
184 |     <p>Please consider <a href="{{pathto('AUTHORS')}}#citing">citing the 
185 |     scikit-learn</a> if you use it.</p>
186 | 
187 |     </div>
188 | 	  </div>
189 | 
190 | 
191 |       <div class="content">
192 |           {%- block document %}
193 |             {{ super() }}
194 |           {%- endblock %}
195 |         <div class="clearer"></div>
196 |       </div>
197 |     </div>
198 |   
199 | {% endblock %}
200 | 
201 | 
202 | {% block relbar1 %}{% endblock %}
203 | {% block relbar2 %}{% endblock %}
204 | 
205 | 
206 | {%- block footer %}
207 |     <div class="footer">
208 |     {%- if show_copyright %}
209 |       {%- if hasdoc('copyright') %}
210 |         {% trans path=pathto('copyright'), copyright=copyright|e %}&copy; {{ copyright }}.{% endtrans %}
211 |       {%- else %}
212 |         {% trans copyright=copyright|e %}&copy; {{ copyright }}.{% endtrans %}
213 |       {%- endif %}
214 |     {%- endif %}
215 |     {%- if last_updated %}
216 |       {% trans last_updated=last_updated|e %}Last updated on {{ last_updated }}.{% endtrans %}
217 |     {%- endif %}
218 |     {%- if show_sphinx %}
219 |       {% trans sphinx_version=sphinx_version|e %}Created using <a href="http://sphinx.pocoo.org/">Sphinx</a> {{ sphinx_version }}{% endtrans %}. Design by <a href="http://webylimonada.com">Web y Limonada</a>.
220 |     {%- endif %}
221 |     {%- if show_source and has_source and sourcename %}
222 |     <span style="padding-left: 5ex;">
223 |     <a href="{{ pathto('_sources/' + sourcename, true)|e }}"
224 | 	    rel="nofollow">{{ _('Show this page source') }}</a>
225 |     </span>
226 |     {%- endif %}
227 |     </div>
228 | 
229 |     {%- if pagename != 'index' %}
230 |     {%- if parents %}
231 |      <div class="rel">
232 |     {% else %}
233 |      <div class="rel rellarge">
234 |     {% endif %}
235 | 	<!-- XXX: when we have a 'module index' that appears in the link
236 | 	     bar, we will need to use the following ugly hack to avoid it
237 | 	rellinks[1:]|reverse
238 | 	-->
239 |     {%- for rellink in rellinks|reverse %}
240 |     <div class="{{ loop.cycle('buttonPrevious', 'buttonNext') }}">
241 |       <a href="{{ pathto(rellink[0]) }}">
242 |         {{loop.cycle('Previous', 'Next')}}
243 |       </a>  
244 |     </div>
245 |     {%- endfor %}
246 |     {% endif %}
247 |      </div>
248 |      <script type="text/javascript">
249 |        $("div.buttonNext, div.buttonPrevious").hover(
250 |            function () {
251 |                $(this).css('background-color', '#FF9C34');
252 |            },
253 |            function () {
254 |                $(this).css('background-color', '#A7D6E2');
255 |            }
256 |        );
257 |      </script>
258 | {%- endblock %}
259 | 
260 | 
261 | 


--------------------------------------------------------------------------------
/doc/themes/scikit-learn/static/sidebar.js:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * sidebar.js
  3 |  * ~~~~~~~~~~
  4 |  *
  5 |  * This script makes the Sphinx sidebar collapsible.
  6 |  *
  7 |  * .sphinxsidebar contains .sphinxsidebarwrapper.  This script adds
  8 |  * in .sphixsidebar, after .sphinxsidebarwrapper, the #sidebarbutton
  9 |  * used to collapse and expand the sidebar.
 10 |  *
 11 |  * When the sidebar is collapsed the .sphinxsidebarwrapper is hidden
 12 |  * and the width of the sidebar and the margin-left of the document
 13 |  * are decreased. When the sidebar is expanded the opposite happens.
 14 |  * This script saves a per-browser/per-session cookie used to
 15 |  * remember the position of the sidebar among the pages.
 16 |  * Once the browser is closed the cookie is deleted and the position
 17 |  * reset to the default (expanded).
 18 |  *
 19 |  * :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS.
 20 |  * :license: BSD, see LICENSE for details.
 21 |  *
 22 |  */
 23 | 
 24 | $(function() {
 25 |   // global elements used by the functions.
 26 |   // the 'sidebarbutton' element is defined as global after its
 27 |   // creation, in the add_sidebar_button function
 28 |   var bodywrapper = $('.bodywrapper');
 29 |   var sidebar = $('.sphinxsidebar');
 30 |   var sidebarwrapper = $('.sphinxsidebarwrapper');
 31 | 
 32 |   // for some reason, the document has no sidebar; do not run into errors
 33 |   if (!sidebar.length) return;
 34 | 
 35 |   // original margin-left of the bodywrapper and width of the sidebar
 36 |   // with the sidebar expanded
 37 |   var bw_margin_expanded = bodywrapper.css('margin-left');
 38 |   var ssb_width_expanded = sidebar.width();
 39 | 
 40 |   // margin-left of the bodywrapper and width of the sidebar
 41 |   // with the sidebar collapsed
 42 |   var bw_margin_collapsed = '-190px';
 43 |   var ssb_width_collapsed = '1.0em';
 44 | 
 45 |   // colors used by the current theme
 46 |   var dark_color = $('.related').css('background-color');
 47 |   var light_color = $('.footer').css('color');
 48 | 
 49 |   function sidebar_is_collapsed() {
 50 |     return sidebarwrapper.is(':not(:visible)');
 51 |   }
 52 | 
 53 |   function toggle_sidebar() {
 54 |     if (sidebar_is_collapsed())
 55 |       expand_sidebar();
 56 |     else
 57 |       collapse_sidebar();
 58 |   }
 59 | 
 60 |   function collapse_sidebar() {
 61 |     sidebarwrapper.hide();
 62 |     sidebar.css('width', ssb_width_collapsed);
 63 |     bodywrapper.css('margin-left', bw_margin_collapsed);
 64 |     sidebarbutton.css({
 65 |         'margin-left': '0',
 66 |         'height': bodywrapper.height()
 67 |     });
 68 |     sidebarbutton.find('span').text('»');
 69 |     sidebarbutton.attr('title', _('Expand sidebar'));
 70 |     document.cookie = 'sidebar=collapsed';
 71 |   }
 72 | 
 73 |   function expand_sidebar() {
 74 |     bodywrapper.css('margin-left', bw_margin_expanded);
 75 |     sidebar.css('width', ssb_width_expanded);
 76 |     sidebarwrapper.show();
 77 |     sidebarbutton.css({
 78 |         'margin-left': ssb_width_expanded-13,
 79 |         'height': bodywrapper.height()
 80 |     });
 81 |     sidebarbutton.find('span').text('«');
 82 |     sidebarbutton.attr('title', _('Collapse sidebar'));
 83 |     document.cookie = 'sidebar=expanded';
 84 |   }
 85 | 
 86 |   function add_sidebar_button() {
 87 |     sidebarwrapper.css({
 88 |         'float': 'left' ,
 89 |         'margin-right': '0',
 90 |         'width': ssb_width_expanded - 13
 91 |     });
 92 |     // create the button
 93 |     sidebar.append(
 94 |         '<div id="sidebarbutton"><span>&laquo;</span></div>'
 95 |     );
 96 |     var sidebarbutton = $('#sidebarbutton');
 97 |     light_color = sidebarbutton.css('background-color');
 98 |     // find the height of the viewport to center the '<<' in the page
 99 |     var viewport_height;
100 |     if (window.innerHeight)
101 |  	  viewport_height = window.innerHeight;
102 |     else
103 | 	  viewport_height = $(window).height();
104 |     sidebarbutton.find('span').css({
105 |         'display': 'block',
106 |         'margin-top': (viewport_height - sidebar.position().top + 60) / 2
107 |     });
108 | 
109 |     sidebarbutton.click(toggle_sidebar);
110 |     sidebarbutton.attr('title', _('Collapse sidebar'));
111 |     sidebarbutton.css({
112 |         'border-left': '1px solid ' + dark_color,
113 | 	'border-top-left-radius' : '15px',
114 |         'font-size': '1.2em',
115 |         'cursor': 'pointer',
116 |         'height': bodywrapper.height(),
117 |         'padding-top': '1px',
118 |         'margin-left': ssb_width_expanded - 12
119 |     });
120 | 
121 |     sidebarbutton.hover(
122 |       function () {
123 |           $(this).css('background-color', '#D0D0D0');
124 |       },
125 |       function () {
126 |           $(this).css('background-color', '#F0F0F0');
127 |       }
128 |     );
129 |   }
130 | 
131 |   function set_position_from_cookie() {
132 |     if (!document.cookie)
133 |       return;
134 |     var items = document.cookie.split(';');
135 |     for(var k=0; k<items.length; k++) {
136 |       var key_val = items[k].split('=');
137 |       var key = key_val[0];
138 |       if (key == 'sidebar') {
139 |         var value = key_val[1];
140 |         if ((value == 'collapsed') && (!sidebar_is_collapsed()))
141 |           collapse_sidebar();
142 |         else if ((value == 'expanded') && (sidebar_is_collapsed()))
143 |           expand_sidebar();
144 |       }
145 |     }
146 |   }
147 | 
148 |   add_sidebar_button();
149 |   var sidebarbutton = $('#sidebarbutton');
150 |   set_position_from_cookie();
151 | });
152 | 


--------------------------------------------------------------------------------
/doc/themes/scikit-learn/theme.conf:
--------------------------------------------------------------------------------
1 | [theme]
2 | inherit = basic
3 | stylesheet = nature.css
4 | pygments_style = tango
5 | 
6 | [options]
7 | oldversion = False
8 | collapsiblesidebar = True


--------------------------------------------------------------------------------
/examples/README.txt:
--------------------------------------------------------------------------------
1 | Applications to experiment with scikit-learn tools.
2 | 


--------------------------------------------------------------------------------
/examples/plot_ML_flow_chart.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Tutorial Diagrams
  3 | -----------------
  4 | 
  5 | This script plots the flow-charts used in the scikit-learn tutorials.
  6 | """
  7 | 
  8 | import numpy as np
  9 | import pylab as pl
 10 | from matplotlib.patches import Circle, Rectangle, Polygon, Arrow, FancyArrow
 11 | 
 12 | def create_base(box_bg = '#CCCCCC',
 13 |                 arrow1 = '#88CCFF',
 14 |                 arrow2 = '#88FF88',
 15 |                 supervised=True):
 16 |     fig = pl.figure(figsize=(9, 6), facecolor='w')
 17 |     ax = pl.axes((0, 0, 1, 1),
 18 |                  xticks=[], yticks=[], frameon=False)
 19 |     ax.set_xlim(0, 9)
 20 |     ax.set_ylim(0, 6)
 21 | 
 22 |     patches = [Rectangle((0.3, 3.6), 1.5, 1.8, zorder=1, fc=box_bg),
 23 |                Rectangle((0.5, 3.8), 1.5, 1.8, zorder=2, fc=box_bg),
 24 |                Rectangle((0.7, 4.0), 1.5, 1.8, zorder=3, fc=box_bg),
 25 |                
 26 |                Rectangle((2.9, 3.6), 0.2, 1.8, fc=box_bg),
 27 |                Rectangle((3.1, 3.8), 0.2, 1.8, fc=box_bg),
 28 |                Rectangle((3.3, 4.0), 0.2, 1.8, fc=box_bg),
 29 |                
 30 |                Rectangle((0.3, 0.2), 1.5, 1.8, fc=box_bg),
 31 |                
 32 |                Rectangle((2.9, 0.2), 0.2, 1.8, fc=box_bg),
 33 |                
 34 |                Circle((5.5, 3.5), 1.0, fc=box_bg),
 35 |                
 36 |                Polygon([[5.5, 1.7],
 37 |                         [6.1, 1.1],
 38 |                         [5.5, 0.5],
 39 |                         [4.9, 1.1]], fc=box_bg),
 40 |                
 41 |                FancyArrow(2.3, 4.6, 0.35, 0, fc=arrow1,
 42 |                           width=0.25, head_width=0.5, head_length=0.2),
 43 |                
 44 |                FancyArrow(3.75, 4.2, 0.5, -0.2, fc=arrow1,
 45 |                           width=0.25, head_width=0.5, head_length=0.2),
 46 |                
 47 |                FancyArrow(5.5, 2.4, 0, -0.4, fc=arrow1,
 48 |                           width=0.25, head_width=0.5, head_length=0.2),
 49 |                
 50 |                FancyArrow(2.0, 1.1, 0.5, 0, fc=arrow2,
 51 |                           width=0.25, head_width=0.5, head_length=0.2),
 52 |                
 53 |                FancyArrow(3.3, 1.1, 1.3, 0, fc=arrow2,
 54 |                           width=0.25, head_width=0.5, head_length=0.2),
 55 |                
 56 |                FancyArrow(6.2, 1.1, 0.8, 0, fc=arrow2,
 57 |                           width=0.25, head_width=0.5, head_length=0.2)]
 58 | 
 59 |     if supervised:
 60 |         patches += [Rectangle((0.3, 2.4), 1.5, 0.5, zorder=1, fc=box_bg),
 61 |                     Rectangle((0.5, 2.6), 1.5, 0.5, zorder=2, fc=box_bg),
 62 |                     Rectangle((0.7, 2.8), 1.5, 0.5, zorder=3, fc=box_bg),
 63 |                     FancyArrow(2.3, 2.9, 2.0, 0, fc=arrow1,
 64 |                                width=0.25, head_width=0.5, head_length=0.2),
 65 |                     Rectangle((7.3, 0.85), 1.5, 0.5, fc=box_bg)]
 66 |     else:
 67 |         patches += [Rectangle((7.3, 0.2), 1.5, 1.8, fc=box_bg)]
 68 |     
 69 |     for p in patches:
 70 |         ax.add_patch(p)
 71 |         
 72 |     pl.text(1.45, 4.9, "Training\nText,\nDocuments,\nImages,\netc.",
 73 |             ha='center', va='center', fontsize=14)
 74 |     
 75 |     pl.text(3.6, 4.9, "Feature\nVectors", 
 76 |             ha='left', va='center', fontsize=14)
 77 |     
 78 |     pl.text(5.5, 3.5, "Machine\nLearning\nAlgorithm",
 79 |             ha='center', va='center', fontsize=14)
 80 |     
 81 |     pl.text(1.05, 1.1, "New Text,\nDocument,\nImage,\netc.",
 82 |             ha='center', va='center', fontsize=14)
 83 |     
 84 |     pl.text(3.3, 1.7, "Feature\nVector", 
 85 |             ha='left', va='center', fontsize=14)
 86 |     
 87 |     pl.text(5.5, 1.1, "Predictive\nModel", 
 88 |             ha='center', va='center', fontsize=12)
 89 | 
 90 |     if supervised:
 91 |         pl.text(1.45, 3.05, "Labels",
 92 |                 ha='center', va='center', fontsize=14)
 93 |     
 94 |         pl.text(8.05, 1.1, "Expected\nLabel",
 95 |                 ha='center', va='center', fontsize=14)
 96 |         pl.text(8.8, 5.8, "Supervised Learning Model",
 97 |                 ha='right', va='top', fontsize=18)
 98 | 
 99 |     else:
100 |         pl.text(8.05, 1.1,
101 |                 "Likelihood\nor Cluster ID\nor Better\nRepresentation",
102 |                 ha='center', va='center', fontsize=12)
103 |         pl.text(8.8, 5.8, "Unsupervised Learning Model",
104 |                 ha='right', va='top', fontsize=18)
105 |         
106 |         
107 | 
108 | def plot_supervised(annotate=False):
109 |     create_base(supervised=True)
110 |     if annotate:
111 |         fontdict = dict(color='r', weight='bold', size=14)
112 |         pl.text(1.9, 4.55, 'X = vec.fit_transform(input)',
113 |                 fontdict=fontdict,
114 |                 rotation=20, ha='left', va='bottom')
115 |         pl.text(3.7, 3.2, 'clf.fit(X, y)',
116 |                 fontdict=fontdict,
117 |                 rotation=20, ha='left', va='bottom')
118 |         pl.text(1.7, 1.5, 'X_new = vec.transform(input)',
119 |                 fontdict=fontdict,
120 |                 rotation=20, ha='left', va='bottom')
121 |         pl.text(6.1, 1.5, 'y_new = clf.predict(X_new)',
122 |                 fontdict=fontdict,
123 |                 rotation=20, ha='left', va='bottom')
124 | 
125 | def plot_unsupervised():
126 |     create_base(supervised=False)
127 | 
128 | 
129 | plot_supervised(False)
130 | plot_supervised(True)
131 | plot_unsupervised()
132 | pl.show()
133 | 
134 | 
135 | 


--------------------------------------------------------------------------------
/examples/plot_bias_variance_examples.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Bias and Variance
  3 | -----------------
  4 | 
  5 | This script plots some simple examples of how model complexity affects
  6 | bias and variance.
  7 | """
  8 | 
  9 | import numpy as np
 10 | import pylab as pl
 11 | from matplotlib import ticker
 12 | from matplotlib.patches import FancyArrow
 13 | 
 14 | # Suppress warnings from Polyfit (ill-conditioned fit)
 15 | import warnings
 16 | warnings.filterwarnings('ignore', message='Polyfit*')
 17 | 
 18 | np.random.seed(42)
 19 | 
 20 | def test_func(x, err=0.5):
 21 |     return np.random.normal(10 - 1. / (x + 0.1), err)
 22 | 
 23 | 
 24 | def compute_error(x, y, p):
 25 |     yfit = np.polyval(p, x)
 26 |     return np.sqrt(np.mean((y - yfit) ** 2))
 27 | 
 28 | 
 29 | #------------------------------------------------------------
 30 | # Plot linear regression example
 31 | np.random.seed(42)
 32 | x = np.random.random(20)
 33 | y = np.sin(2 * x)
 34 | p = np.polyfit(x, y, 1)  # fit a 1st-degree polynomial to the data
 35 | 
 36 | xfit = np.linspace(-0.2, 1.2, 10)
 37 | yfit = np.polyval(p, xfit)
 38 | 
 39 | pl.scatter(x, y, c='k')
 40 | pl.plot(xfit, yfit)
 41 | pl.xlabel('x')
 42 | pl.ylabel('y')
 43 | pl.title('Linear Regression Example')
 44 | 
 45 | #------------------------------------------------------------
 46 | # Plot example of over-fitting and under-fitting
 47 | 
 48 | N = 8
 49 | np.random.seed(42)
 50 | x = 10 ** np.linspace(-2, 0, N)
 51 | y = test_func(x)
 52 | 
 53 | xfit = np.linspace(-0.2, 1.2, 1000)
 54 | 
 55 | titles = ['d = 1 (under-fit)', 'd = 2', 'd = 6 (over-fit)']
 56 | degrees = [1, 2, 6]
 57 | 
 58 | pl.figure(figsize = (9, 3.5))
 59 | for i, d in enumerate(degrees):
 60 |     pl.subplot(131 + i, xticks=[], yticks=[])
 61 |     pl.scatter(x, y, marker='x', c='k', s=50)
 62 | 
 63 |     p = np.polyfit(x, y, d)
 64 |     yfit = np.polyval(p, xfit)
 65 |     pl.plot(xfit, yfit, '-b')
 66 |     
 67 |     pl.xlim(-0.2, 1.2)
 68 |     pl.ylim(0, 12)
 69 |     pl.xlabel('house size')
 70 |     if i == 0:
 71 |         pl.ylabel('price')
 72 | 
 73 |     pl.title(titles[i])
 74 | 
 75 | pl.subplots_adjust(left = 0.06, right=0.98,
 76 |                    bottom=0.15, top=0.85,
 77 |                    wspace=0.05)
 78 | 
 79 | #------------------------------------------------------------
 80 | # Plot training error and cross-val error
 81 | #   as a function of polynomial degree
 82 | 
 83 | Ntrain = 100
 84 | Ncrossval = 100
 85 | error = 1.0
 86 | 
 87 | np.random.seed(0)
 88 | x = np.random.random(Ntrain + Ncrossval)
 89 | y = test_func(x, error)
 90 | 
 91 | xtrain = x[:Ntrain]
 92 | ytrain = y[:Ntrain]
 93 | 
 94 | xcrossval = x[Ntrain:]
 95 | ycrossval = y[Ntrain:]
 96 | 
 97 | degrees = np.arange(1, 21)
 98 | train_err = np.zeros(len(degrees))
 99 | crossval_err = np.zeros(len(degrees))
100 | 
101 | for i, d in enumerate(degrees):
102 |     p = np.polyfit(xtrain, ytrain, d)
103 | 
104 |     train_err[i] = compute_error(xtrain, ytrain, p)
105 |     crossval_err[i] = compute_error(xcrossval, ycrossval, p)
106 | 
107 | pl.figure()
108 | pl.title('Error for 100 Training Points')
109 | pl.plot(degrees, crossval_err, lw=2, label = 'cross-validation error')
110 | pl.plot(degrees, train_err, lw=2, label = 'training error')
111 | pl.plot([0, 20], [error, error], '--k', label='intrinsic error')
112 | pl.legend()
113 | pl.xlabel('degree of fit')
114 | pl.ylabel('rms error')
115 | 
116 | pl.gca().add_patch(FancyArrow(5, 1.35, -3, 0, width = 0.01,
117 |                               head_width=0.04, head_length=1.0,
118 |                               length_includes_head=True))
119 | pl.text(5.3, 1.35, "High Bias", fontsize=18, va='center')
120 | 
121 | pl.gca().add_patch(FancyArrow(19, 1.22, 0, -0.1, width = 0.25,
122 |                               head_width=1.0, head_length=0.05,
123 |                               length_includes_head=True))
124 | pl.text(19.8, 1.23, "High Variance", ha='right', fontsize=18)
125 | 
126 | #------------------------------------------------------------
127 | # Plot training error and cross-val error
128 | #   as a function of training set size
129 | 
130 | Ntrain = 100
131 | Ncrossval = 100
132 | error = 1.0
133 | 
134 | np.random.seed(0)
135 | x = np.random.random(Ntrain + Ncrossval)
136 | y = test_func(x, error)
137 | 
138 | xtrain = x[:Ntrain]
139 | ytrain = y[:Ntrain]
140 | 
141 | xcrossval = x[Ntrain:]
142 | ycrossval = y[Ntrain:]
143 | 
144 | sizes = np.linspace(2, Ntrain, 50).astype(int)
145 | train_err = np.zeros(sizes.shape)
146 | crossval_err = np.zeros(sizes.shape)
147 | 
148 | pl.figure(figsize=(10, 5))
149 | 
150 | for j,d in enumerate((1, 20)):
151 |     for i, size in enumerate(sizes):
152 |         p = np.polyfit(xtrain[:size], ytrain[:size], d)
153 |         crossval_err[i] = compute_error(xcrossval, ycrossval, p)
154 |         train_err[i] = compute_error(xtrain[:size], ytrain[:size], p)
155 | 
156 |     ax = pl.subplot(121 + j)
157 |     pl.plot(sizes, crossval_err, lw=2, label='cross-val error')
158 |     pl.plot(sizes, train_err, lw=2, label='training error')
159 |     pl.plot([0, Ntrain], [error, error], '--k', label='intrinsic error')
160 | 
161 |     pl.xlabel('traning set size')
162 |     if j == 0:
163 |         pl.ylabel('rms error')
164 |     else:
165 |         ax.yaxis.set_major_formatter(ticker.NullFormatter())
166 |     
167 |     pl.legend(loc = 4)
168 |     
169 |     pl.ylim(0.0, 2.5)
170 |     pl.xlim(0, 99)
171 | 
172 |     pl.text(98, 2.45, 'd = %i' % d, ha='right', va='top', fontsize='large')
173 | 
174 | pl.subplots_adjust(wspace = 0.02, left=0.07, right=0.95,
175 |                    bottom=0.1, top=0.9)
176 | pl.show()
177 | 


--------------------------------------------------------------------------------
/examples/plot_gui_example.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Linear Model Example
  3 | --------------------
  4 | 
  5 | This is an example plot from the tutorial which accompanies an explanation
  6 | of the support vector machine GUI.
  7 | """
  8 | 
  9 | import numpy as np
 10 | import pylab as pl
 11 | import matplotlib
 12 | 
 13 | from sklearn import svm
 14 | 
 15 | 
 16 | def linear_model(rseed=42, Npts=30):
 17 |     np.random.seed(rseed)
 18 | 
 19 | 
 20 |     data = np.random.normal(0, 10, (Npts, 2))
 21 |     data[:Npts / 2] -= 15
 22 |     data[Npts / 2:] += 15
 23 | 
 24 |     labels = np.ones(Npts)
 25 |     labels[:Npts / 2] = -1
 26 | 
 27 |     return data, labels
 28 | 
 29 | 
 30 | def nonlinear_model(rseed=42, Npts=30):
 31 |     radius = 40 * np.random.random(Npts)
 32 |     far_pts = radius > 20
 33 |     radius[far_pts] *= 1.2
 34 |     radius[~far_pts] *= 1.1
 35 | 
 36 |     theta = np.random.random(Npts) * np.pi * 2
 37 | 
 38 |     data = np.empty((Npts, 2))
 39 |     data[:, 0] = radius * np.cos(theta)
 40 |     data[:, 1] = radius * np.sin(theta)
 41 | 
 42 |     labels = np.ones(Npts)
 43 |     labels[far_pts] = -1
 44 | 
 45 |     return data, labels
 46 | 
 47 | #------------------------------------------------------------
 48 | # Linear model
 49 | X, y = linear_model()
 50 | clf = svm.SVC(kernel='linear',
 51 |               gamma=0.01, coef0=0, degree=3)
 52 | clf.fit(X, y)
 53 | 
 54 | fig = pl.figure()
 55 | ax = pl.subplot(111, xticks=[], yticks=[])
 56 | ax.scatter(X[:, 0], X[:, 1], c=y, cmap=pl.cm.bone)
 57 | 
 58 | ax.scatter(clf.support_vectors_[:, 0],
 59 |            clf.support_vectors_[:, 1],
 60 |            s=80, edgecolors="k", facecolors="none")
 61 | 
 62 | delta = 1
 63 | y_min, y_max = -50, 50
 64 | x_min, x_max = -50, 50
 65 | x = np.arange(x_min, x_max + delta, delta)
 66 | y = np.arange(y_min, y_max + delta, delta)
 67 | X1, X2 = np.meshgrid(x, y)
 68 | Z = clf.decision_function(np.c_[X1.ravel(), X2.ravel()])
 69 | Z = Z.reshape(X1.shape)
 70 | 
 71 | levels = [-1.0, 0.0, 1.0]
 72 | linestyles = ['dashed', 'solid', 'dashed']
 73 | colors = 'k'
 74 | ax.contour(X1, X2, Z, levels,
 75 |            colors=colors,
 76 |            linestyles=linestyles)
 77 | 
 78 | 
 79 | #------------------------------------------------------------
 80 | # RBF model
 81 | X, y = nonlinear_model()
 82 | clf = svm.SVC(kernel='rbf',
 83 |               gamma=0.001, coef0=0, degree=3)
 84 | clf.fit(X, y)
 85 | 
 86 | fig = pl.figure()
 87 | ax = pl.subplot(111, xticks=[], yticks=[])
 88 | ax.scatter(X[:, 0], X[:, 1], c=y, cmap=pl.cm.bone, zorder=2)
 89 | 
 90 | ax.scatter(clf.support_vectors_[:, 0],
 91 |            clf.support_vectors_[:, 1],
 92 |            s=80, edgecolors="k", facecolors="none")
 93 | 
 94 | delta = 1
 95 | y_min, y_max = -50, 50
 96 | x_min, x_max = -50, 50
 97 | x = np.arange(x_min, x_max + delta, delta)
 98 | y = np.arange(y_min, y_max + delta, delta)
 99 | X1, X2 = np.meshgrid(x, y)
100 | Z = clf.decision_function(np.c_[X1.ravel(), X2.ravel()])
101 | Z = Z.reshape(X1.shape)
102 | 
103 | levels = [-1.0, 0.0, 1.0]
104 | linestyles = ['dashed', 'solid', 'dashed']
105 | colors = 'k'
106 | 
107 | ax.contourf(X1, X2, Z, 10,
108 |             cmap=matplotlib.cm.bone,
109 |             origin='lower',
110 |             alpha=0.85, zorder=1)
111 | ax.contour(X1, X2, Z, [0.0],
112 |            colors='k',
113 |            linestyles=['solid'], zorder=1)
114 | 
115 | pl.show()
116 |     
117 | 


--------------------------------------------------------------------------------
/examples/plot_iris_projections.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Iris Projections
 3 | ----------------
 4 | 
 5 | This code generates the Iris projection example plots found in the tutorial
 6 | """
 7 | 
 8 | from itertools import cycle
 9 | import pylab as pl
10 | 
11 | from sklearn.datasets import load_iris
12 | from sklearn.decomposition import PCA
13 | 
14 | 
15 | def plot_2D(data, target, target_names):
16 |     colors = cycle('rgbcmykw')
17 |     target_ids = range(len(target_names))
18 |     pl.figure()
19 |     for i, c, label in zip(target_ids, colors, target_names):
20 |         pl.plot(data[target == i, 0],
21 |                 data[target == i, 1], 'o',
22 |                 c=c, label=label)
23 |     pl.legend(target_names)
24 | 
25 | #----------------------------------------------------------------------
26 | # Load iris data
27 | iris = load_iris()
28 | X, y = iris.data, iris.target
29 | 
30 | 
31 | #----------------------------------------------------------------------
32 | # First figure: PCA
33 | pca = PCA(n_components=2, whiten=True).fit(X)
34 | X_pca = pca.transform(X)
35 | plot_2D(X_pca, iris.target, iris.target_names)
36 | 
37 | 
38 | #----------------------------------------------------------------------
39 | # Second figure: Kmeans labels
40 | from sklearn.cluster import KMeans
41 | from numpy.random import RandomState
42 | rng = RandomState(42)
43 | kmeans = KMeans(3, random_state=rng).fit(X_pca)
44 | plot_2D(X_pca, kmeans.labels_, ["c0", "c1", "c2"])
45 | 
46 | 
47 | pl.show()
48 | 


--------------------------------------------------------------------------------
/examples/plot_python_101.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Basic numerics and plotting with Python
 3 | ========================================
 4 | 
 5 | """
 6 | 
 7 | # import numpy: the module providing numerical arrays
 8 | import numpy as np
 9 | t = np.linspace(1, 10, 2000)
10 | 
11 | # import pylab: the module for scientific plotting
12 | import pylab as pl
13 | pl.plot(t, np.cos(t))
14 | 


--------------------------------------------------------------------------------
/examples/plot_sdss_filters.py:
--------------------------------------------------------------------------------
  1 | """
  2 | SDSS Filters
  3 | ------------
  4 | 
  5 | This example downloads and plots the filters from the Sloan Digital Sky
  6 | Survey, along with a reference spectrum.
  7 | """
  8 | import os
  9 | import urllib2
 10 | 
 11 | import numpy as np
 12 | import pylab as pl
 13 | from matplotlib.patches import Arrow
 14 | 
 15 | REFSPEC_URL = 'ftp://ftp.stsci.edu/cdbs/current_calspec/1732526_nic_002.ascii'
 16 | URL = 'http://www.sdss.org/dr7/instruments/imager/filters/%s.dat'
 17 | 
 18 | def fetch_filter(filt):
 19 |     assert filt in 'ugriz'
 20 |     url = URL % filt
 21 |     
 22 |     if not os.path.exists('downloads'):
 23 |         os.makedirs('downloads')
 24 | 
 25 |     loc = os.path.join('downloads', '%s.dat' % filt)
 26 |     if not os.path.exists(loc):
 27 |         print "downloading from %s" % url
 28 |         F = urllib2.urlopen(url)
 29 |         open(loc, 'w').write(F.read())
 30 | 
 31 |     F = open(loc)
 32 |         
 33 |     data = np.loadtxt(F)
 34 |     return data
 35 | 
 36 | def fetch_vega_spectrum():
 37 |     if not os.path.exists('downloads'):
 38 |         os.makedirs('downloads')
 39 | 
 40 |     refspec_file = os.path.join('downloads', REFSPEC_URL.split('/')[-1])
 41 | 
 42 |     if  not os.path.exists(refspec_file):
 43 |         print "downloading from %s" % REFSPEC_URL
 44 |         F = urllib2.urlopen(REFSPEC_URL)
 45 |         open(refspec_file, 'w').write(F.read())
 46 | 
 47 |     F = open(refspec_file)
 48 | 
 49 |     data = np.loadtxt(F)
 50 |     return data
 51 | 
 52 | 
 53 | Xref = fetch_vega_spectrum()
 54 | Xref[:, 1] /= 2.1 * Xref[:, 1].max()
 55 | 
 56 | #----------------------------------------------------------------------
 57 | # Plot filters in color with a single spectrum
 58 | pl.figure()
 59 | pl.plot(Xref[:, 0], Xref[:, 1], '-k', lw=2)
 60 | 
 61 | for f,c in zip('ugriz', 'bgrmk'):
 62 |     X = fetch_filter(f)
 63 |     pl.fill(X[:, 0], X[:, 1], ec=c, fc=c, alpha=0.4)
 64 | 
 65 | kwargs = dict(fontsize=20, ha='center', va='center', alpha=0.5)
 66 | pl.text(3500, 0.02, 'u', color='b', **kwargs)
 67 | pl.text(4600, 0.02, 'g', color='g', **kwargs)
 68 | pl.text(6100, 0.02, 'r', color='r', **kwargs)
 69 | pl.text(7500, 0.02, 'i', color='m', **kwargs)
 70 | pl.text(8800, 0.02, 'z', color='k', **kwargs)
 71 | 
 72 | pl.xlim(3000, 11000)
 73 | 
 74 | pl.title('SDSS Filters and Reference Spectrum')
 75 | pl.xlabel('Wavelength (Angstroms)')
 76 | pl.ylabel('normalized flux / filter transmission')
 77 | 
 78 | #----------------------------------------------------------------------
 79 | # Plot filters in gray with several redshifted spectra
 80 | pl.figure()
 81 | 
 82 | redshifts = [0.0, 0.4, 0.8]
 83 | colors = 'bgr'
 84 | 
 85 | for z, c in zip(redshifts, colors):
 86 |     pl.plot((1. + z) * Xref[:, 0], Xref[:, 1], color=c)
 87 | 
 88 | pl.gca().add_patch(Arrow(4200, 0.47, 1300, 0, lw=0, width=0.05, color='r'))
 89 | pl.gca().add_patch(Arrow(5800, 0.47, 1250, 0, lw=0, width=0.05, color='r'))
 90 | 
 91 | pl.text(3800, 0.49, 'z = 0.0', fontsize=14, color=colors[0])
 92 | pl.text(5500, 0.49, 'z = 0.4', fontsize=14, color=colors[1])
 93 | pl.text(7300, 0.49, 'z = 0.8', fontsize=14, color=colors[2])
 94 | 
 95 | for f in 'ugriz':
 96 |     X = fetch_filter(f)
 97 |     pl.fill(X[:, 0], X[:, 1], ec='k', fc='k', alpha=0.2)
 98 | 
 99 | kwargs = dict(fontsize=20, color='gray', ha='center', va='center')
100 | pl.text(3500, 0.02, 'u', **kwargs)
101 | pl.text(4600, 0.02, 'g', **kwargs)
102 | pl.text(6100, 0.02, 'r', **kwargs)
103 | pl.text(7500, 0.02, 'i', **kwargs)
104 | pl.text(8800, 0.02, 'z', **kwargs)
105 | 
106 | pl.xlim(3000, 11000)
107 | pl.ylim(0, 0.55)
108 | 
109 | pl.title('Redshifting of a Spectrum')
110 | pl.xlabel('Observed Wavelength (Angstroms)')
111 | pl.ylabel('normalized flux / filter transmission')
112 | 
113 | pl.show()
114 | 


--------------------------------------------------------------------------------
/examples/plot_sdss_images.py:
--------------------------------------------------------------------------------
 1 | """
 2 | SDSS Images
 3 | -----------
 4 | 
 5 | This script plots an example quasar, star, and galaxy image for use in
 6 | the tutorial.
 7 | """
 8 | import os
 9 | import urllib2
10 | 
11 | import pylab as pl
12 | from matplotlib import image
13 | 
14 | def _fetch(outfile, RA, DEC, scale=0.2, width=400, height=400):
15 |     """Fetch the image at the given RA, DEC from the SDSS server"""
16 |     url = ("http://casjobs.sdss.org/ImgCutoutDR7/"
17 |            "getjpeg.aspx?ra=%.8f&dec=%.8f&scale=%.2f&width=%i&height=%i"
18 |            % (RA, DEC, scale, width, height))
19 |     print "downloading %s" % url
20 |     print " -> %s" % outfile
21 |     fhandle = urllib2.urlopen(url)
22 |     open(outfile, 'w').write(fhandle.read())
23 | 
24 | 
25 | def fetch_image(object_type):
26 |     """Return the data array for the image of object type"""
27 |     if not os.path.exists('downloads'):
28 |         os.makedirs('downloads')
29 | 
30 |     filename = os.path.join('downloads', '%s_image.jpg' % object_type)
31 |     if not os.path.exists(filename):
32 |         RA = image_locations[object_type]['RA']
33 |         DEC = image_locations[object_type]['DEC']
34 |         _fetch(filename, RA, DEC)
35 |     
36 |     return image.imread(filename)
37 | 
38 | 
39 | image_locations = dict(star=dict(RA=180.63040108,
40 |                                  DEC=64.96767375),
41 |                        galaxy=dict(RA=197.51943983,
42 |                                    DEC=0.94881436),
43 |                        quasar=dict(RA=226.18451462,
44 |                                    DEC=4.07456639))
45 | 
46 | 
47 | # Plot the images
48 | fig = pl.figure(figsize=(9, 3))
49 | 
50 | # Check that PIL is installed for jpg support
51 | if 'jpg' not in fig.canvas.get_supported_filetypes():
52 |     raise ValueError("PIL required to load SDSS jpeg images")
53 | 
54 | object_types = ['star', 'galaxy', 'quasar']
55 | 
56 | for i, object_type in enumerate(object_types):
57 |     ax = pl.subplot(131 + i, xticks=[], yticks=[])
58 |     I = fetch_image(object_type)
59 |     ax.imshow(I)
60 |     if object_type != 'galaxy':
61 |         pl.arrow(0.65, 0.65, -0.1, -0.1, width=0.005, head_width=0.03,
62 |                  length_includes_head=True,
63 |                  color='w', transform=ax.transAxes)
64 |     pl.text(0.99, 0.01, object_type, fontsize='large', color='w', ha='right',
65 |             transform=ax.transAxes)
66 | 
67 | pl.subplots_adjust(bottom=0.04, top=0.94, left=0.02, right=0.98, wspace=0.04)
68 | 
69 | pl.show()
70 | 


--------------------------------------------------------------------------------
/examples/plot_sdss_photoz.py:
--------------------------------------------------------------------------------
 1 | """
 2 | SDSS Photometric Redshifts
 3 | --------------------------
 4 | 
 5 | This example shows how a Decision tree can be used to learn redshifts
 6 | of galaxies in the Sloan Digital Sky Survey.
 7 | """
 8 | 
 9 | import os
10 | import urllib2
11 | import numpy as np
12 | import pylab as pl
13 | 
14 | from sklearn.datasets import get_data_home
15 | from sklearn.tree import DecisionTreeRegressor
16 | 
17 | DATA_URL = ('http://www.astro.washington.edu/users/'
18 |             'vanderplas/pydata/sdss_photoz.npy')
19 | LOCAL_FILE = 'sdss_photoz.npy'
20 | 
21 | def fetch_photoz_data():
22 |     if not os.path.exists('downloads'):
23 |         os.makedirs('downloads')
24 | 
25 |     local_file = os.path.join('downloads', LOCAL_FILE)
26 | 
27 |     if not os.path.exists(local_file):
28 |         # data directory is password protected so the public can't access it    
29 |         password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
30 |         password_mgr.add_password(None, DATA_URL, 'pydata', 'astroML')
31 |         handler = urllib2.HTTPBasicAuthHandler(password_mgr)
32 |         opener = urllib2.build_opener(handler)
33 |         fhandle = opener.open(DATA_URL)
34 |         open(local_file, 'w').write(fhandle.read())
35 |     
36 |     return np.load(local_file)
37 | 
38 | data = fetch_photoz_data()
39 | 
40 | N = len(data)
41 | 
42 | # put colors in a matrix
43 | X = np.zeros((N, 4))
44 | X[:, 0] = data['u'] - data['g']
45 | X[:, 1] = data['g'] - data['r']
46 | X[:, 2] = data['r'] - data['i']
47 | X[:, 3] = data['i'] - data['z']
48 | z = data['redshift']
49 | 
50 | # divide into training and testing data
51 | Ntrain = 3 * N / 4
52 | Xtrain = X[:Ntrain]
53 | ztrain = z[:Ntrain]
54 | 
55 | Xtest = X[Ntrain:]
56 | ztest = z[Ntrain:]
57 | 
58 | 
59 | clf = DecisionTreeRegressor(max_depth=20)
60 | clf.fit(Xtrain, ztrain)
61 | zpred = clf.predict(Xtest)
62 | 
63 | axis_lim = np.array([-0.1, 2.5])
64 | 
65 | rms = np.sqrt(np.mean((ztest - zpred) ** 2))
66 | print rms
67 | print len(ztest)
68 | print np.sum(abs(ztest - zpred) > 1)
69 | 
70 | ax = pl.axes()
71 | pl.scatter(ztest, zpred, c='k', lw=0, s=4)
72 | pl.plot(axis_lim, axis_lim, '--k')
73 | pl.plot(axis_lim, axis_lim + rms, ':k')
74 | pl.plot(axis_lim, axis_lim - rms, ':k')
75 | pl.xlim(axis_lim)
76 | pl.ylim(axis_lim)
77 | 
78 | pl.text(0.99, 0.02, "RMS error = %.2g" % rms,
79 |         ha='right', va='bottom', transform=ax.transAxes,
80 |         bbox=dict(ec='w', fc='w'), fontsize=16)
81 | 
82 | pl.title('Photo-z: Decision Tree Regression')
83 | pl.xlabel(r'$\mathrm{z_{true}}$', fontsize=14)
84 | pl.ylabel(r'$\mathrm{z_{phot}}$', fontsize=14)
85 | pl.show()
86 | 
87 | 
88 | 
89 | 


--------------------------------------------------------------------------------
/examples/plot_sdss_specPCA.py:
--------------------------------------------------------------------------------
  1 | """
  2 | SDSS Spectra Plots
  3 | ------------------
  4 | 
  5 | This plots some of the SDSS spectra examples for the astronomy tutorial
  6 | """
  7 | import os
  8 | import urllib2
  9 | 
 10 | import numpy as np
 11 | import pylab as pl
 12 | 
 13 | from sklearn import preprocessing
 14 | from sklearn.decomposition import RandomizedPCA
 15 | 
 16 | DATA_URL = ('http://www.astro.washington.edu/users/'
 17 |             'vanderplas/pydata/spec4000_corrected.npz')
 18 | 
 19 | def fetch_sdss_spec_data():
 20 |     if not os.path.exists('downloads'):
 21 |         os.makedirs('downloads')
 22 | 
 23 |     local_file = os.path.join('downloads', os.path.basename(DATA_URL))
 24 | 
 25 |     # data directory is password protected so the public can't access it    
 26 |     password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
 27 |     password_mgr.add_password(None, DATA_URL, 'pydata', 'astroML')
 28 |     handler = urllib2.HTTPBasicAuthHandler(password_mgr)
 29 |     opener = urllib2.build_opener(handler)
 30 | 
 31 |     # download training data
 32 |     if not os.path.exists(local_file):
 33 |         fhandle = opener.open(DATA_URL)
 34 |         open(local_file, 'w').write(fhandle.read())
 35 | 
 36 |     return np.load(local_file)
 37 | 
 38 | #----------------------------------------------------------------------
 39 | #
 40 | #  Load the data
 41 | data = fetch_sdss_spec_data()
 42 | 
 43 | wavelengths = data['wavelengths']
 44 | X = data['X']
 45 | y = data['y']
 46 | labels = data['labels']
 47 | 
 48 | from matplotlib.ticker import FuncFormatter
 49 | format = FuncFormatter(lambda i, *args: labels[i].replace(' ', '\n'))
 50 | 
 51 | #----------------------------------------------------------------------
 52 | #
 53 | #  Plot the first few spectra, offset so they don't overlap
 54 | #
 55 | pl.figure()
 56 | 
 57 | for i_class in (2, 3, 4, 5, 6):
 58 |     i = np.where(y == i_class)[0][0]
 59 |     l = pl.plot(wavelengths, X[i] + 20 * i_class)
 60 |     c = l[0].get_color()
 61 |     pl.text(6800, 2 + 20 * i_class, labels[i_class], color=c)
 62 |             
 63 | pl.subplots_adjust(hspace=0)
 64 | pl.xlabel('wavelength (Angstroms)')
 65 | pl.ylabel('flux + offset')
 66 | pl.title('Sample of Spectra')
 67 | 
 68 | #----------------------------------------------------------------------
 69 | #
 70 | #  Plot the mean spectrum
 71 | #
 72 | X = preprocessing.normalize(X, 'l2')
 73 | 
 74 | pl.figure()
 75 | 
 76 | mu = X.mean(0)
 77 | std = X.std(0)
 78 | 
 79 | pl.plot(wavelengths, mu, color='black')
 80 | pl.fill_between(wavelengths, mu - std, mu + std, color='#CCCCCC')
 81 | pl.xlim(wavelengths[0], wavelengths[-1])
 82 | pl.ylim(0, 0.06)
 83 | pl.xlabel('wavelength (Angstroms)')
 84 | pl.ylabel('scaled flux')
 85 | pl.title('Mean Spectrum + Variance')
 86 | 
 87 | #----------------------------------------------------------------------
 88 | #
 89 | #  Plot a random pair of digits
 90 | #
 91 | pl.figure()
 92 | np.random.seed(25255)
 93 | i1, i2 = np.random.randint(1000, size=2)
 94 | 
 95 | pl.scatter(X[:, i1], X[:, i2], c=y, s=4, lw=0,
 96 |            vmin=2, vmax=6, cmap=pl.cm.jet)
 97 | pl.colorbar(ticks = range(2, 7), format=format)
 98 | pl.xlabel('wavelength = %.1f' % wavelengths[i1])
 99 | pl.ylabel('wavelength = %.1f' % wavelengths[i2])
100 | pl.title('Random Pair of Spectra Bins')
101 | 
102 | #----------------------------------------------------------------------
103 | #
104 | #  Perform PCA
105 | #
106 | 
107 | rpca = RandomizedPCA(n_components=4, random_state=0)
108 | X_proj = rpca.fit_transform(X)
109 | 
110 | #----------------------------------------------------------------------
111 | #
112 | #  Plot PCA components
113 | #
114 | 
115 | pl.figure()
116 | pl.scatter(X_proj[:, 0], X_proj[:, 1], c=y, s=4, lw=0,
117 |            vmin=2, vmax=6, cmap=pl.cm.jet)
118 | pl.colorbar(ticks = range(2, 7), format=format)
119 | pl.xlabel('coefficient 1')
120 | pl.ylabel('coefficient 2')
121 | pl.title('PCA projection of Spectra')
122 | 
123 | #----------------------------------------------------------------------
124 | #
125 | #  Plot PCA eigenspectra
126 | #
127 | 
128 | pl.figure()
129 | 
130 | l = pl.plot(wavelengths, rpca.mean_ - 0.15)
131 | c = l[0].get_color()
132 | pl.text(7000, -0.16, "mean" % i, color=c)
133 | 
134 | for i in range(4):
135 |     l = pl.plot(wavelengths, rpca.components_[i] + 0.15 * i)
136 |     c = l[0].get_color()
137 |     pl.text(7000, -0.01 + 0.15 * i, "component %i" % (i + 1), color=c)
138 | pl.ylim(-0.2, 0.6)
139 | pl.xlabel('wavelength (Angstroms)')
140 | pl.ylabel('scaled flux + offset')
141 | pl.title('Mean Spectrum and Eigen-spectra')
142 | 
143 | pl.show()
144 | 


--------------------------------------------------------------------------------
/examples/plot_sgd_separating_hyperplane.py:
--------------------------------------------------------------------------------
 1 | """
 2 | =========================================
 3 | SGD: Maximum margin separating hyperplane
 4 | =========================================
 5 | 
 6 | Plot the maximum margin separating hyperplane within a two-class
 7 | separable dataset using a linear Support Vector Machines classifier
 8 | trained using SGD.
 9 | """
10 | print __doc__
11 | 
12 | import numpy as np
13 | import pylab as pl
14 | from sklearn.linear_model import SGDClassifier
15 | from sklearn.datasets.samples_generator import make_blobs
16 | 
17 | # we create 50 separable points
18 | X, Y = make_blobs(n_samples=50, centers=2, random_state=0, cluster_std=0.60)
19 | 
20 | # fit the model
21 | clf = SGDClassifier(loss="hinge", alpha=0.01, n_iter=200, fit_intercept=True)
22 | clf.fit(X, Y)
23 | 
24 | # plot the line, the points, and the nearest vectors to the plane
25 | xx = np.linspace(-1, 5, 10)
26 | yy = np.linspace(-1, 5, 10)
27 | 
28 | X1, X2 = np.meshgrid(xx, yy)
29 | Z = np.empty(X1.shape)
30 | for (i, j), val in np.ndenumerate(X1):
31 |     x1 = val
32 |     x2 = X2[i, j]
33 |     p = clf.decision_function([x1, x2])
34 |     Z[i, j] = p[0]
35 | levels = [-1.0, 0.0, 1.0]
36 | linestyles = ['dashed', 'solid', 'dashed']
37 | colors = 'k'
38 | pl.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
39 | pl.scatter(X[:, 0], X[:, 1], c=Y, cmap=pl.cm.Paired)
40 | 
41 | pl.axis('tight')
42 | pl.show()
43 | 


--------------------------------------------------------------------------------