├── .gitignore ├── AUTHORS.rst ├── COPYING ├── Makefile ├── README.rst ├── doc ├── .gitignore ├── Makefile ├── README ├── classification.rst ├── conf.py ├── data │ ├── sdss_colors │ │ ├── fetch_data.py │ │ └── scatter_colors.py │ ├── sdss_photoz │ │ └── fetch_data.py │ └── sdss_spectra │ │ └── fetch_data.py ├── dimensionality_reduction.rst ├── exercises.rst ├── general_concepts.rst ├── images │ └── blank_image.png ├── includes │ ├── big_toc_css.rst │ └── bigger_toc_css.rst ├── index.rst ├── logos │ ├── favicon.ico │ ├── identity.pdf │ ├── scikit-learn-logo-small.png │ ├── scikit-learn-logo-thumb.png │ ├── scikit-learn-logo.bmp │ ├── scikit-learn-logo.png │ └── scikit-learn-logo.svg ├── make.bat ├── notebooks │ ├── .gitignore │ ├── 01_datasets.ipynb │ ├── 02_iris_classification.ipynb │ ├── 03_iris_dimensionality.ipynb │ ├── 04_iris_clustering.ipynb │ ├── 05_iris_crossval.ipynb │ ├── 06_learning_curves.ipynb │ ├── 07_classification_example.ipynb │ ├── 08_regression_example.ipynb │ ├── 09_dimensionality_example.ipynb │ ├── 10_exercise01.ipynb │ ├── 11_exercise02.ipynb │ ├── 12_exercise03.ipynb │ ├── nbconvert.py │ └── soln │ │ ├── 01-01.py │ │ ├── 01-02.py │ │ ├── 01-03.py │ │ ├── 01-04.py │ │ ├── 01-05.py │ │ ├── 02-01.py │ │ ├── 02-02.py │ │ ├── 02-03a.py │ │ ├── 02-03b.py │ │ ├── 03-01.py │ │ ├── 03-02.py │ │ └── 03-03.py ├── practical.rst ├── regression.rst ├── scikitlearn.png ├── setup.rst ├── skeletons │ ├── exercise_01.py │ ├── exercise_02.py │ └── exercise_03.py ├── solutions │ ├── exercise_01.py │ ├── exercise_02.py │ ├── exercise_03.py │ └── generate_skeletons.py ├── sphinxext │ ├── LICENSE.txt │ ├── MANIFEST.in │ ├── README.txt │ ├── gen_rst.py │ ├── numpy_ext │ │ ├── __init__.py │ │ ├── docscrape.py │ │ ├── docscrape_sphinx.py │ │ └── numpydoc.py │ └── numpy_ext_old │ │ ├── __init__.py │ │ ├── docscrape.py │ │ ├── docscrape_sphinx.py │ │ └── numpydoc.py ├── templates │ ├── class.rst │ └── function.rst └── themes │ └── scikit-learn │ ├── layout.html │ ├── static │ ├── jquery.js │ ├── nature.css_t │ └── sidebar.js │ └── theme.conf └── examples ├── README.txt ├── plot_ML_flow_chart.py ├── plot_bias_variance_examples.py ├── plot_gui_example.py ├── plot_iris_projections.py ├── plot_python_101.py ├── plot_sdss_filters.py ├── plot_sdss_images.py ├── plot_sdss_photoz.py ├── plot_sdss_specPCA.py ├── plot_sgd_separating_hyperplane.py └── svm_gui.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.so 3 | *~ 4 | .#* 5 | *.swp 6 | *.swo 7 | .DS_Store 8 | build 9 | 10 | dist/ 11 | doc/.nojekyll 12 | doc/_build/ 13 | doc/auto_examples/ 14 | doc/modules/generated/ 15 | doc/datasets/generated/ 16 | pip-log.txt 17 | .coverage 18 | coverage 19 | tags 20 | 21 | examples/downloads/ 22 | 23 | *.zip 24 | *.nt.bz2 25 | *.tar.gz 26 | *.tgz 27 | *.npz 28 | *.npy -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | People 4 | ------ 5 | 6 | This tutorial is brought to you by the `scikit-learn 7 | `_ folks, in particular: 8 | 9 | .. hlist:: 10 | 11 | * `Jake Vanderplas `_ 12 | * Olivier Grisel 13 | * Jaques Grobler 14 | * `Gael Varoquaux `_ 15 | 16 | .. _citing: 17 | 18 | Citing the scikit-learn 19 | ------------------------ 20 | 21 | A huge amount of work goes in the scikit-learn. 
Researchers that invest 22 | their time in developing and maintaining the package deserve recognition 23 | with citations. In addition, the Parietal team needs the citations to the 24 | paper in order to justify paying a software engineer on the project. To 25 | garanty the future of the toolkit, if you use it, please cite it. 26 | 27 | See the scikit-learn documentation on `how to cite 28 | `_. 29 | 30 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | New BSD License 2 | 3 | Copyright (c) 2007 - 2012 The scikit-learn developers. 4 | All rights reserved. 5 | 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | a. Redistributions of source code must retain the above copyright notice, 11 | this list of conditions and the following disclaimer. 12 | b. Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | c. Neither the name of the Scikit-learn Developers nor the names of 16 | its contributors may be used to endorse or promote products 17 | derived from this software without specific prior written 18 | permission. 19 | 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 | ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 27 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 28 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 31 | DAMAGE. 32 | 33 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # simple makefile to simplify repetetive build env management tasks under posix 2 | 3 | # caution: testing won't work on windows, see README 4 | 5 | PYTHON ?= python 6 | CYTHON ?= cython 7 | NOSETESTS ?= nosetests 8 | CTAGS ?= ctags 9 | 10 | all: clean doc-noplot 11 | 12 | clean-pyc: 13 | find . -name "*.pyc" | xargs rm -f 14 | 15 | clean-so: 16 | find . -name "*.so" | xargs rm -f 17 | find . -name "*.pyd" | xargs rm -f 18 | 19 | clean-build: 20 | rm -rf build 21 | 22 | clean-ctags: 23 | rm -f tags 24 | 25 | clean: clean-build clean-pyc clean-so clean-ctags 26 | 27 | in: inplace # just a shortcut 28 | inplace: 29 | $(PYTHON) setup.py build_ext -i 30 | 31 | test-doc: 32 | $(NOSETESTS) -s --with-doctest --doctest-tests --doctest-extension=rst \ 33 | --doctest-extension=inc --doctest-fixtures=_fixture doc/ \ 34 | 35 | test: test-doc 36 | 37 | trailing-spaces: 38 | find . 
-name "*.py" | xargs perl -pi -e 's/[ \t]*$$//' 39 | 40 | cython: 41 | find -name "*.pyx" | xargs $(CYTHON) 42 | 43 | ctags: 44 | # make tags for symbol based navigation in emacs and vim 45 | # Install with: sudo apt-get install exuberant-ctags 46 | $(CTAGS) -R * 47 | 48 | .PHONY : doc 49 | doc: 50 | make -C doc html 51 | 52 | .PHONY : doc-noplot 53 | doc-noplot: 54 | make -C doc html-noplot 55 | 56 | .PHONY : pdf 57 | pdf: 58 | make -C doc pdf 59 | 60 | install: 61 | cd doc; make install 62 | 63 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | Machine Learning for Astronomical Data Analysis 4 | ================================================= 5 | 6 | **Note: this content is extremely out-of-date, and I would not recommend using it** 7 | 8 | If you would like a more up-to-date machine learning tutorial that grew from this 9 | content, I'd recommend the [Python Data Science Handbook](http://github.com/jakevdp/PythonDataScienceHandbook). 10 | -------------------------------------------------------------------------------- /doc/.gitignore: -------------------------------------------------------------------------------- 1 | AUTHORS.rst -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | 15 | .PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex latexpdf chan 16 | 17 | all: html-noplot 18 | 19 | help: 20 | @echo "Please use \`make ' where is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " dirhtml to make HTML files named index.html in directories" 23 | @echo " pickle to make pickle files" 24 | @echo " json to make JSON files" 25 | @echo " htmlhelp to make HTML files and a HTML help project" 26 | @echo " qthelp to make HTML files and a qthelp project" 27 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 28 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 29 | @echo " changes to make an overview of all changed/added/deprecated items" 30 | @echo " linkcheck to check all external links for integrity" 31 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 32 | 33 | clean: 34 | -rm -rf $(BUILDDIR)/* 35 | -rm -rf auto_examples/ 36 | -rm -rf doc/generated/* 37 | -rm -rf modules/generated/* 38 | 39 | html: 40 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 41 | touch $(BUILDDIR)/html .nojekyll 42 | @echo 43 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 44 | 45 | html-noplot: 46 | $(SPHINXBUILD) -D plot_gallery=False -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 47 | touch $(BUILDDIR)/html .nojekyll 48 | @echo 49 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 50 | 51 | dirhtml: 52 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 53 | touch $(BUILDDIR)/dirhtml .nojekyll 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 
56 | 57 | pickle: 58 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 59 | @echo 60 | @echo "Build finished; now you can process the pickle files." 61 | 62 | json: 63 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 64 | @echo 65 | @echo "Build finished; now you can process the JSON files." 66 | 67 | htmlhelp: 68 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 69 | @echo 70 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 71 | ".hhp project file in $(BUILDDIR)/htmlhelp." 72 | 73 | latex: 74 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 75 | @echo 76 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 77 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 78 | "(use \`make latexpdf' here to do that automatically)." 79 | 80 | latexpdf: 81 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 82 | @echo "Running LaTeX files through pdflatex..." 83 | make -C $(BUILDDIR)/latex all-pdf 84 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 85 | 86 | changes: 87 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 88 | @echo 89 | @echo "The overview file is in $(BUILDDIR)/changes." 90 | 91 | linkcheck: 92 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 93 | @echo 94 | @echo "Link check complete; look for any errors in the above output " \ 95 | "or in $(BUILDDIR)/linkcheck/output.txt." 96 | 97 | doctest: 98 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 99 | @echo "Testing of doctests in the sources finished, look at the " \ 100 | "results in $(BUILDDIR)/doctest/output.txt." 101 | 102 | #zip: html pdf 103 | # mkdir -p _build/nisl ; 104 | # cp -r _build/html _build/nisl ; 105 | # cp -r data _build/nisl ; 106 | # cp nisl.pdf _build/nisl; 107 | # zip -r _build/nisl.zip _build/nisl 108 | 109 | download-data: 110 | cd data/sdss_colors && python fetch_data.py 111 | cd data/sdss_photoz && python fetch_data.py 112 | cd data/sdss_spectra && python fetch_data.py 113 | 114 | pdf: latexpdf 115 | cp $(BUILDDIR)/latex/sklearn_tutorial.pdf ./ 116 | 117 | nbconvert: 118 | cd notebooks && rm -f *.v2.ipynb && python nbconvert.py *.ipynb 119 | 120 | tar: nbconvert 121 | tar -czvf exercises.tgz notebooks/soln/*.py notebooks/*.ipynb data/*/*.py skeletons solutions 122 | 123 | tar-data: nbconvert download-data 124 | tar -czvf exercises_data.tgz notebooks/soln/*.py notebooks/*.ipynb data skeletons solutions 125 | 126 | install-reclone: pdf tar html 127 | rm -rf _build/sklearn_tutorial 128 | cd _build/ && \ 129 | git clone git@github.com:astroML/sklearn_tutorial && \ 130 | cd sklearn_tutorial && git checkout gh-pages && \ 131 | rsync -r ../html/* .
&& \ 132 | git add * && \ 133 | git commit -a -m 'Make install' && \ 134 | git push origin gh-pages 135 | 136 | install: pdf tar html 137 | if test -d _build/sklearn_tutorial; \ 138 | then echo "using existing sklearn_tutorial directory"; \ 139 | else cd _build && \ 140 | git clone git@github.com:astroML/sklearn_tutorial; \ 141 | fi && \ 142 | cd _build/sklearn_tutorial && git checkout gh-pages && \ 143 | rsync -r ../html/* ./ && \ 144 | git add * && \ 145 | git commit -a -m 'Make install' && \ 146 | git push origin gh-pages 147 | -------------------------------------------------------------------------------- /doc/README: -------------------------------------------------------------------------------- 1 | Documentation 2 | ---------------------- 3 | 4 | This section contains the full manual and web page as displayed on 5 | the web. To generate the full web page, including 6 | the example gallery (this might take a while): 7 | 8 | make html 9 | 10 | Or, if you'd rather not build the example gallery: 11 | 12 | make html-noplot 13 | 14 | That should create all the doc in directory _build/html 15 | 16 | To build the PDF manual, run 17 | 18 | make latexpdf 19 | 20 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # scikit-learn documentation build configuration file, created by 4 | # sphinx-quickstart on Fri Jan 8 09:13:42 2010. 5 | # 6 | # This file is execfile()d with the current directory set to its containing 7 | # dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | import shutil 18 | 19 | # If extensions (or modules to document with autodoc) are in another 20 | # directory, add these directories to sys.path here. If the directory 21 | # is relative to the documentation root, use os.path.abspath to make it 22 | # absolute, like shown here. 23 | sys.path.insert(0, os.path.abspath('sphinxext')) 24 | 25 | try: 26 | shutil.copy('../AUTHORS.rst', '.') 27 | except IOError: 28 | # When nose scans this file, it is not in the right working 29 | # directory, and thus the line above fails 30 | pass 31 | 32 | # -- General configuration --------------------------------------------------- 33 | 34 | # Try to override the matplotlib configuration as early as possible 35 | try: 36 | import gen_rst 37 | except: 38 | pass 39 | 40 | # Add any Sphinx extension module names here, as strings. They can be 41 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 42 | extensions = ['gen_rst', 43 | 'sphinx.ext.autodoc', #'sphinx.ext.autosummary', 44 | 'sphinx.ext.pngmath', 'sphinx.ext.intersphinx', 45 | ] 46 | try: 47 | import numpy_ext.numpydoc 48 | extensions.append('numpy_ext.numpydoc') 49 | # With older versions of sphinx, this causes a crash 50 | autosummary_generate = True 51 | except: 52 | # Older version of sphinx 53 | extensions.append('numpy_ext_old.numpydoc') 54 | 55 | autodoc_default_flags = ['members', 'inherited-members'] 56 | 57 | # Add any paths that contain templates here, relative to this directory. 58 | templates_path = ['templates'] 59 | 60 | # generate autosummary even if no references 61 | autosummary_generate = True 62 | 63 | # The suffix of source filenames. 
64 | source_suffix = '.rst' 65 | 66 | # The encoding of source files. 67 | #source_encoding = 'utf-8' 68 | 69 | # Generate the plots for the gallery 70 | plot_gallery = True 71 | 72 | # The master toctree document. 73 | master_doc = 'index' 74 | 75 | # General information about the project. 76 | project = u'AtroML' 77 | copyright = u'scikit-learn developers' 78 | 79 | # The version info for the project you're documenting, acts as replacement for 80 | # |version| and |release|, also used in various other places throughout the 81 | # built documents. 82 | # 83 | # The short X.Y version. 84 | version = '' 85 | # The full version, including alpha/beta/rc tags. 86 | release = "Scipy2012" 87 | 88 | # The language for content autogenerated by Sphinx. Refer to documentation 89 | # for a list of supported languages. 90 | language = 'en' 91 | 92 | # There are two options for replacing |today|: either, you set today to some 93 | # non-false value, then it is used: 94 | #today = '' 95 | # Else, today_fmt is used as the format for a strftime call. 96 | #today_fmt = '%B %d, %Y' 97 | 98 | # List of documents that shouldn't be included in the build. 99 | #unused_docs = [] 100 | 101 | # List of directories, relative to source directory, that shouldn't be 102 | # searched for source files. 103 | exclude_trees = ['_build', 'templates', 'includes'] 104 | 105 | # The reST default role (used for this markup: `text`) to use for all 106 | # documents. 107 | #default_role = None 108 | 109 | # If true, '()' will be appended to :func: etc. cross-reference text. 110 | add_function_parentheses = False 111 | 112 | # If true, the current module name will be prepended to all description 113 | # unit titles (such as .. function::). 114 | #add_module_names = True 115 | 116 | # If true, sectionauthor and moduleauthor directives will be shown in the 117 | # output. They are ignored by default. 118 | #show_authors = False 119 | 120 | # The name of the Pygments (syntax highlighting) style to use. 121 | pygments_style = 'sphinx' 122 | 123 | # A list of ignored prefixes for module index sorting. 124 | #modindex_common_prefix = [] 125 | 126 | 127 | # -- Options for HTML output ------------------------------------------------- 128 | 129 | # The theme to use for HTML and HTML Help pages. Major themes that come with 130 | # Sphinx are currently 'default' and 'sphinxdoc'. 131 | html_theme = 'scikit-learn' 132 | 133 | # Theme options are theme-specific and customize the look and feel of a theme 134 | # further. For a list of options available for each theme, see the 135 | # documentation. 136 | html_theme_options = {'oldversion':False, 'collapsiblesidebar': True} 137 | 138 | # Add any paths that contain custom themes here, relative to this directory. 139 | html_theme_path = ['themes'] 140 | 141 | 142 | # The name for this set of Sphinx documents. If None, it defaults to 143 | # " v documentation". 144 | html_title = "Machine Learning for Astronomy with Scikit-learn" 145 | 146 | # A shorter title for the navigation bar. Default is the same as html_title. 147 | html_short_title = 'Scikit-learn Astronomy Tutorial' 148 | 149 | # The name of an image file (relative to this directory) to place at the top 150 | # of the sidebar. 151 | html_logo = 'logos/scikit-learn-logo-small.png' 152 | 153 | # The name of an image file (within the static path) to use as favicon of the 154 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 155 | # pixels large. 
156 | html_favicon = 'logos/favicon.ico' 157 | 158 | # Add any paths that contain custom static files (such as style sheets) here, 159 | # relative to this directory. They are copied after the builtin static files, 160 | # so a file named "default.css" will overwrite the builtin "default.css". 161 | html_static_path = ['images'] 162 | 163 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 164 | # using the given strftime format. 165 | #html_last_updated_fmt = '%b %d, %Y' 166 | 167 | # If true, SmartyPants will be used to convert quotes and dashes to 168 | # typographically correct entities. 169 | #html_use_smartypants = True 170 | 171 | # Custom sidebar templates, maps document names to template names. 172 | #html_sidebars = {} 173 | 174 | # Additional templates that should be rendered to pages, maps page names to 175 | # template names. 176 | #html_additional_pages = {} 177 | 178 | # If false, no module index is generated. 179 | html_use_modindex = False 180 | 181 | # If false, no index is generated. 182 | html_use_index = False 183 | 184 | # If true, the index is split into individual pages for each letter. 185 | #html_split_index = False 186 | 187 | # If true, links to the reST sources are added to the pages. 188 | #html_show_sourcelink = True 189 | 190 | # If true, an OpenSearch description file will be output, and all pages will 191 | # contain a tag referring to it. The value of this option must be the 192 | # base URL from which the finished HTML is served. 193 | #html_use_opensearch = '' 194 | 195 | # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). 196 | #html_file_suffix = '' 197 | 198 | # Output file base name for HTML help builder. 199 | htmlhelp_basename = 'PythonScientic' 200 | 201 | 202 | # -- Options for LaTeX output ------------------------------------------------ 203 | 204 | # The paper size ('letter' or 'a4'). 205 | #latex_paper_size = 'letter' 206 | 207 | # The font size ('10pt', '11pt' or '12pt'). 208 | #latex_font_size = '10pt' 209 | 210 | # Grouping the document tree into LaTeX files. List of tuples 211 | # (source start file, target name, title, author, documentclass 212 | # [howto/manual]). 213 | latex_documents = [ 214 | ('index', 'sklearn_tutorial.tex', u'Astronomy with scikit-learn', 215 | ur"""Jacob VanderPlas""" 216 | + r"\\\relax ~\\\relax http://astroML.github.com/sklearn\_tutorial/", 217 | 'manual'), 218 | ] 219 | 220 | # The name of an image file (relative to this directory) to place at the top of 221 | # the title page. 222 | latex_logo = "logos/scikit-learn-logo.png" 223 | 224 | # For "manual" documents, if this is true, then toplevel headings are parts, 225 | # not chapters. 226 | #latex_use_parts = False 227 | 228 | # Additional stuff for the LaTeX preamble. 229 | latex_preamble = r""" 230 | \usepackage{amsmath}\usepackage{amsfonts}\usepackage{bm}\usepackage{morefloats} 231 | \let\oldfootnote\footnote 232 | \def\footnote#1{\oldfootnote{\small #1}} 233 | """ 234 | 235 | # Documents to append as an appendix to all manuals. 236 | #latex_appendices = [] 237 | latex_elements = { 238 | 'classoptions': ',oneside', 239 | 'babel': '\\usepackage[english]{babel}', 240 | # Get completely rid of index 241 | 'printindex': '', 242 | } 243 | 244 | # If false, no module index is generated. 
245 | latex_use_modindex = False 246 | latex_domain_indices = False 247 | 248 | # Show the page numbers in the references 249 | latex_show_pagerefs = True 250 | 251 | # Show URLs in footnotes 252 | latex_show_urls = 'footnote' 253 | 254 | trim_doctests_flags = True 255 | 256 | # Intersphinx mapping to the scikit-learn docs 257 | intersphinx_mapping = {'sklearn': ('http://scikit-learn.org/stable', None)} 258 | -------------------------------------------------------------------------------- /doc/data/sdss_colors/fetch_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import urllib2 3 | import numpy as np 4 | 5 | DTYPE_TRAIN = [('u-g', np.float32), 6 | ('g-r', np.float32), 7 | ('r-i', np.float32), 8 | ('i-z', np.float32), 9 | ('redshift', np.float32)] 10 | 11 | DTYPE_TEST = [('u-g', np.float32), 12 | ('g-r', np.float32), 13 | ('r-i', np.float32), 14 | ('i-z', np.float32), 15 | ('label', np.int32)] 16 | 17 | SDSS_COLORS_URL = "http://www.astro.washington.edu/users/vanderplas/pydata/" 18 | TRAIN_FILE = 'sdssdr6_colors_class_train.dat' 19 | TEST_FILE = 'sdssdr6_colors_class.200000.dat' 20 | 21 | # data directory is password protected so the public can't access it 22 | password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm() 23 | password_mgr.add_password(None, SDSS_COLORS_URL, 'pydata', 'astroML') 24 | handler = urllib2.HTTPBasicAuthHandler(password_mgr) 25 | opener = urllib2.build_opener(handler) 26 | 27 | # download training data 28 | destination = TRAIN_FILE.rstrip('.dat') + '.npy' 29 | if not os.path.exists(destination): 30 | url = SDSS_COLORS_URL + TRAIN_FILE 31 | print "downloading data from", url 32 | fhandle = opener.open(url) 33 | np.save(destination, np.loadtxt(opener.open(url), dtype=DTYPE_TRAIN)) 34 | 35 | # download test data 36 | destination = TEST_FILE.rstrip('.dat') + '.npy' 37 | if not os.path.exists(destination): 38 | url = SDSS_COLORS_URL + TEST_FILE 39 | print "downloading data from", url 40 | fhandle = opener.open(url) 41 | np.save(destination, np.loadtxt(opener.open(url), dtype=DTYPE_TEST)) 42 | 43 | -------------------------------------------------------------------------------- /doc/data/sdss_colors/scatter_colors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pylab as pl 3 | 4 | data = np.load('sdssdr6_colors_class_train.npy') 5 | 6 | # only plot 10000 points: otherwise it takes too much memory 7 | np.random.shuffle(data) 8 | data = data[:10000] 9 | 10 | redshift = data['redshift'] 11 | 12 | print "%i qsos" % np.sum(redshift > 0) 13 | print "%i stars" % np.sum(redshift == 0) 14 | 15 | kwargs = dict(s=1, c=(redshift > 0), lw=0) 16 | 17 | pl.figure(figsize=(6, 8)) 18 | 19 | pl.subplot(311).scatter(data['u-g'], data['g-r'], **kwargs) 20 | 21 | pl.subplot(312).scatter(data['g-r'], data['r-i'], **kwargs) 22 | 23 | pl.subplot(313).scatter(data['r-i'], data['i-z'], **kwargs) 24 | 25 | pl.show() 26 | -------------------------------------------------------------------------------- /doc/data/sdss_photoz/fetch_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file fetches photometric observations associated with SDSS galaxy 3 | spectra which have spectroscopically confirmed redshifts. This directly 4 | queries the SDSS database for the information, and thus can take a few 5 | minutes to run. 
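The data-fetching scripts in this repository are written for Python 2 (``urllib2`` and ``print`` statements). As a rough sketch only, the same password-protected download could be expressed with Python 3's ``urllib.request`` as below; the URL and the ``pydata``/``astroML`` credentials are copied from the script above, whether the server still serves these files is not guaranteed, and the structured dtype handling is omitted for brevity:

    import os
    from urllib.request import (HTTPPasswordMgrWithDefaultRealm,
                                HTTPBasicAuthHandler, build_opener)
    import numpy as np

    SDSS_COLORS_URL = "http://www.astro.washington.edu/users/vanderplas/pydata/"
    TRAIN_FILE = 'sdssdr6_colors_class_train.dat'

    # the data directory is password protected, so attach basic-auth credentials
    password_mgr = HTTPPasswordMgrWithDefaultRealm()
    password_mgr.add_password(None, SDSS_COLORS_URL, 'pydata', 'astroML')
    opener = build_opener(HTTPBasicAuthHandler(password_mgr))

    # strip the '.dat' suffix by slicing rather than str.rstrip('.dat'),
    # which removes trailing characters, not a trailing substring
    destination = TRAIN_FILE[:-len('.dat')] + '.npy'
    if not os.path.exists(destination):
        print("downloading data from", SDSS_COLORS_URL + TRAIN_FILE)
        data = np.loadtxt(opener.open(SDSS_COLORS_URL + TRAIN_FILE))
        np.save(destination, data)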
6 | """ 7 | 8 | import os 9 | import urllib, urllib2 10 | import numpy as np 11 | 12 | # Here's how the data can be downloaded directly from the SDSS server. 13 | # This route is limited to N = 50000, so we've done this separately 14 | def fetch_data_sql(N = 50000): 15 | URL = 'http://cas.sdss.org/public/en/tools/search/x_sql.asp' 16 | archive_file = 'sdss_galaxy_colors.npy' 17 | 18 | dtype = [('mags', '5float32'), 19 | ('specClass', 'int8'), 20 | ('z', 'float32'), 21 | ('zerr', 'float32')] 22 | 23 | def sql_query(sql_str, url=URL, format='csv'): 24 | """Execute SQL query""" 25 | # remove comments from string 26 | sql_str = ' \n'.join(map(lambda x: x.split('--')[0], 27 | sql_str.split('\n'))) 28 | params = urllib.urlencode(dict(cmd=sql_str, format=format)) 29 | return urllib.urlopen(url + '?%s' % params) 30 | 31 | query_text = ('\n'.join( 32 | ("SELECT TOP %i" % N, 33 | " modelMag_u, modelMag_g, modelMag_r, modelMag_i, modelMag_z, specClass, z, zErr", 34 | "FROM SpecPhoto", 35 | "WHERE ", 36 | " modelMag_u BETWEEN 0 AND 19.6", 37 | " AND modelMag_g BETWEEN 0 AND 20", 38 | " AND zerr BETWEEN 0 and 0.03", 39 | " AND specClass > 1 -- not UNKNOWN or STAR", 40 | " AND specClass <> 5 -- not SKY", 41 | " AND specClass <> 6 -- not STAR_LATE"))) 42 | 43 | 44 | if not os.path.exists(archive_file): 45 | print "querying for %i objects" % N 46 | print query_text 47 | output = sql_query(query_text) 48 | print "finished. Processing & saving data" 49 | try: 50 | data = np.loadtxt(output, delimiter=',', skiprows=1, dtype=DTYPE) 51 | except: 52 | raise ValueError(output.read()) 53 | np.save(archive_file, data) 54 | else: 55 | print "data already on disk" 56 | 57 | 58 | DATA_URL = ('http://www.astro.washington.edu/users/' 59 | 'vanderplas/pydata/sdss_photoz.npy') 60 | LOCAL_FILE = 'sdss_photoz.npy' 61 | 62 | # data directory is password protected so the public can't access it 63 | password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm() 64 | password_mgr.add_password(None, DATA_URL, 'pydata', 'astroML') 65 | handler = urllib2.HTTPBasicAuthHandler(password_mgr) 66 | opener = urllib2.build_opener(handler) 67 | 68 | # download training data 69 | if not os.path.exists(LOCAL_FILE): 70 | print "downloading data from", DATA_URL 71 | fhandle = opener.open(DATA_URL) 72 | open(LOCAL_FILE, 'wb').write(fhandle.read()) 73 | -------------------------------------------------------------------------------- /doc/data/sdss_spectra/fetch_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import urllib2 3 | import numpy as np 4 | 5 | DATA_URL = ('http://www.astro.washington.edu/users/' 6 | 'vanderplas/pydata/spec4000_corrected.npz') 7 | LOCAL_FILE = 'spec4000_corrected.npz' 8 | 9 | # data directory is password protected so the public can't access it 10 | password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm() 11 | password_mgr.add_password(None, DATA_URL, 'pydata', 'astroML') 12 | handler = urllib2.HTTPBasicAuthHandler(password_mgr) 13 | opener = urllib2.build_opener(handler) 14 | 15 | # download training data 16 | if not os.path.exists(LOCAL_FILE): 17 | print "downloading data from", DATA_URL 18 | fhandle = opener.open(DATA_URL) 19 | open(LOCAL_FILE, 'wb').write(fhandle.read()) 20 | -------------------------------------------------------------------------------- /doc/images/blank_image.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/astroML/sklearn_tutorial/0909361122d5b96379007516b55c6248afa86cfc/doc/images/blank_image.png -------------------------------------------------------------------------------- /doc/includes/big_toc_css.rst: -------------------------------------------------------------------------------- 1 | .. 2 | File to ..include in a document with a big table of content, to give 3 | it 'style' 4 | 5 | .. raw:: html 6 | 7 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /doc/includes/bigger_toc_css.rst: -------------------------------------------------------------------------------- 1 | .. 2 | File to ..include in a document with a very big table of content, to 3 | give it 'style' 4 | 5 | .. raw:: html 6 | 7 | 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | 2 | .. 3 | We are putting the title as a raw HTML so that it doesn't appear in 4 | the contents 5 | 6 | .. raw:: html 7 | 8 |
<h1>Tutorial: Machine Learning for Astronomy with Scikit-learn</h1>
9 | 20 | 21 | .. 22 | Here we are building a banner: a javascript selects randomly 4 images in 23 | the list 24 | 25 | .. only:: html 26 | 27 | .. |banner1| image:: auto_examples/images/plot_sdss_filters_1.png 28 | :height: 120 29 | :target: auto_examples/plot_sdss_filters.html 30 | 31 | .. |banner2| image:: auto_examples/images/plot_sdss_filters_2.png 32 | :height: 120 33 | :target: auto_examples/plot_sdss_filters.html 34 | 35 | .. |banner3| image:: auto_examples/images/plot_sdss_images_1.png 36 | :height: 90 37 | :target: auto_examples/plot_sdss_images.html 38 | 39 | .. |banner4| image:: auto_examples/images/plot_ML_flow_chart_1.png 40 | :height: 120 41 | :target: auto_examples/plot_ML_flow_chart.html 42 | 43 | .. |banner5| image:: auto_examples/images/plot_sdss_photoz_1.png 44 | :height: 120 45 | :target: auto_examples/plot_sdss_photoz.html 46 | 47 | .. |banner6| image:: auto_examples/images/plot_sdss_specPCA_1.png 48 | :height: 120 49 | :target: auto_examples/plot_sdss_specPCA.html 50 | 51 | .. |center-div| raw:: html 52 | 53 | 58 | 59 | 79 | 80 | |center-div| |banner1| |banner2| |banner3| |banner4| |banner5| |banner6| |end-div| 81 | 82 | .. only:: html 83 | 84 | .. only:: html 85 | 86 | .. sidebar:: Download 87 | 88 | * Source code: `github `_ 89 | 90 | * PDF of tutorial: :download:`sklearn_tutorial.pdf` 91 | 92 | * Tarball of exercises and notebooks: :download:`exercises.tgz` 93 | 94 | 95 | .. sectionauthor:: Jake Vanderplas 96 | 97 | 98 | .. topic:: AstroML 99 | 100 | For more information on machine learning for Astronomy, see the 101 | `astroML `_ code and examples. 102 | 103 | .. topic:: Machine Learning for Astronomy with scikit-learn 104 | 105 | This tutorial offers a brief introduction to the fields of machine 106 | learning and statistical data analysis, and their application to 107 | several problems in the field of astronomy. These learning tasks 108 | are enabled by the tools available in the open-source package 109 | `scikit-learn`_. 110 | 111 | `scikit-learn`_ is a Python module integrating classic machine 112 | learning algorithms in the tightly-knit world of scientific Python 113 | packages (`numpy`_, `scipy`_, `matplotlib`_). It aims to provide 114 | simple and efficient solutions to learning problems that are accessible 115 | to everybody and reusable in various contexts: 116 | **machine-learning as a versatile tool for science and engineering**. 117 | 118 | Many of the examples and exercises in this tutorial require the 119 | `ipython notebook`_, a tool which provides an intuitive web-based 120 | interactive environment for scientific python. Some of the material 121 | in the notebooks is duplicated in the following pages, but ipython 122 | notebook is required for some parts. For information on how to download 123 | the associated notebooks, see the :ref:`sklearn_tutorial_setup` page. 124 | 125 | .. _`scikit-learn`: http://www.scikit-learn.org 126 | .. _`numpy`: http://numpy.scipy.org 127 | .. _`scipy`: http://www.scipy.org 128 | .. _`matplotlib`: http://matplotlib.sourceforge.net 129 | .. _`ipython notebook`: http://ipython.org/ipython-doc/stable/interactive/htmlnotebook.html 130 | 131 | .. include:: includes/big_toc_css.rst 132 | 133 | .. note:: This document is meant to be used with **scikit-learn version 134 | 0.11+**. Find the latest version `here `_. 135 | 136 | .. 
toctree:: 137 | :numbered: 138 | :maxdepth: 2 139 | 140 | setup 141 | general_concepts 142 | practical 143 | classification 144 | regression 145 | dimensionality_reduction 146 | exercises 147 | auto_examples/index 148 | 149 | .. toctree:: 150 | :hidden: 151 | 152 | AUTHORS 153 | 154 | .. 155 | FIXME: I need the link below to make sure the banner gets copied to the 156 | target directory. 157 | 158 | 159 | .. only:: html 160 | 161 | .. raw:: html 162 | 163 |
164 | 165 | .. raw:: html 166 | 167 |
168 | 169 | -------------------------------------------------------------------------------- /doc/logos/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astroML/sklearn_tutorial/0909361122d5b96379007516b55c6248afa86cfc/doc/logos/favicon.ico -------------------------------------------------------------------------------- /doc/logos/identity.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astroML/sklearn_tutorial/0909361122d5b96379007516b55c6248afa86cfc/doc/logos/identity.pdf -------------------------------------------------------------------------------- /doc/logos/scikit-learn-logo-small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astroML/sklearn_tutorial/0909361122d5b96379007516b55c6248afa86cfc/doc/logos/scikit-learn-logo-small.png -------------------------------------------------------------------------------- /doc/logos/scikit-learn-logo-thumb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astroML/sklearn_tutorial/0909361122d5b96379007516b55c6248afa86cfc/doc/logos/scikit-learn-logo-thumb.png -------------------------------------------------------------------------------- /doc/logos/scikit-learn-logo.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astroML/sklearn_tutorial/0909361122d5b96379007516b55c6248afa86cfc/doc/logos/scikit-learn-logo.bmp -------------------------------------------------------------------------------- /doc/logos/scikit-learn-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astroML/sklearn_tutorial/0909361122d5b96379007516b55c6248afa86cfc/doc/logos/scikit-learn-logo.png -------------------------------------------------------------------------------- /doc/logos/scikit-learn-logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | 9 | 11 | 12 | 13 | 14 | 22 | 28 | 36 | 43 | 52 | 53 | 54 | 55 | scikits 56 | 57 | machine learning in Python 58 | 59 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | set SPHINXBUILD=sphinx-build 6 | set BUILDDIR=_build 7 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 8 | if NOT "%PAPER%" == "" ( 9 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 10 | ) 11 | 12 | if "%1" == "" goto help 13 | 14 | if "%1" == "help" ( 15 | :help 16 | echo.Please use `make ^` where ^ is one of 17 | echo. html to make standalone HTML files 18 | echo. dirhtml to make HTML files named index.html in directories 19 | echo. pickle to make pickle files 20 | echo. json to make JSON files 21 | echo. htmlhelp to make HTML files and a HTML help project 22 | echo. qthelp to make HTML files and a qthelp project 23 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 24 | echo. changes to make an overview over all changed/added/deprecated items 25 | echo. linkcheck to check all external links for integrity 26 | echo. 
doctest to run all doctests embedded in the documentation if enabled 27 | goto end 28 | ) 29 | 30 | if "%1" == "clean" ( 31 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 32 | del /q /s %BUILDDIR%\* 33 | goto end 34 | ) 35 | 36 | if "%1" == "html" ( 37 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 38 | echo. 39 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 40 | goto end 41 | ) 42 | 43 | if "%1" == "dirhtml" ( 44 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 45 | echo. 46 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 47 | goto end 48 | ) 49 | 50 | if "%1" == "pickle" ( 51 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 52 | echo. 53 | echo.Build finished; now you can process the pickle files. 54 | goto end 55 | ) 56 | 57 | if "%1" == "json" ( 58 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 59 | echo. 60 | echo.Build finished; now you can process the JSON files. 61 | goto end 62 | ) 63 | 64 | if "%1" == "htmlhelp" ( 65 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 66 | echo. 67 | echo.Build finished; now you can run HTML Help Workshop with the ^ 68 | .hhp project file in %BUILDDIR%/htmlhelp. 69 | goto end 70 | ) 71 | 72 | if "%1" == "qthelp" ( 73 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 74 | echo. 75 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 76 | .qhcp project file in %BUILDDIR%/qthelp, like this: 77 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\nisl.qhcp 78 | echo.To view the help file: 79 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\nisl.ghc 80 | goto end 81 | ) 82 | 83 | if "%1" == "latex" ( 84 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 85 | echo. 86 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 87 | goto end 88 | ) 89 | 90 | if "%1" == "changes" ( 91 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 92 | echo. 93 | echo.The overview file is in %BUILDDIR%/changes. 94 | goto end 95 | ) 96 | 97 | if "%1" == "linkcheck" ( 98 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 99 | echo. 100 | echo.Link check complete; look for any errors in the above output ^ 101 | or in %BUILDDIR%/linkcheck/output.txt. 102 | goto end 103 | ) 104 | 105 | if "%1" == "doctest" ( 106 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 107 | echo. 108 | echo.Testing of doctests in the sources finished, look at the ^ 109 | results in %BUILDDIR%/doctest/output.txt. 110 | goto end 111 | ) 112 | 113 | :end 114 | -------------------------------------------------------------------------------- /doc/notebooks/.gitignore: -------------------------------------------------------------------------------- 1 | *.v2.ipynb -------------------------------------------------------------------------------- /doc/notebooks/01_datasets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "01_datasets" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "heading", 12 | "level": 1, 13 | "metadata": {}, 14 | "source": [ 15 | "Loading Datasets with scikit-learn" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "For the intro portion of this tutorial, we'll be loading several dataset examples. Scikit-learn has methods to access several datasets: we'll explore two of these here." 
23 | ] 24 | }, 25 | { 26 | "cell_type": "heading", 27 | "level": 2, 28 | "metadata": {}, 29 | "source": [ 30 | "Loading Iris Data" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "The machine learning community often uses a simple flowers database where each row in the database (or CSV file) is a set of measurements of an individual iris flower. Each sample in this dataset is described by 4 features and can belong to one of the target classes:\n", 38 | "\n", 39 | "- Features in the Iris dataset:\n", 40 | "\n", 41 | " 1. sepal length in cm\n", 42 | " 2. sepal width in cm\n", 43 | " 3. petal length in cm\n", 44 | " 4. petal width in cm\n", 45 | "\n", 46 | "- Target classes to predict:\n", 47 | "\n", 48 | " 1. Iris Setosa\n", 49 | " 2. Iris Versicolour\n", 50 | " 3. Iris Virginica" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "``scikit-learn`` embeds a copy of the iris CSV file along with a helper function to load it into numpy arrays:" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "collapsed": false, 63 | "input": [ 64 | "from sklearn.datasets import load_iris\n", 65 | "iris = load_iris()" 66 | ], 67 | "language": "python", 68 | "metadata": {}, 69 | "outputs": [] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "The features of each sample flower are stored in the ``data`` attribute of the dataset:" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "collapsed": false, 81 | "input": [ 82 | "n_samples, n_features = iris.data.shape\n", 83 | "print n_samples\n", 84 | "print n_features\n", 85 | "print iris.data[0]" 86 | ], 87 | "language": "python", 88 | "metadata": {}, 89 | "outputs": [] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "The information about the class of each sample is stored in the ``target`` attribute of the dataset:" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "collapsed": false, 101 | "input": [ 102 | "len(iris.data) == len(iris.target)" 103 | ], 104 | "language": "python", 105 | "metadata": {}, 106 | "outputs": [] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "collapsed": false, 111 | "input": [ 112 | "iris.target" 113 | ], 114 | "language": "python", 115 | "metadata": {}, 116 | "outputs": [] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "The names of the classes are stored in the last attribute, namely ``target_names``:" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "collapsed": false, 128 | "input": [ 129 | "list(iris.target_names)" 130 | ], 131 | "language": "python", 132 | "metadata": {}, 133 | "outputs": [] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "The data downloaded from the iris dataset is stored locally, within a subdirectory of your home directory. You can use the following to determine where it is:" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "collapsed": false, 145 | "input": [ 146 | "from sklearn.datasets import get_data_home\n", 147 | "get_data_home()" 148 | ], 149 | "language": "python", 150 | "metadata": {}, 151 | "outputs": [] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "Take a moment now to examine this directory and see that the iris data is stored there. You may also be curious about other datasets which are available. These can be found in ``sklearn.datasets``." 
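One way to see everything ``sklearn.datasets`` provides, without relying on tab completion, is to inspect the module programmatically:

    from sklearn import datasets

    # small bundled datasets have load_* helpers; larger downloads use fetch_*
    loaders = [name for name in dir(datasets) if name.startswith('load_')]
    fetchers = [name for name in dir(datasets) if name.startswith('fetch_')]
    print(loaders)
    print(fetchers)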
158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "collapsed": false, 163 | "input": [ 164 | "from sklearn import datasets" 165 | ], 166 | "language": "python", 167 | "metadata": {}, 168 | "outputs": [] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "You can see which datasets are available by using ipython's tab-completion feature. Simply type\n", 175 | "\n", 176 | " ``datasets.fetch_``\n", 177 | "\n", 178 | "or\n", 179 | "\n", 180 | " ``datasets.load_``\n", 181 | "\n", 182 | "and then press the tab key. This will give you a drop-down menu which lists all the datasets that can be fetched." 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "collapsed": false, 188 | "input": [], 189 | "language": "python", 190 | "metadata": {}, 191 | "outputs": [] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "Be warned: many of these datasets are quite large! If you start a download and you want to kill it, you can use ipython's \"kernel interrupt\" feature, available in the menu or using the shortcut ``Ctrl-m i``.\n", 198 | "\n", 199 | "(You can press ``Ctrl-m h`` for a list of all ``ipython`` keyboard shortcuts)." 200 | ] 201 | }, 202 | { 203 | "cell_type": "heading", 204 | "level": 2, 205 | "metadata": {}, 206 | "source": [ 207 | "Loading Digits Data" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "Now we'll take a look at another dataset, one where we have to put a bit more thought into how to represent the data." 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "collapsed": false, 220 | "input": [ 221 | "from sklearn.datasets import load_digits\n", 222 | "digits = load_digits()\n", 223 | "\n", 224 | "n_samples, n_features = digits.data.shape\n", 225 | "print (n_samples, n_features)" 226 | ], 227 | "language": "python", 228 | "metadata": {}, 229 | "outputs": [] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "Let's take a look at the data. As with the iris data, we can access the information as follows:" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "collapsed": false, 241 | "input": [ 242 | "print digits.data[0]\n", 243 | "print digits.target" 244 | ], 245 | "language": "python", 246 | "metadata": {}, 247 | "outputs": [] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": {}, 252 | "source": [ 253 | "Each sample has 64 features, representing a hand-written digit. We can plot the images these features represent to gain more insight.\n", 254 | "\n", 255 | "We want to plot figures using pylab: we'll use the following command to make sure the figures appear in-line (this only works within ipython notebook):\n" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "collapsed": false, 261 | "input": [ 262 | "%pylab inline" 263 | ], 264 | "language": "python", 265 | "metadata": {}, 266 | "outputs": [] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": {}, 271 | "source": [ 272 | "We can access the digits data in the same way as the iris data above. 
Let's plot a sample of the digits" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "collapsed": false, 278 | "input": [ 279 | "import pylab as pl\n", 280 | "\n", 281 | "# set up the figure\n", 282 | "fig = pl.figure(figsize=(8, 8)) # figure size in inches\n", 283 | "fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)\n", 284 | "\n", 285 | "# plot the digits: each image is 8x8 pixels\n", 286 | "for i in range(100):\n", 287 | " ax = fig.add_subplot(10, 10, i + 1, xticks=[], yticks=[])\n", 288 | " ax.imshow(digits.data[i].reshape((8, 8)), cmap=pl.cm.binary)\n", 289 | " \n", 290 | " # label the image with the target value\n", 291 | " ax.text(0, 7, str(digits.target[i]), bbox=dict(facecolor='white', edgecolor='none', pad=1))" 292 | ], 293 | "language": "python", 294 | "metadata": {}, 295 | "outputs": [] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "Notice that we are representing each two-dimensional array of pixels as a single vector. This **data representation** is a very important aspect of machine learning. All of the algorithms in scikit-learn accept data in a matrix format, of size ``[n_samples`` $\\times$ ``n_features]``.\n", 302 | "\n", 303 | "With the digits data, we saw above that ``n_samples = 1797``, and ``n_features = 64``: one integer-valued feature for each pixel." 304 | ] 305 | } 306 | ], 307 | "metadata": {} 308 | } 309 | ] 310 | } -------------------------------------------------------------------------------- /doc/notebooks/03_iris_dimensionality.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "03_iris_dimensionality" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "heading", 12 | "level": 1, 13 | "metadata": {}, 14 | "source": [ 15 | "Dimensionality Reduction and Visualization" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "Dimensionality reduction is the task of deriving a set of new\n", 23 | "artificial features that is smaller than the original feature\n", 24 | "set while retaining most of the variance of the original data.\n", 25 | "Here we'll use a common but powerful dimensionality reduction\n", 26 | "technique called Principal Component Analysis (PCA).\n", 27 | "We'll perform PCA on the iris dataset that we saw before:" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "collapsed": false, 33 | "input": [ 34 | "from sklearn.datasets import load_iris\n", 35 | "iris = load_iris()\n", 36 | "X = iris.data\n", 37 | "y = iris.target" 38 | ], 39 | "language": "python", 40 | "metadata": {}, 41 | "outputs": [] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "PCA is performed using linear combinations of the original features\n", 48 | "using a truncated Singular Value Decomposition of the matrix X so\n", 49 | "as to project the data onto a base of the top singular vectors.\n", 50 | "If the number of retained components is 2 or 3, PCA can be used\n", 51 | "to visualize the dataset:" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "collapsed": false, 57 | "input": [ 58 | "from sklearn.decomposition import PCA\n", 59 | "pca = PCA(n_components=2, whiten=True).fit(X)" 60 | ], 61 | "language": "python", 62 | "metadata": {}, 63 | "outputs": [] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "Once fitted, the pca model 
exposes the singular vectors in the components_ attribute:" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "collapsed": false, 75 | "input": [ 76 | "pca.components_ " 77 | ], 78 | "language": "python", 79 | "metadata": {}, 80 | "outputs": [] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "collapsed": false, 85 | "input": [ 86 | "pca.explained_variance_ratio_" 87 | ], 88 | "language": "python", 89 | "metadata": {}, 90 | "outputs": [] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "collapsed": false, 95 | "input": [ 96 | "pca.explained_variance_ratio_.sum()" 97 | ], 98 | "language": "python", 99 | "metadata": {}, 100 | "outputs": [] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "Let us project the iris dataset along those first two dimensions:" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "collapsed": false, 112 | "input": [ 113 | "X_pca = pca.transform(X)" 114 | ], 115 | "language": "python", 116 | "metadata": {}, 117 | "outputs": [] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "The dataset has been \u201cnormalized\u201d, which means that the data\n", 124 | "is now centered on both components with unit variance:" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "collapsed": false, 130 | "input": [ 131 | "import numpy as np\n", 132 | "np.round(X_pca.mean(axis=0), decimals=5)" 133 | ], 134 | "language": "python", 135 | "metadata": {}, 136 | "outputs": [] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "collapsed": false, 141 | "input": [ 142 | "np.round(X_pca.std(axis=0), decimals=5)" 143 | ], 144 | "language": "python", 145 | "metadata": {}, 146 | "outputs": [] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "Furthermore the samples components do no longer carry any linear correlation:" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "collapsed": false, 158 | "input": [ 159 | "np.round(np.corrcoef(X_pca.T), decimals=5)" 160 | ], 161 | "language": "python", 162 | "metadata": {}, 163 | "outputs": [] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "We can visualize the projection using pylab, but first\n", 170 | "let's make sure our ipython notebook is in pylab inline mode" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "collapsed": false, 176 | "input": [ 177 | "%pylab inline" 178 | ], 179 | "language": "python", 180 | "metadata": {}, 181 | "outputs": [] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "Now we can visualize the results using the following utility function:" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "collapsed": false, 193 | "input": [ 194 | "import pylab as pl\n", 195 | "from itertools import cycle\n", 196 | "\n", 197 | "def plot_PCA_2D(data, target, target_names):\n", 198 | " colors = cycle('rgbcmykw')\n", 199 | " target_ids = range(len(target_names))\n", 200 | " pl.figure()\n", 201 | " for i, c, label in zip(target_ids, colors, target_names):\n", 202 | " pl.scatter(data[target == i, 0], data[target == i, 1],\n", 203 | " c=c, label=label)\n", 204 | " pl.legend()" 205 | ], 206 | "language": "python", 207 | "metadata": {}, 208 | "outputs": [] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "Now calling this function for our data, we see the plot:" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "collapsed": false, 220 | "input": 
[ 221 | "plot_PCA_2D(X_pca, iris.target, iris.target_names)" 222 | ], 223 | "language": "python", 224 | "metadata": {}, 225 | "outputs": [] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "Note that this projection was determined *without* any information about the\n", 232 | "labels (represented by the colors): this is the sense in which the learning\n", 233 | "is unsupervised. Nevertheless, we see that the projection gives us insight\n", 234 | "into the distribution of the different flowers in parameter space: notably,\n", 235 | "*iris setosa* is much more distinct than the other two species." 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "Note also that the default implementation of PCA computes the SVD of the full\n", 243 | "data matrix, which is not scalable when both ``n_samples`` and\n", 244 | "``n_features`` are big (more that a few thousands).\n", 245 | "If you are interested in a number of components that is much\n", 246 | "smaller than both ``n_samples`` and ``n_features``, consider using\n", 247 | ":class:`sklearn.decomposition.RandomizedPCA` instead." 248 | ] 249 | }, 250 | { 251 | "cell_type": "heading", 252 | "level": 3, 253 | "metadata": {}, 254 | "source": [ 255 | "Exercise:" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "Repeat the above dimensionality reduction with\n", 263 | "``sklearn.decomposition.RandomizedPCA``.\n", 264 | "\n", 265 | "You can re-use the ``plot_PCA_2D`` function from above.\n", 266 | "Are the results similar to those from standard PCA?" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "collapsed": false, 272 | "input": [ 273 | "from sklearn.decomposition import RandomizedPCA\n", 274 | "#apply randomized PCA to the iris data as above, and plot the result." 275 | ], 276 | "language": "python", 277 | "metadata": {}, 278 | "outputs": [] 279 | } 280 | ], 281 | "metadata": {} 282 | } 283 | ] 284 | } -------------------------------------------------------------------------------- /doc/notebooks/04_iris_clustering.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "04_iris_clustering" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "heading", 12 | "level": 1, 13 | "metadata": {}, 14 | "source": [ 15 | "Clustering of Iris Data" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "Clustering is the task of gathering samples into groups of similar\n", 23 | "samples according to some predefined similarity or dissimilarity\n", 24 | "measure (such as the Euclidean distance).\n", 25 | "\n", 26 | "Let's re-use the results of the 2D PCA of the iris dataset in order to\n", 27 | "explore clustering. 
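The previous notebook described PCA as a truncated singular value decomposition of the centered data matrix; the short numpy sketch below makes that concrete. Component signs may differ from scikit-learn's output, and ``whiten=True`` additionally rescales each projected component to unit variance, which is not shown here:

    import numpy as np
    from sklearn.datasets import load_iris

    X = load_iris().data
    Xc = X - X.mean(axis=0)                 # PCA centers the features first
    U, S, Vt = np.linalg.svd(Xc, full_matrices=False)

    scores = np.dot(Xc, Vt[:2].T)           # projection onto the top 2 singular vectors
    var_ratio = S ** 2 / np.sum(S ** 2)     # compare with pca.explained_variance_ratio_
    print(var_ratio[:2], var_ratio[:2].sum())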
First we need to repeat some of the code from the\n", 28 | "previous notebook" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "collapsed": false, 34 | "input": [ 35 | "# make sure ipython inline mode is activated\n", 36 | "%pylab inline" 37 | ], 38 | "language": "python", 39 | "metadata": {}, 40 | "outputs": [] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "collapsed": false, 45 | "input": [ 46 | "# all of this is taken from the notebook '03_iris_dimensionality.ipynb' \n", 47 | "from sklearn.datasets import load_iris\n", 48 | "from sklearn.decomposition import PCA\n", 49 | "import pylab as pl\n", 50 | "from itertools import cycle\n", 51 | "\n", 52 | "iris = load_iris()\n", 53 | "X = iris.data\n", 54 | "y = iris.target\n", 55 | "\n", 56 | "pca = PCA(n_components=2, whiten=True).fit(X)\n", 57 | "X_pca = pca.transform(X)\n", 58 | "\n", 59 | "def plot_2D(data, target, target_names):\n", 60 | " colors = cycle('rgbcmykw')\n", 61 | " target_ids = range(len(target_names))\n", 62 | " pl.figure()\n", 63 | " for i, c, label in zip(target_ids, colors, target_names):\n", 64 | " pl.scatter(data[target == i, 0], data[target == i, 1],\n", 65 | " c=c, label=label)\n", 66 | " pl.legend()" 67 | ], 68 | "language": "python", 69 | "metadata": {}, 70 | "outputs": [] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "Now we will use one of the simplest clustering algorithms, K-means.\n", 77 | "This is an iterative algorithm which searches for three cluster\n", 78 | "centers such that the distance from each point to its cluster center is\n", 79 | "minimized." 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "collapsed": false, 85 | "input": [ 86 | "from sklearn.cluster import KMeans\n", 87 | "from numpy.random import RandomState\n", 88 | "rng = RandomState(42)\n", 89 | "\n", 90 | "kmeans = KMeans(n_clusters=3, random_state=rng).fit(X_pca)" 91 | ], 92 | "language": "python", 93 | "metadata": {}, 94 | "outputs": [] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "collapsed": false, 99 | "input": [ 100 | "import numpy as np\n", 101 | "np.round(kmeans.cluster_centers_, decimals=2)" 102 | ], 103 | "language": "python", 104 | "metadata": {}, 105 | "outputs": [] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "collapsed": false, 110 | "input": [ 111 | "kmeans.labels_[:10]" 112 | ], 113 | "language": "python", 114 | "metadata": {}, 115 | "outputs": [] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "collapsed": false, 120 | "input": [ 121 | "kmeans.labels_[-10:]" 122 | ], 123 | "language": "python", 124 | "metadata": {}, 125 | "outputs": [] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "The K-means algorithm has been used to infer cluster labels for the\n", 132 | "points. Let's call the ``plot_2D`` function again, but color the points\n", 133 | "based on the cluster labels rather than the iris species."
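Before plotting, it is also possible to quantify how well the inferred clusters line up with the true species labels. A minimal sketch, assuming the ``kmeans`` and ``iris`` objects from the cells above (and that ``adjusted_rand_score`` is available in your scikit-learn version)::

    from sklearn.metrics import adjusted_rand_score

    # the adjusted Rand index compares two labelings and does not depend on
    # how the cluster labels happen to be numbered: 1.0 is perfect agreement,
    # values near 0.0 indicate chance-level agreement
    adjusted_rand_score(iris.target, kmeans.labels_)

A score like this is also useful for the exercise below, where the clusters are re-learned on the full data matrix and compared to the species labels again.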
134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "collapsed": false, 139 | "input": [ 140 | "plot_2D(X_pca, kmeans.labels_, [\"c0\", \"c1\", \"c2\"])\n", 141 | "\n", 142 | "plot_2D(X_pca, iris.target, iris.target_names)" 143 | ], 144 | "language": "python", 145 | "metadata": {}, 146 | "outputs": [] 147 | }, 148 | { 149 | "cell_type": "heading", 150 | "level": 3, 151 | "metadata": {}, 152 | "source": [ 153 | "Exercise" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "Perform the K-Means cluster search again, but this time learn the\n", 161 | "clusters using the full data matrix ``X``, rather than the projected\n", 162 | "matrix ``X_pca``. Does this change the results? Do these labels\n", 163 | "look closer to the true labels?" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "collapsed": false, 169 | "input": [], 170 | "language": "python", 171 | "metadata": {}, 172 | "outputs": [] 173 | } 174 | ], 175 | "metadata": {} 176 | } 177 | ] 178 | } -------------------------------------------------------------------------------- /doc/notebooks/05_iris_crossval.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "05_iris_crossval" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "heading", 12 | "level": 1, 13 | "metadata": {}, 14 | "source": [ 15 | "Cross-Validation on the Iris Dataset" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "Here is an example of how to split the iris dataset into training and test sets." 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "Let's re-use the iris dataset\n", 30 | "in order to explore cross-validation. 
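The cells below shuffle and split the data by hand, which makes each step explicit. For reference, a roughly equivalent split can be obtained with the ``train_test_split`` helper, assuming it is available in your scikit-learn version (its import location and argument names have changed across releases)::

    from sklearn.cross_validation import train_test_split  # sklearn.model_selection in newer releases
    from sklearn.datasets import load_iris

    iris = load_iris()
    # hold out one third of the samples for testing, shuffling with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, test_size=1. / 3, random_state=42)

The manual version that follows does the same thing step by step, which makes it easier to see exactly what a train/test split involves.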
First we need to repeat\n", 31 | "some of the code from the previous notebook:" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "collapsed": false, 37 | "input": [ 38 | "# all of this is taken from the notebook '04_iris_clustering.ipynb'\n", 39 | "import numpy as np\n", 40 | "from sklearn.datasets import load_iris\n", 41 | "\n", 42 | "iris = load_iris()\n", 43 | "X = iris.data\n", 44 | "y = iris.target\n", 45 | "\n", 46 | "n_samples, n_features = iris.data.shape\n", 47 | "print n_samples" 48 | ], 49 | "language": "python", 50 | "metadata": {}, 51 | "outputs": [] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "First we need to shuffle the order of the samples and the\n", 58 | "target to ensure that all classes are well represented on\n", 59 | "both sides of the split:" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "collapsed": false, 65 | "input": [ 66 | "indices = np.arange(n_samples)\n", 67 | "indices[:10]" 68 | ], 69 | "language": "python", 70 | "metadata": {}, 71 | "outputs": [] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "collapsed": false, 76 | "input": [ 77 | "np.random.RandomState(42).shuffle(indices)\n", 78 | "indices[:10]" 79 | ], 80 | "language": "python", 81 | "metadata": {}, 82 | "outputs": [] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "collapsed": false, 87 | "input": [ 88 | "X = iris.data[indices]\n", 89 | "y = iris.target[indices]" 90 | ], 91 | "language": "python", 92 | "metadata": {}, 93 | "outputs": [] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "We can now split the data using a 2/3 - 1/3 ratio:" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "collapsed": false, 105 | "input": [ 106 | "split = (n_samples * 2) / 3\n", 107 | "\n", 108 | "X_train, X_test = X[:split], X[split:]\n", 109 | "y_train, y_test = y[:split], y[split:]\n", 110 | "\n", 111 | "X_train.shape" 112 | ], 113 | "language": "python", 114 | "metadata": {}, 115 | "outputs": [] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "collapsed": false, 120 | "input": [ 121 | "X_test.shape" 122 | ], 123 | "language": "python", 124 | "metadata": {}, 125 | "outputs": [] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "collapsed": false, 130 | "input": [ 131 | "y_train.shape" 132 | ], 133 | "language": "python", 134 | "metadata": {}, 135 | "outputs": [] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "collapsed": false, 140 | "input": [ 141 | "y_test.shape" 142 | ], 143 | "language": "python", 144 | "metadata": {}, 145 | "outputs": [] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "We can now re-train a new linear classifier on the training set only:" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "collapsed": false, 157 | "input": [ 158 | "from sklearn.svm import LinearSVC\n", 159 | "clf = LinearSVC().fit(X_train, y_train)" 160 | ], 161 | "language": "python", 162 | "metadata": {}, 163 | "outputs": [] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "To evaluate its quality we can compute the average number\n", 170 | "of correct classifications on the test set:" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "collapsed": false, 176 | "input": [ 177 | "np.mean(clf.predict(X_test) == y_test)" 178 | ], 179 | "language": "python", 180 | "metadata": {}, 181 | "outputs": [] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "This shows that the model has a 
predictive accurracy of 100%\n", 188 | "which means that the classification model was perfectly capable\n", 189 | "of generalizing what was learned from the training set to the\n", 190 | "test set: this is rarely so easy on real life datasets as we\n", 191 | "will see in the later sections." 192 | ] 193 | } 194 | ], 195 | "metadata": {} 196 | } 197 | ] 198 | } -------------------------------------------------------------------------------- /doc/notebooks/08_regression_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "08_regression_example" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "heading", 12 | "level": 1, 13 | "metadata": {}, 14 | "source": [ 15 | "Regression Example" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "As with the previous example, this notebook assumes that the current\n", 23 | "working directory is in the scikit-learn tutorial directory where\n", 24 | "the notebook is stored. In the folder\n", 25 | "\n", 26 | " ../data/sdss_photoz\n", 27 | "\n", 28 | "there is a script fetch_data.py which will download the colors of 400,000+ galaxies from the Sloan Digital Sky Survey. This script also includes a python implementation of the SQL query used to construct this data. This template can be modified to download more features if desired. Before executing the example below, run fetch_data.py to download the colors and redshifts.\n", 29 | "\n", 30 | "If you're using a different directory structure, then the DATA_HOME variable in the following script should be set accordingly." 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "collapsed": false, 36 | "input": [ 37 | "import os\n", 38 | "DATA_HOME = os.path.abspath('../data/sdss_photoz/')" 39 | ], 40 | "language": "python", 41 | "metadata": {}, 42 | "outputs": [] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "First we will load this data, shuffle it in preparation for later, and arrange the colors in an array of shape (n_samples, n_features):" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "collapsed": false, 54 | "input": [ 55 | "import numpy as np\n", 56 | "data = np.load(os.path.join(DATA_HOME,'sdss_photoz.npy'))" 57 | ], 58 | "language": "python", 59 | "metadata": {}, 60 | "outputs": [] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "The data is in a record array, as in the classification example" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "collapsed": false, 72 | "input": [ 73 | "print data.dtype.names" 74 | ], 75 | "language": "python", 76 | "metadata": {}, 77 | "outputs": [] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "Now we'll set up our data matrix ``X`` and redshift ``z``" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "collapsed": false, 89 | "input": [ 90 | "N = len(data)\n", 91 | "X = np.zeros((N, 4))\n", 92 | "X[:, 0] = data['u'] - data['g']\n", 93 | "X[:, 1] = data['g'] - data['r']\n", 94 | "X[:, 2] = data['r'] - data['i']\n", 95 | "X[:, 3] = data['i'] - data['z']\n", 96 | "z = data['redshift']" 97 | ], 98 | "language": "python", 99 | "metadata": {}, 100 | "outputs": [] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "Next we\u2019ll split the data into two samples: a training sample and a test sample which we\u2019ll use 
to evaluate our training:" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "collapsed": false, 112 | "input": [ 113 | "Ntrain = 3 * N / 4\n", 114 | "Xtrain = X[:Ntrain]\n", 115 | "ztrain = z[:Ntrain]\n", 116 | "Xtest = X[Ntrain:]\n", 117 | "ztest = z[Ntrain:]" 118 | ], 119 | "language": "python", 120 | "metadata": {}, 121 | "outputs": [] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "Now we\u2019ll use the scikit-learn ``DecisionTreeRegressor`` method\n", 128 | "to train a model and predict redshifts for the test set based\n", 129 | "on a 20-level decision tree:" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "collapsed": false, 135 | "input": [ 136 | "from sklearn.tree import DecisionTreeRegressor\n", 137 | "clf = DecisionTreeRegressor(max_depth=20)\n", 138 | "clf.fit(Xtrain, ztrain)\n", 139 | "zpred = clf.predict(Xtest)" 140 | ], 141 | "language": "python", 142 | "metadata": {}, 143 | "outputs": [] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "To judge the efficacy of prediction, we can compute the\n", 150 | "root-mean-square (RMS) difference between the true and predicted values:" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "collapsed": false, 156 | "input": [ 157 | "rms = np.sqrt(np.mean((ztest - zpred) ** 2))\n", 158 | "print rms" 159 | ], 160 | "language": "python", 161 | "metadata": {}, 162 | "outputs": [] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "Our RMS error is about 0.22. This is pretty good for such an unsophisticated\n", 169 | "learning algorithm, but better algorithms can improve on this. The biggest\n", 170 | "issue here is the catastrophic errors, where the predicted redshift is\n", 171 | "extremely far from the true value:" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "collapsed": false, 177 | "input": [ 178 | "print \"Number of test samples: \", len(ztest)\n", 179 | "print \"Number of catastrophic errors:\", np.sum(abs(ztest - zpred) > 1)" 180 | ], 181 | "language": "python", 182 | "metadata": {}, 183 | "outputs": [] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "About 1.5% of objects have redshift estimates which are off by greater than 1.\n", 190 | "This sort of error in redshift determination is very problematic for\n", 191 | "high-precision cosmological studies. 
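Since this catastrophic-error rate matters as much as the RMS error, it can be convenient to wrap it in a small helper, sketched here under the assumption that ``np``, ``ztest``, and ``zpred`` from the cells above are still defined (Exercise 2 later uses an analogous ``compute_outlier_fraction`` function)::

    def outlier_fraction(z_pred, z_true, cutoff=1.0):
        # fraction of objects whose predicted redshift misses the true value
        # by more than `cutoff`
        return np.mean(np.abs(z_pred - z_true) > cutoff)

    print outlier_fraction(zpred, ztest)  # about 0.015 (i.e. ~1.5%) for this model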
This can be seen in a scatter plot of\n", 192 | "the predicted redshift versus the true redshift for the test data:" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "collapsed": false, 198 | "input": [ 199 | "%pylab inline\n", 200 | "import pylab as pl" 201 | ], 202 | "language": "python", 203 | "metadata": {}, 204 | "outputs": [] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "collapsed": false, 209 | "input": [ 210 | "ax = pl.axes()\n", 211 | "\n", 212 | "pl.scatter(ztest, zpred, c='k', lw=0, s=4)\n", 213 | "axis_lim = np.array([0, 2.5])\n", 214 | "\n", 215 | "# plot the true redshift\n", 216 | "pl.plot(axis_lim, axis_lim, '--k')\n", 217 | "\n", 218 | "# plot +/- the rms\n", 219 | "pl.plot(axis_lim, axis_lim + rms, '--r') \n", 220 | "pl.plot(axis_lim, axis_lim - rms, '--r')\n", 221 | "pl.xlim(axis_lim)\n", 222 | "pl.ylim(axis_lim)\n", 223 | "\n", 224 | "pl.title('Photo-z: Decision Tree Regression')\n", 225 | "pl.xlabel(r'$\\mathrm{z_{true}}$', fontsize=14)\n", 226 | "pl.ylabel(r'$\\mathrm{z_{phot}}$', fontsize=14)" 227 | ], 228 | "language": "python", 229 | "metadata": {}, 230 | "outputs": [] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "The true and predicted redshifts of 102,798 SDSS galaxies, using a simple decision tree regressor. Notice the presence of catastrophic outliers: those galaxies whose predicted redshifts are extremely far from the true value.\n", 237 | "\n", 238 | "Later, in Exercise #2, we will attempt to improve on this by optimizing the parameters of the decision tree.\n", 239 | "\n", 240 | "In practice, the solutions to the photometric redshift problem can benefit from approaches that use physical intuition as well as machine learning tools. For example, some solutions involve the use of libraries of synthetic galaxy spectra which are known to be representative of the true galaxy distribution. This extra information can be used either directly, in a physically motivated analysis, or can be used to generate a larger suite of artificial training instances for a pure machine learning approach."
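As a preview of that exercise, the effect of the tree depth can be explored with a simple loop, sketched here assuming ``Xtrain``, ``ztrain``, ``Xtest``, and ``ztest`` from the cells above (a careful treatment would use a separate validation split rather than the test set)::

    from sklearn.tree import DecisionTreeRegressor

    for depth in [5, 10, 15, 20]:
        clf = DecisionTreeRegressor(max_depth=depth)
        clf.fit(Xtrain, ztrain)
        zpred_depth = clf.predict(Xtest)
        # RMS error for this depth
        print depth, np.sqrt(np.mean((ztest - zpred_depth) ** 2))

Plotting the training and validation errors against ``max_depth`` in this way leads directly to the bias/variance trade-off explored in Exercise 2.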
241 | ] 242 | } 243 | ], 244 | "metadata": {} 245 | } 246 | ] 247 | } -------------------------------------------------------------------------------- /doc/notebooks/nbconvert.py: -------------------------------------------------------------------------------- 1 | """Simple utility script for semi-gracefully downgrading v3 notebooks to v2""" 2 | 3 | import io 4 | import os 5 | import sys 6 | 7 | from IPython.nbformat import current 8 | 9 | def heading_to_md(cell): 10 | """turn heading cell into corresponding markdown""" 11 | cell.cell_type = "markdown" 12 | level = cell.pop('level', 1) 13 | cell.source = '#'*level + ' ' + cell.source 14 | 15 | def raw_to_md(cell): 16 | """let raw passthrough as markdown""" 17 | cell.cell_type = "markdown" 18 | 19 | def downgrade(nb): 20 | """downgrade a v3 notebook to v2""" 21 | if nb.nbformat != 3: 22 | return nb 23 | nb.nbformat = 2 24 | for ws in nb.worksheets: 25 | for cell in ws.cells: 26 | if cell.cell_type == 'heading': 27 | heading_to_md(cell) 28 | elif cell.cell_type == 'raw': 29 | raw_to_md(cell) 30 | return nb 31 | 32 | def downgrade_ipynb(fname): 33 | base, ext = os.path.splitext(fname) 34 | newname = base+'.v2'+ext 35 | print "downgrading %s -> %s" % (fname, newname) 36 | with io.open(fname, 'r', encoding='utf8') as f: 37 | nb = current.read(f, 'json') 38 | nb = downgrade(nb) 39 | with open(newname, 'w') as f: 40 | current.write(nb, f, 'json') 41 | 42 | if __name__ == '__main__': 43 | map(downgrade_ipynb, sys.argv[1:]) 44 | -------------------------------------------------------------------------------- /doc/notebooks/soln/01-01.py: -------------------------------------------------------------------------------- 1 | # 01-01.py 2 | clf_0 = gmm.GMM(1, 'diag') 3 | i0 = (y_train == 0) 4 | clf_0.fit(X_train[i0]) 5 | 6 | clf_1 = gmm.GMM(1, 'diag') 7 | i1 = (y_train == 1) 8 | clf_1.fit(X_train[i1]) 9 | -------------------------------------------------------------------------------- /doc/notebooks/soln/01-02.py: -------------------------------------------------------------------------------- 1 | # 01-02.py 2 | num0 = i0.sum() 3 | num1 = i1.sum() 4 | 5 | prior0 = num0 / float(Ntrain) 6 | prior1 = num1 / float(Ntrain) 7 | -------------------------------------------------------------------------------- /doc/notebooks/soln/01-03.py: -------------------------------------------------------------------------------- 1 | # 01-03.py 2 | logL = np.zeros((2, Ncrossval)) 3 | logL[0] = clf_0.score(X_crossval) + np.log(prior0) 4 | logL[1] = clf_1.score(X_crossval) + np.log(prior1) 5 | -------------------------------------------------------------------------------- /doc/notebooks/soln/01-04.py: -------------------------------------------------------------------------------- 1 | # 01-04.py 2 | def GMMBayes(X_test, n_components, covariance_type): 3 | clf_0 = gmm.GMM(n_components, covariance_type, random_state=0) 4 | i0 = (y_train == 0) 5 | clf_0.fit(X_train[i0]) 6 | 7 | clf_1 = gmm.GMM(n_components, covariance_type, random_state=0) 8 | i1 = (y_train == 1) 9 | clf_1.fit(X_train[i1]) 10 | 11 | logL = np.zeros((2, X_test.shape[0])) 12 | logL[0] = clf_0.score(X_test) + np.log(prior0) 13 | logL[1] = clf_1.score(X_test) + np.log(prior1) 14 | 15 | y_pred = np.argmax(logL, 0) 16 | 17 | return y_pred 18 | -------------------------------------------------------------------------------- /doc/notebooks/soln/01-05.py: -------------------------------------------------------------------------------- 1 | # 01-05.py 2 | y_pred_gmm = GMMBayes(X_test, 5, 'full') 3 | y_pred_gnb = 
gnb.predict(X_test) 4 | -------------------------------------------------------------------------------- /doc/notebooks/soln/02-01.py: -------------------------------------------------------------------------------- 1 | # 02-01.py 2 | for i, max_depth in enumerate(max_depth_array): 3 | # print progress update 4 | print '%i / %i' % (max_depth, max_depth_array[-1]) 5 | 6 | clf = DecisionTreeRegressor(max_depth=max_depth) 7 | clf.fit(X_train, y_train) 8 | 9 | y_train_pred = clf.predict(X_train) 10 | y_cv_pred = clf.predict(X_cv) 11 | 12 | train_error[i] = compute_rms_error(y_train_pred, y_train) 13 | cv_error[i] = compute_rms_error(y_cv_pred, y_cv) 14 | -------------------------------------------------------------------------------- /doc/notebooks/soln/02-02.py: -------------------------------------------------------------------------------- 1 | #02-02.py 2 | for i, n_samples in enumerate(n_samples_array): 3 | # print progress update 4 | print ' %i / %i' % (n_samples, Ntrain) 5 | 6 | clf = DecisionTreeRegressor(max_depth=max_depth) 7 | clf.fit(X_train[:n_samples], y_train[:n_samples]) 8 | 9 | y_train_pred = clf.predict(X_train[:n_samples]) 10 | y_cv_pred = clf.predict(X_cv) 11 | 12 | train_error_2[i] = compute_rms_error(y_train_pred, 13 | y_train[:n_samples]) 14 | cv_error_2[i] = compute_rms_error(y_cv_pred, y_cv) 15 | -------------------------------------------------------------------------------- /doc/notebooks/soln/02-03a.py: -------------------------------------------------------------------------------- 1 | #02-03a.py 2 | 3 | #------------------------------------------------------------ 4 | # first compute and plot the outlier fraction as a function 5 | # of max_depth 6 | max_depth_array = np.arange(1, 21) 7 | train_error = np.zeros(len(max_depth_array)) 8 | cv_error = np.zeros(len(max_depth_array)) 9 | 10 | for i, max_depth in enumerate(max_depth_array): 11 | # print progress update 12 | print '%i / %i' % (max_depth, max_depth_array[-1]) 13 | 14 | clf = DecisionTreeRegressor(max_depth=max_depth) 15 | clf.fit(X_train, y_train) 16 | 17 | y_train_pred = clf.predict(X_train) 18 | y_cv_pred = clf.predict(X_cv) 19 | 20 | train_error[i] = compute_outlier_fraction(y_train_pred, y_train) 21 | cv_error[i] = compute_outlier_fraction(y_cv_pred, y_cv) 22 | 23 | pl.figure() 24 | pl.plot(max_depth_array, cv_error, label='cross-val error') 25 | pl.plot(max_depth_array, train_error, label='training error') 26 | 27 | pl.legend(loc=0) 28 | pl.xlabel('max depth') 29 | pl.ylabel('error') 30 | 31 | # select the value of max_depth which led to the best results 32 | max_depth = max_depth_array[np.argmin(cv_error)] 33 | print "max_depth = %i" % max_depth 34 | -------------------------------------------------------------------------------- /doc/notebooks/soln/02-03b.py: -------------------------------------------------------------------------------- 1 | #02-03b.py 2 | 3 | #------------------------------------------------------------ 4 | # compute and plot the outlier fraction 5 | # as a function of number of samples 6 | n_samples_array = np.linspace(50, Ntrain, 20).astype(int) 7 | train_error_2 = np.zeros(n_samples_array.shape) 8 | cv_error_2 = np.zeros(n_samples_array.shape) 9 | 10 | for i, n_samples in enumerate(n_samples_array): 11 | # print progress update 12 | print ' %i / %i' % (n_samples, Ntrain) 13 | 14 | clf = DecisionTreeRegressor(max_depth=max_depth) 15 | clf.fit(X_train[:n_samples], y_train[:n_samples]) 16 | 17 | y_train_pred = clf.predict(X_train[:n_samples]) 18 | y_cv_pred = clf.predict(X_cv) 19 | 
20 | train_error_2[i] = compute_outlier_fraction(y_train_pred, 21 | y_train[:n_samples]) 22 | cv_error_2[i] = compute_outlier_fraction(y_cv_pred, y_cv) 23 | 24 | pl.figure() 25 | pl.plot(n_samples_array, cv_error_2, label='cross-val error') 26 | pl.plot(n_samples_array, train_error_2, label='training error') 27 | 28 | pl.legend(loc=0) 29 | pl.xlabel('number of samples') 30 | pl.ylabel('error') 31 | 32 | pl.title('max_depth = %s' % max_depth) 33 | -------------------------------------------------------------------------------- /doc/notebooks/soln/03-01.py: -------------------------------------------------------------------------------- 1 | #03-01.py 2 | X, y = preprocess(data, shuffle=False, n_samples=1000, normalization=None) 3 | 4 | from sklearn.manifold import LocallyLinearEmbedding 5 | lle = LocallyLinearEmbedding(n_neighbors=15, 6 | n_components=3, method='standard') 7 | X_proj = lle.fit_transform(X) 8 | 9 | three_component_plot(X_proj[:, 0], X_proj[:, 1], X_proj[:, 2], y, labels, trim_outliers=True) 10 | -------------------------------------------------------------------------------- /doc/notebooks/soln/03-02.py: -------------------------------------------------------------------------------- 1 | #03-02.py 2 | X, y = preprocess(data, shuffle=False, n_samples=1000, normalization=None) 3 | 4 | from sklearn.manifold import LocallyLinearEmbedding 5 | lle = LocallyLinearEmbedding(n_neighbors=15, 6 | n_components=3, method='modified') 7 | X_proj = lle.fit_transform(X) 8 | 9 | three_component_plot(X_proj[:, 0], X_proj[:, 1], X_proj[:, 2], y, labels, trim_outliers=True) 10 | -------------------------------------------------------------------------------- /doc/notebooks/soln/03-03.py: -------------------------------------------------------------------------------- 1 | #03-03.py 2 | X, y = preprocess(data, shuffle=False, n_samples=1000, normalization=None) 3 | 4 | from sklearn.manifold import Isomap 5 | iso = Isomap(n_neighbors=15, n_components=3) 6 | X_proj = iso.fit_transform(X) 7 | 8 | three_component_plot(X_proj[:, 0], X_proj[:, 1], X_proj[:, 2], y, labels, trim_outliers=True) 9 | -------------------------------------------------------------------------------- /doc/regression.rst: -------------------------------------------------------------------------------- 1 | .. _astronomy_regression: 2 | 3 | ============================================= 4 | Regression: Photometric Redshifts of Galaxies 5 | ============================================= 6 | 7 | Another important learning task in astronomy is the problem of determining 8 | `redshifts `_ of distant galaxies. 9 | In the current standard cosmological model, the 10 | universe began nearly 14 billion years ago, in an explosive event commonly 11 | known as the Big Bang. Since then, the very fabric of space has been 12 | `expanding `_, 13 | so that distant galaxies appear to be moving away from us at 14 | very high speeds. The uniformity of this expansion means that there is 15 | a relationship between the distance to a galaxy, and the speed that it 16 | appears to be receeding from us (this relationship is known as 17 | `Hubble's Law `_, named 18 | after Edwin Hubble). This recession speed leads to a shift 19 | in the frequency of photons, very similar to the more familiar 20 | `doppler shift `_ that 21 | causes the pitch of a siren to change as an emergency vehicle passes by. 22 | If a galaxy or star were 23 | moving toward us, its light would be shifted to higher frequencies, or 24 | `blue-shifted`. 
Because the universe is expanding away from us, distant 25 | galaxies appear to be `red-shifted`: their photons are shifted to lower 26 | frequencies. 27 | 28 | In cosmology, the redshift is measured with the parameter :math:`z`, defined 29 | in terms of the observed wavelength :math:`\lambda_{obs}` and the emitted 30 | wavelength :math:`\lambda_{em}`: 31 | 32 | .. math:: 33 | \lambda_{obs} = (1 + z)\lambda_{em} 34 | 35 | When a spectrum can be obtained, determining the redshift is rather 36 | straight-forward: if you can localize the spectral fingerprint of a common 37 | element, such as hydrogen, then the redshift can be computed using simple 38 | arithmetic. But similarly to the case of Star/Quasar classification, the 39 | task becomes much more difficult when only photometric observations are 40 | available. 41 | 42 | Because of the spectrum shift, an identical source at different redshifts 43 | will have a different color through each pair of filters. See the following 44 | figure: 45 | 46 | .. figure:: auto_examples/images/plot_sdss_filters_2.png 47 | :target: auto_examples/plot_sdss_filters.html 48 | :align: center 49 | :scale: 80% 50 | 51 | The spectrum of the star Vega (:math:`\alpha`-Lyr) at three different 52 | redshifts. The SDSS ugriz filters are shown in gray for reference. 53 | 54 | At redshift :math:`z=0.0`, the spectrum is bright in the `u` and `g` filters, 55 | but dim in the `i` and `z` filters. At redshift :math:`z=0.8`, the opposite 56 | is the case. This suggests the possibility of determining redshift from 57 | photometry alone. The situation is complicated by the fact that each 58 | individual source has unique spectral characteristics, but nevertheless, 59 | these `photometric redshifts` are often used in astronomical applications. 60 | 61 | Motivation: Dark Energy, Dark Matter, and the Fate of the Universe 62 | ------------------------------------------------------------------ 63 | The photometric redshift problem is very important. Future astronomical 64 | surveys hope to image trillions of very faint galaxies, and use this data 65 | to inform our view of the universe as a whole: its history, its geometry, 66 | and its fate. Obtaining an accurate estimate of the redshift to each of these 67 | galaxies is a pivotal part of this task. Because these surveys will image 68 | so many extremely faint galaxies, there is no possibility of obtaining a 69 | spectrum for each one. Thus sophisticated photometric redshift codes will 70 | be required to advance our understanding of the Universe, including more 71 | precisely understanding the nature of the dark energy that is currently 72 | accelerating the cosmic expansion. 73 | 74 | A Simple Method: Decision Tree Regression 75 | ----------------------------------------- 76 | 77 | .. currentmodule:: sklearn.tree 78 | 79 | .. note:: 80 | 81 | The information in this section is available in an interactive notebook 82 | :download:`08_regression_example.ipynb `, 83 | which can be viewed using `iPython notebook`_. An online static view can 84 | be seen `here `_. 85 | 86 | Here we'll take an extremely simple approach to the photometric redshift 87 | problem, using a decision tree. 88 | In the folder ``$TUTORIAL_HOME/data/sdss_photoz``, there is a script 89 | ``fetch_data.py`` which will download the colors of 400,000+ galaxies from 90 | the Sloan Digital Sky Survey. This script also includes a python 91 | implementation of the SQL query used to construct this data. 
This template 92 | can be modified to download more features if desired. 93 | Before executing the example below, run ``fetch_data.py`` 94 | to download the colors and redshifts. 95 | 96 | First we will load this data, shuffle it in preparation for later, and arrange 97 | the colors in an array of shape ``(n_samples, n_features)``:: 98 | 99 | >>> import numpy as np 100 | >>> data = np.load('data/sdss_photoz/sdss_photoz.npy') 101 | >>> N = len(data) 102 | >>> X = np.zeros((N, 4)) 103 | >>> X[:, 0] = data['u'] - data['g'] 104 | >>> X[:, 1] = data['g'] - data['r'] 105 | >>> X[:, 2] = data['r'] - data['i'] 106 | >>> X[:, 3] = data['i'] - data['z'] 107 | >>> z = data['redshift'] 108 | 109 | Next we'll split the data into two samples: a training sample and a test 110 | sample which we'll use to evaluate our training:: 111 | 112 | >>> Ntrain = 3 * N / 4 113 | >>> Xtrain = X[:Ntrain] 114 | >>> ztrain = z[:Ntrain] 115 | >>> Xtest = X[Ntrain:] 116 | >>> ztest = z[Ntrain:] 117 | 118 | Now we'll use the scikit-learn :class:`DecisionTreeRegressor` method to 119 | train a model and predict redshifts for the test set based on a 120 | 20-level decision tree:: 121 | 122 | >>> from sklearn.tree import DecisionTreeRegressor 123 | >>> clf = DecisionTreeRegressor(max_depth=20) 124 | >>> clf.fit(Xtrain, ztrain) 125 | >>> zpred = clf.predict(Xtest) 126 | 127 | To judge the efficacy of prediction, we can compute the root-mean-square 128 | difference between the true and predicted values:: 129 | 130 | >>> rms = np.sqrt(np.mean((ztest - zpred) ** 2)) 131 | >>> print rms 132 | 0.221409442926 133 | 134 | Our RMS error is about 0.22. This is pretty good for such an unsophisticated 135 | learning algorithm, but better algorithms can improve on this. The biggest 136 | issue here is the `catastrophic errors`, where the predicted redshift is 137 | extremely far from the true value:: 138 | 139 | >>> print len(ztest) 140 | 102798 141 | >>> print np.sum(abs(ztest - zpred) > 1) 142 | 1538 143 | 144 | About 1.5% of objects have redshift estimates which are off by greater than 145 | 1. This sort of error in redshift determination is very problematic for 146 | high-precision cosmological studies. This can be seen in a scatter plot of 147 | the predicted redshift versus the true redshift for the test data: 148 | 149 | .. figure:: auto_examples/images/plot_sdss_photoz_1.png 150 | :target: auto_examples/plot_sdss_photoz.html 151 | :align: center 152 | :scale: 80% 153 | 154 | The true and predicted redshifts of 102,798 SDSS galaxies, using a simple 155 | decision tree regressor. Notice the presence of catastrophic outliers: 156 | those galaxies whose predicted redshifts are extremely far from the true 157 | value. 158 | 159 | Later, in :ref:`Exercise #2 `, we will attempt 160 | to improve on this by optimizing the parameters of the decision tree. 161 | 162 | In practice, the solutions to the photometric redshift problem can benefit from 163 | approaches that use physical intuition as well as machine learning tools. 164 | For example, some solutions involve the use of libraries of synthetic 165 | galaxy spectra 166 | which are known to be representative of the true galaxy distribution. This 167 | extra information can be used either directly, in a physically motivated 168 | analysis, or can be used to generate a larger suite of artificial training 169 | instances for a pure machine learning approach. 170 | 171 | .. 
_`iPython notebook`: http://ipython.org/ipython-doc/stable/interactive/htmlnotebook.html 172 | -------------------------------------------------------------------------------- /doc/scikitlearn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astroML/sklearn_tutorial/0909361122d5b96379007516b55c6248afa86cfc/doc/scikitlearn.png -------------------------------------------------------------------------------- /doc/setup.rst: -------------------------------------------------------------------------------- 1 | .. _sklearn_tutorial_setup: 2 | 3 | =============================== 4 | Tutorial Setup and Installation 5 | =============================== 6 | 7 | .. topic:: Objectives 8 | 9 | At the end of this section, you will 10 | 11 | 1. Have scikit-learn and all the prerequisites and dependencies for 12 | this tutorial installed on your machine. 13 | 2. Download the source files and data required for this tutorial 14 | 15 | Python Prerequisites 16 | -------------------- 17 | 18 | This tutorial is based on scikit-learn, which has the following dependencies: 19 | 20 | - `numpy `_ : this is a python module which has powerful 21 | tools for the creation and manipulation of arrays. It is the foundation of 22 | most scientific computing packages in python 23 | 24 | - `scipy `_ : this is a python module which builds on 25 | numpy and provides fast implementations of many basic scientific algorithms. 26 | 27 | - `matplotlib `_ : this is a powerful 28 | package for generating plots, figures, and diagrams. Our main form of 29 | visual interaction with data and results depends on matplotlib. 30 | 31 | We will also make extensive use of `iPython `_, an 32 | interactive python interpreter. In particular, much of the interactive 33 | material requires `ipython notebook`_ functionality, 34 | which was introduced in ipython version 0.12. 35 | 36 | Installing scikit-learn and Dependencies 37 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 38 | 39 | Please refer to the `install page `_ for 40 | per-system instructions on installing scikit-learn. In addition to 41 | ``numpy``, ``scipy``, and ``scikit-learn``, this tutorial will assume that 42 | you have ``matplotlib`` and ``ipython`` installed as well. 43 | 44 | * Under **Debian or Ubuntu Linux** you should use:: 45 | 46 | % sudo apt-get install build-essential python-dev python-numpy \ 47 | python-numpy-dev python-scipy libatlas-dev g++ python-matplotlib \ 48 | ipython 49 | 50 | If you are under Ubuntu 12.04+, then you can use 51 | 52 | % sudo apt-get install ipython-notebook 53 | 54 | to install ipython notebook with all dependencies. 55 | 56 | * Under **MacOSX** you should probably use a scientific python distribution 57 | such as `Scipy Superpack`_ 58 | 59 | * Under **Windows** the `Python(x,y)`_ is probably your best bet to get a 60 | working numpy / scipy environment up and running. 61 | 62 | * Power-users may wish to install bleeding edge versions of these 63 | packages from the source. The source can be downloaded using 64 | ``git`` from the packages' respective `GitHub`_ repositories. 65 | 66 | Alternatively under Windows and MaxOSX you can use the EPD_ (Enthought 67 | Python Distribution) which is a (non-open source) packaging of the 68 | scientific python stack. 69 | 70 | .. note:: 71 | 72 | that to use `ipython notebook`_, you must install ``ipython`` version 73 | 0.12 and several other dependencies. Refer to the ipython documentation 74 | for details. 75 | 76 | .. 
_`Scipy Superpack`: http://fonnesbeck.github.com/ScipySuperpack/ 77 | .. _`Python(x,y)`: http://www.pythonxy.com/ 78 | .. _EPD: https://www.enthought.com/products/epd.php 79 | .. _GitHub: http://www.github.com 80 | .. _`ipython notebook`: http://ipython.org/ipython-doc/stable/interactive/htmlnotebook.html 81 | 82 | 83 | Tutorial Files 84 | -------------- 85 | The source code for the example files in the following pages is best 86 | accessed through cloning the scikit-learn repository using 87 | `git `_. Once ``git`` is installed the 88 | command to accomplish this is:: 89 | 90 | % git clone https://github.com/astroML/sklearn_tutorial 91 | 92 | This creates a directory called ``sklearn_tutorial`` and copies all 93 | the source files of this tutorial. Most of the relevant files are 94 | in the ``sklearn_tutorial/doc`` sub-directory. 95 | In what follows, this directory will be named ``$TUTORIAL_HOME``. It 96 | should contain the following folders: 97 | 98 | * ``data`` - folder to put the datasets used during the tutorial 99 | 100 | * ``skeletons`` - sample incomplete scripts for the exercices 101 | (these should be used only if ``ipython notebook`` is unavailable) 102 | 103 | * ``solutions`` - solutions of the exercices 104 | (these should be used only if ``ipython notebook`` is unavailable) 105 | 106 | * ``notebooks`` - ipython notebooks which provide an interactive interface 107 | to parts of this tutorial. These contain material which is not in the 108 | skeletons and solutions. 109 | 110 | If you are not going to use ipython notebook to run the examples, you 111 | can copy the skeletons into a new folder named ``workspace`` 112 | where you will edit your own files for the exercices while keeping 113 | the original skeletons intact:: 114 | 115 | % cp -r skeletons workspace 116 | 117 | 118 | Download the datasets 119 | --------------------- 120 | 121 | Machine Learning algorithms need data. Go to each ``$TUTORIAL_HOME/data/`` 122 | sub-folder and run the ``fetch_data.py`` script from there (after 123 | having read them first). This will download a dataset to the current 124 | directory. This tutorial has three such datasets; they will be used 125 | in the examples and exercises later on. 126 | 127 | To get all three datasets, run the following:: 128 | 129 | % cd $TUTORIAL_HOME/data/sdss_colors 130 | % python fetch_data.py 131 | 132 | % cd $TUTORIAL_HOME/data/sdss_photoz 133 | % python fetch_data.py 134 | 135 | % cd $TUTORIAL_HOME/data/sdss_spectra 136 | % python fetch_data.py 137 | -------------------------------------------------------------------------------- /doc/skeletons/exercise_01.py: -------------------------------------------------------------------------------- 1 | """ 2 | Astronomy Tutorial: exercise 1 3 | 4 | Classification of photometric sources 5 | 6 | usage: python exercise_01.py datadir 7 | 8 | - datadir is $TUTORIAL_DIR/data/sdss_colors 9 | This directory should contain the files: 10 | - sdssdr6_colors_class_train.npy 11 | - sdssdr6_colors_class.200000.npy 12 | 13 | Description: 14 | In the tutorial, we used a Naive Bayes Classifier to separate Quasars 15 | And Stars. In this exercise, we will extend this classification scheme 16 | using Gaussian Mixture Models. 17 | 18 | The Gaussian Naive Bayes method starts by fitting an N-dimensional gaussian 19 | distribution to each class of data. When a test point is evaluated, the 20 | relative log-likelihood from each distribution is used to predict the most 21 | likely value. 
We're going to extend this by fitting a sum of gaussians to 22 | each distribution. 23 | 24 | There are several places in this file with code to be filled-in as part of 25 | the exercise. Each of these is labeled TODO below. 26 | """ 27 | import os, sys 28 | import numpy as np 29 | import pylab as pl 30 | from sklearn.mixture import gmm 31 | from sklearn import metrics 32 | 33 | try: 34 | datadir = sys.argv[1] 35 | except: 36 | print __doc__ 37 | sys.exit() 38 | 39 | #---------------------------------------------------------------------- 40 | # Load data files 41 | train_data = np.load(os.path.join(datadir, 42 | 'sdssdr6_colors_class_train.npy')) 43 | test_data = np.load(os.path.join(datadir, 44 | 'sdssdr6_colors_class.200000.npy')) 45 | 46 | # set the number of training points: using all points leads to a very 47 | # long running time. We'll start with 10000 training points. This 48 | # can be increased if desired. 49 | Ntrain = 10000 50 | #Ntrain = len(train_data) 51 | 52 | np.random.seed(0) 53 | np.random.shuffle(train_data) 54 | train_data = train_data[:Ntrain] 55 | 56 | #---------------------------------------------------------------------- 57 | # Split training data into training and cross-validation sets 58 | N_crossval = Ntrain / 5 59 | train_data = train_data[:-N_crossval] 60 | crossval_data = train_data[-N_crossval:] 61 | 62 | #---------------------------------------------------------------------- 63 | # Set up data 64 | # 65 | X_train = np.zeros((train_data.size, 4), dtype=float) 66 | X_train[:, 0] = train_data['u-g'] 67 | X_train[:, 1] = train_data['g-r'] 68 | X_train[:, 2] = train_data['r-i'] 69 | X_train[:, 3] = train_data['i-z'] 70 | y_train = (train_data['redshift'] > 0).astype(int) 71 | Ntrain = len(y_train) 72 | 73 | X_crossval = np.zeros((crossval_data.size, 4), dtype=float) 74 | X_crossval[:, 0] = crossval_data['u-g'] 75 | X_crossval[:, 1] = crossval_data['g-r'] 76 | X_crossval[:, 2] = crossval_data['r-i'] 77 | X_crossval[:, 3] = crossval_data['i-z'] 78 | y_crossval = (crossval_data['redshift'] > 0).astype(int) 79 | Ncrossval = len(y_crossval) 80 | 81 | #====================================================================== 82 | # Recreating Gaussian Naive Bayes 83 | # 84 | # Here we will use Gaussian Mixture Models to duplicate our Gaussian 85 | # Naive Bayes results from earlier. You'll create two sklearn.gmm.GMM() 86 | # classifier instances, named `clf_0` and `clf_1`. Each should be 87 | # initialized with a single component, and diagonal covariance. 88 | # (hint: look at the doc string for sklearn.gmm.GMM to see how to set 89 | # this up). The results should be compared to Gaussian Naive Bayes 90 | # to check if they're correct. 91 | # 92 | # Objects to create: 93 | # - clf_0 : trained on the portion of the training data with y == 0 94 | # - clf_1 : trained on the portion of the training data with y == 1 95 | 96 | # TODO: compute clf_0, clf_1 97 | 98 | # next we must construct the prior. The prior is the fraction of training 99 | # points of each type. 100 | # 101 | # variables to compute: 102 | # - prior0 : fraction of training points with y == 0 103 | # - prior1 : fraction of training points with y == 1 104 | 105 | # TODO: compute prior0, prior1 106 | 107 | # Now we use the prior and the classifiation to compute the log-likelihoods 108 | # of the cross-validation points. The log likelihood is given by 109 | # 110 | # logL(x) = clf.score(x) + log(prior) 111 | # 112 | # You can use the function np.log() to compute the logarithm of the prior. 
113 | # variables to compute: 114 | # logL : array, shape = (2, Ncrossval) 115 | # logL[0] is the log-likelihood for y == 0 116 | # logL[1] is the log-likelihood for y == 1 117 | logL = None 118 | 119 | # TODO: compute logL 120 | 121 | # the predicted value for each sample is the index with the largest 122 | # log-likelihood. 123 | y_pred = np.argmax(logL, 0) 124 | 125 | # now we print the results. We'll use the built-in classification 126 | # report function in sklearn.metrics. This computes the precision, 127 | # recall, and f1-score for each class. 128 | 129 | print "------------------------------------------------------------" 130 | print "One-component Gaussian Mixture:" 131 | print " results for cross-validation set:" 132 | print metrics.classification_report(y_crossval, y_pred, 133 | target_names=['stars', 'QSOs']) 134 | 135 | 136 | 137 | #---------------------------------------------------------------------- 138 | # Run Gaussian Naive Bayes to double-check that our results are correct. 139 | # Because of rounding errors, it will not be exact, but the results should 140 | # be very close. 141 | from sklearn.naive_bayes import GaussianNB 142 | gnb = GaussianNB() 143 | gnb.fit(X_train, y_train) 144 | y_pred = gnb.predict(X_crossval) 145 | 146 | print "------------------------------------------------------------" 147 | print "Gaussian Naive Bayes" 148 | print " results for cross-validation set:" 149 | print " (results should be within ~0.01 of above results)" 150 | print metrics.classification_report(y_crossval, y_pred, 151 | target_names=['stars', 'QSOs']) 152 | 153 | #====================================================================== 154 | # Parameter optimization: 155 | # 156 | # Now take some time to experiment with the covariance type and the 157 | # number of components, to see if you can optimize the F1 score 158 | # 159 | # Note that for a large number of components, the fit can take a long 160 | # time, and will be dependent on the starting position. Use the 161 | # documentation string of GMM to determine the options for covariance. 162 | # 163 | # It may be helpful to use only a subset of the training data while 164 | # experimenting with these parameter values. This is called 165 | # "Meta-parameter optimization". It can be accomplished automatically, 166 | # but here we are doing it by hand for learning purposes. 167 | y_pred = None 168 | 169 | # TODO: compute y_pred for cross-validation data 170 | 171 | print "------------------------------------------------------------" 172 | print "GMM with tweaked parameters:" 173 | print " results for cross-validation set" 174 | print metrics.classification_report(y_crossval, y_pred, 175 | target_names=['stars', 'QSOs']) 176 | 177 | #---------------------------------------------------------------------- 178 | # Test Data 179 | # once you have maximized the cross-validation, you can apply the estimator 180 | # to your test data, and check how it compares to the predicted results 181 | # from the researcher who compiled it. 182 | 183 | X_test = np.zeros((test_data.size, 4), dtype=float) 184 | X_test[:, 0] = test_data['u-g'] 185 | X_test[:, 1] = test_data['g-r'] 186 | X_test[:, 2] = test_data['r-i'] 187 | X_test[:, 3] = test_data['i-z'] 188 | y_pred_literature = (test_data['label'] == 0).astype(int) 189 | Ntest = len(y_pred_literature) 190 | 191 | # here you should compute y_pred for the test data, using the classifiers 192 | # clf_0 and clf_1 which you already trained above. 
193 | 194 | y_pred = None 195 | 196 | # TODO: compute y_pred for test data 197 | 198 | print "------------------------------------------------------------" 199 | print "Comparison of current results with published results" 200 | print " results for test set" 201 | print " (treating published results as the 'true' result)" 202 | print metrics.classification_report(y_pred_literature, y_pred, 203 | target_names=['stars', 'QSOs']) 204 | -------------------------------------------------------------------------------- /doc/skeletons/exercise_02.py: -------------------------------------------------------------------------------- 1 | """ 2 | Astronomy Tutorial: exercise 2 3 | 4 | Photometric redshift determination 5 | 6 | usage: python exercise_02.py datadir 7 | 8 | - datadir is $TUTORIAL_DIR/data/sdss_photoz 9 | This directory should contain the files: 10 | - sdss_photoz.npy 11 | 12 | Here we will take a closer look at the photometric redshift problem discussed 13 | in section 5 of the tutorial. Using the decision tree classifier, we'll take 14 | a look at the 4-color observations of just over 400,000 points. 15 | 16 | The point of this exercise is to answer the question: how can we get the rms 17 | error down to below 0.1? Would it be a better use of telescope time to 18 | observe more objects, or to observe additional features of the objects 19 | in the data set? We'll use the techniques discussed in section 3 of the 20 | tutorial. 21 | """ 22 | import os, sys 23 | import numpy as np 24 | import pylab as pl 25 | 26 | from sklearn.tree import DecisionTreeRegressor 27 | from sklearn import metrics 28 | 29 | try: 30 | datadir = sys.argv[1] 31 | except: 32 | print __doc__ 33 | sys.exit() 34 | 35 | def compute_rms_error(y_pred, y_true): 36 | """Compute the rms error between the arrays y_pred and y_true""" 37 | return np.sqrt(metrics.mean_squared_error(y_pred, y_true)) 38 | 39 | def compute_outlier_fraction(y_pred, y_true, cutoff=0.2): 40 | """Compute the outlier rate between the arrays y_pred and y_true""" 41 | return np.sum((abs(y_pred - y_true) > cutoff)) * 1. / len(y_pred) 42 | 43 | #------------------------------------------------------------ 44 | # load data and compute colors 45 | data = np.load(os.path.join(datadir, 'sdss_photoz.npy')) 46 | 47 | # here we'll truncate the data to 50,000 points. This will allow the code 48 | # below to be run quickly while it's being written. When you're satisfied 49 | # that the code is ready to go, you can comment out this line. 50 | data = data[:50000] 51 | 52 | print '%i points' % data.shape[0] 53 | u, g, r, i, z = [data[f] for f in 'ugriz'] 54 | 55 | X = np.zeros((len(data), 4)) 56 | X[:, 0] = u - g 57 | X[:, 1] = g - r 58 | X[:, 2] = r - i 59 | X[:, 3] = i - z 60 | 61 | y = data['redshift'] 62 | 63 | #------------------------------------------------------------ 64 | # divide into training, cross-validation, and test samples 65 | Ntot = len(y) 66 | 67 | Ncv = Ntot / 5 68 | Ntest = Ntot / 5 69 | Ntrain = Ntot - Ncv - Ntest 70 | 71 | X_train = X[:Ntrain] 72 | y_train = y[:Ntrain] 73 | 74 | X_cv = X[Ntrain:Ntrain + Ncv] 75 | y_cv = y[Ntrain:Ntrain + Ncv] 76 | 77 | X_test = X[Ntrain + Ncv:] 78 | y_test = y[Ntrain + Ncv:] 79 | 80 | #------------------------------------------------------------ 81 | # plot the Decision Tree error as a function of max_depth parameter 82 | # 83 | # This is the first main part of the exercise. This is photometric 84 | # redshift determination using DecisionTreeRegressor. 
Here you'll plot 85 | # the training error and cross-validation error as a function of the 86 | # meta-parameter 'max_depth'. 87 | # 88 | # You will create three arrays: max_depth_array, train_error, and cv_error. 89 | # Use at least 10 different values of max_depth, and compute the training 90 | # and cross-validation error associated with each of them. 91 | # 92 | # note that the error can be computed with the function compute_rms_error() 93 | 94 | max_depth_array = [] 95 | train_error = [] 96 | cv_error = [] 97 | 98 | # TODO: compute the arrays max_depth_array, train_error, and cv_error 99 | 100 | pl.figure() 101 | pl.plot(max_depth_array, cv_error, label='cross-val error') 102 | pl.plot(max_depth_array, train_error, label='training error') 103 | 104 | pl.legend() 105 | pl.xlabel('max depth') 106 | pl.ylabel('error') 107 | 108 | # select the value of max_depth which led to the best results 109 | max_depth = max_depth_array[np.argmin(cv_error)] 110 | print "max_depth = %i" % max_depth 111 | 112 | #------------------------------------------------------------ 113 | # plot the Decision Tree error as a function of number of samples 114 | # 115 | # This is the second main part of the exercise. Here you'll plot the 116 | # training error and cross-validation error as a function of the 117 | # number of training samples. 118 | # 119 | # You will create three arrays: n_samples_array, train_error, and cv_error. 120 | # Use at least 40 different values of n_samples, and compute the training 121 | # and cross-validation error associated with each of them. 122 | # 123 | # Make sure that when computing the training error for each number of 124 | # samples, you use the same samples that the model was trained on. 125 | 126 | n_samples_array = [] 127 | train_error = [] 128 | cv_error = [] 129 | 130 | # TODO: compute the arrays n_samples_array, train_error, and cv_error 131 | 132 | pl.figure() 133 | pl.plot(n_samples_array, cv_error, label='cross-val error') 134 | pl.plot(n_samples_array, train_error, label='training error') 135 | 136 | pl.legend() 137 | pl.xlabel('number of samples') 138 | pl.ylabel('error') 139 | 140 | #---------------------------------------------------------------------- 141 | # Use the whole dataset: 142 | # If you have been running your code on only a part of the dataset, 143 | # now that you have it working, you can run it on the full dataset 144 | # (note: this will take a long time to execute!) You can do this by 145 | # commenting out the line 146 | # data = data[:50000] 147 | # above. How does this change the results? 148 | 149 | 150 | #------------------------------------------------------------ 151 | # Catastrophic Outliers 152 | # Though the rms error is one useful measure of the performance of an 153 | # algorithm, astronomers are often more interested in reducing the 154 | # 'catastrophic outlier' rate. Catastrophic outliers are points which 155 | # are given redshifts very far from the true value. For accuracy of 156 | # cosmological results, this is often more important than the overall 157 | # rms error. 158 | # 159 | # Here, you can re-implement te above tasks, plotting the catastrophic 160 | # outlier rate as a function of the max_depth parameter, and as a function 161 | # of the number of training points. This can be accomplished either by 162 | # copying and pasting the above code here, or by modifying the above code. 
163 | # 164 | # To compute the catastrophic error rate, you can use the function 165 | # compute_outlier_fraction() 166 | 167 | # TODO: repeat the above two plots using catastrophic error rate 168 | 169 | #---------------------------------------------------------------------- 170 | # Analyze the results 171 | # 172 | # Compare your results to the discussion of bias and variance in section 173 | # 3. How do you think these results could be improved? Is it better to 174 | # spend telescope time increasing the size of the training set, or would 175 | # it be better to measure more features of the objects we already have? 176 | # Does this recommendation change if the astronomer is interested in 177 | # minimizing the number of catastrophic outliers rather than the rms error? 178 | 179 | pl.show() 180 | -------------------------------------------------------------------------------- /doc/skeletons/exercise_03.py: -------------------------------------------------------------------------------- 1 | """ 2 | Astronomy Tutorial: exercise 3 3 | 4 | Dimensionality reduction of stellar spectra 5 | 6 | Usage: python exercise_03.py datadir [-m method] [-k n_neighbors] 7 | [-n norm_type] [-N n_samples] 8 | [-s] 9 | 10 | - datadir is $TUTORIAL_DIR/data/sdss_spectra 11 | This directory should contain the file spec4000_corrected.npz 12 | 13 | - method is one of [pca | lle | mlle | isomap]. If not specified, 14 | PCA will be performed 15 | 16 | - n_neighbors is an integer number of neighbors to use with manifold methods 17 | 18 | - norm_type is one of [none | l1 | l2]. It specifies how the data should 19 | be normalized. 20 | 21 | - n_samples is the number of samples used for the projection. Only 1000 22 | of the 4000 samples are used by default. 23 | 24 | - specifying -s shuffles the data. This can help test for stability of 25 | the reconstruction. 26 | 27 | Description: 28 | In this tutorial, we explore manifold learning techniques to visualize 4000 29 | SDSS spectra. This is a much more exploratory exercise than the previous 30 | two. The goal is to determine how to best visualize this high-dimensional 31 | space. You will implement PCA, LLE, Modified LLE, and Isomap, for various 32 | data normalizations. The goal is to find the best visualization of the 33 | data, where "best" in this case is a qualitative measure of how well the 34 | different classes of points are separated in the projected space. 35 | 36 | To make experimentation more streamlined, each of these choices can be set with the command-line options described above. 37 | 38 | There are several places in this file with code to be filled-in as part of 39 | the exercise. Each of these is labeled TODO below. 
40 | """ 41 | 42 | import os, sys 43 | import numpy as np 44 | 45 | import pylab as pl 46 | from matplotlib import ticker 47 | 48 | from sklearn import preprocessing 49 | from sklearn.decomposition import RandomizedPCA 50 | from sklearn.manifold import LocallyLinearEmbedding, Isomap 51 | 52 | #---------------------------------------------------------------------- 53 | # set up command-line option parser 54 | from optparse import OptionParser 55 | parser = OptionParser(usage=__doc__, 56 | version="%prog 1.0") 57 | parser.add_option("-m", "--method", 58 | dest="method", 59 | default='pca', 60 | help="Specify method to use: [pca | lle | mlle | isomap]") 61 | 62 | parser.add_option("-k", "--neighbors", 63 | dest="n_neighbors", 64 | type="int", 65 | default=15, 66 | help='Specify number of neighbors for manifold learning') 67 | 68 | parser.add_option("-N", "--normalization", 69 | dest="normalization", 70 | default="none", 71 | help="Specify normalization: [none | l1 | l2]") 72 | 73 | parser.add_option("-n", "--n_samples", 74 | dest="n_samples", 75 | type="int", 76 | default=1000, 77 | help="Specify number of samples to use, up to 4000 (default 1000)") 78 | 79 | parser.add_option("-s", "--shuffle", 80 | dest="shuffle", 81 | action="store_true", 82 | default=False, 83 | help="shuffle the data") 84 | 85 | 86 | options, args = parser.parse_args() 87 | 88 | if len(args) == 0: 89 | parser.error("Must specify a data directory") 90 | elif len(args) > 1: 91 | parser.error("Must specify a single data directory") 92 | 93 | datadir = args[0] 94 | 95 | print "data directory: %s" % datadir 96 | print " method = %s" % options.method 97 | print " n_neighbors = %i" % options.n_neighbors 98 | print " normalization = %s" % options.normalization 99 | print " n_samples: %i" % options.n_samples 100 | print " shuffle: %s" % options.shuffle 101 | 102 | 103 | def three_component_plot(c1, c2, c3, color, labels): 104 | pl.figure(figsize=(8,8)) 105 | kwargs = dict(s=4, lw=0, c=color, vmin=2, vmax=6) 106 | ax1 = pl.subplot(221) 107 | pl.scatter(c1, c2, **kwargs) 108 | pl.ylabel('component 2') 109 | 110 | ax2 = pl.subplot(223, sharex=ax1) 111 | pl.scatter(c1, c3, **kwargs) 112 | pl.xlabel('component 1') 113 | pl.ylabel('component 3') 114 | 115 | ax3 = pl.subplot(224, sharey=ax2) 116 | pl.scatter(c2, c3, **kwargs) 117 | pl.xlabel('component 2') 118 | 119 | for ax in (ax1, ax2, ax3): 120 | ax.xaxis.set_major_formatter(ticker.NullFormatter()) 121 | ax.yaxis.set_major_formatter(ticker.NullFormatter()) 122 | 123 | pl.subplots_adjust(hspace=0.05, wspace=0.05) 124 | 125 | format = ticker.FuncFormatter(lambda i, *args: labels[i]) 126 | pl.colorbar(ticks = range(2, 7), format=format, 127 | cax = pl.axes((0.52, 0.51, 0.02, 0.39))) 128 | pl.clim(1.5, 6.5) 129 | 130 | 131 | #---------------------------------------------------------------------- 132 | # Load data files 133 | data = np.load(os.path.join(datadir, 'spec4000_corrected.npz')) 134 | 135 | X = data['X'] 136 | y = data['y'] 137 | labels = data['labels'] 138 | 139 | if options.shuffle: 140 | i = np.arange(y.shape[0], dtype=int) 141 | np.random.shuffle(i) 142 | X = X[i] 143 | y = y[i] 144 | 145 | #---------------------------------------------------------------------- 146 | # truncate the data for experimentation 147 | # 148 | # There are 4000 points, which can take a long time to run. By default, 149 | # it is truncated to 1000 samples. This can be changed using the -n 150 | # command-line argument. 
151 | 152 | X = X[:options.n_samples] 153 | y = y[:options.n_samples] 154 | 155 | #---------------------------------------------------------------------- 156 | # Normalization: 157 | # 158 | # The results of the dimensionality reduction can depend heavily on the 159 | # data normalization. These can be commented or un-commented to try 160 | # l1 or l2 normalization. 161 | 162 | if options.normalization.lower() == 'none': 163 | pass 164 | elif options.normalization.lower() == 'l2': 165 | X = preprocessing.normalize(X, 'l2') 166 | elif options.normalization.lower() == 'l1': 167 | X = preprocessing.normalize(X, 'l1') 168 | else: 169 | raise ValueError("Unrecognized normalization: '%s'" % options.normalization) 170 | 171 | #====================================================================== 172 | # TODO: compute X_proj for each method. 173 | # In each of the below cases, you should compute a projection of the 174 | # data and store that projection in the matrix X_proj. 175 | # X_proj should have the same number of rows as X, and should have 176 | # at least 3 features. 177 | 178 | X_proj = None 179 | 180 | if options.method == 'pca': 181 | print "Performing PCA" 182 | # TODO: compute a RandomizedPCA projection of X with n_components >= 3 183 | 184 | elif options.method == 'lle': 185 | print "Performing LLE" 186 | # TODO: compute LLE on X with method='standard', and out_dim >= 3 187 | 188 | 189 | elif options.method == 'mlle': 190 | print "Performing MLLE" 191 | # TODO: compute LLE on X with method='modified' and out_dim >= 3 192 | 193 | elif options.method == 'isomap': 194 | print "Performing Isomap" 195 | # TODO: compute Isomap on X with out_dim >= 3 196 | 197 | else: 198 | raise ValueError("Unrecognized method: '%s'" % options.method) 199 | 200 | three_component_plot(X_proj[:, 0], X_proj[:, 1], X_proj[:, 2], y, labels) 201 | pl.show() 202 | 203 | -------------------------------------------------------------------------------- /doc/solutions/exercise_01.py: -------------------------------------------------------------------------------- 1 | """ 2 | Astronomy Tutorial: exercise 1 3 | 4 | Classification of photometric sources 5 | 6 | usage: python exercise_01.py datadir 7 | 8 | - datadir is $TUTORIAL_DIR/data/sdss_colors 9 | This directory should contain the files: 10 | - sdssdr6_colors_class_train.npy 11 | - sdssdr6_colors_class.200000.npy 12 | 13 | Description: 14 | In the tutorial, we used a Naive Bayes Classifier to separate Quasars 15 | And Stars. In this exercise, we will extend this classification scheme 16 | using Gaussian Mixture Models. 17 | 18 | The Gaussian Naive Bayes method starts by fitting an N-dimensional gaussian 19 | distribution to each class of data. When a test point is evaluated, the 20 | relative log-likelihood from each distribution is used to predict the most 21 | likely value. We're going to extend this by fitting a sum of gaussians to 22 | each distribution. 23 | 24 | There are several places in this file with code to be filled-in as part of 25 | the exercise. Each of these is labeled TODO below. 
26 | """ 27 | import os, sys 28 | import numpy as np 29 | import pylab as pl 30 | from sklearn.mixture import gmm 31 | from sklearn import metrics 32 | 33 | try: 34 | datadir = sys.argv[1] 35 | except: 36 | print __doc__ 37 | sys.exit() 38 | 39 | #---------------------------------------------------------------------- 40 | # Load data files 41 | train_data = np.load(os.path.join(datadir, 42 | 'sdssdr6_colors_class_train.npy')) 43 | test_data = np.load(os.path.join(datadir, 44 | 'sdssdr6_colors_class.200000.npy')) 45 | 46 | # set the number of training points: using all points leads to a very 47 | # long running time. We'll start with 10000 training points. This 48 | # can be increased if desired. 49 | Ntrain = 10000 50 | #Ntrain = len(train_data) 51 | 52 | np.random.seed(0) 53 | np.random.shuffle(train_data) 54 | train_data = train_data[:Ntrain] 55 | 56 | #---------------------------------------------------------------------- 57 | # Split training data into training and cross-validation sets 58 | N_crossval = Ntrain / 5 59 | train_data = train_data[:-N_crossval] 60 | crossval_data = train_data[-N_crossval:] 61 | 62 | #---------------------------------------------------------------------- 63 | # Set up data 64 | # 65 | X_train = np.zeros((train_data.size, 4), dtype=float) 66 | X_train[:, 0] = train_data['u-g'] 67 | X_train[:, 1] = train_data['g-r'] 68 | X_train[:, 2] = train_data['r-i'] 69 | X_train[:, 3] = train_data['i-z'] 70 | y_train = (train_data['redshift'] > 0).astype(int) 71 | Ntrain = len(y_train) 72 | 73 | X_crossval = np.zeros((crossval_data.size, 4), dtype=float) 74 | X_crossval[:, 0] = crossval_data['u-g'] 75 | X_crossval[:, 1] = crossval_data['g-r'] 76 | X_crossval[:, 2] = crossval_data['r-i'] 77 | X_crossval[:, 3] = crossval_data['i-z'] 78 | y_crossval = (crossval_data['redshift'] > 0).astype(int) 79 | Ncrossval = len(y_crossval) 80 | 81 | #====================================================================== 82 | # Recreating Gaussian Naive Bayes 83 | # 84 | # Here we will use Gaussian Mixture Models to duplicate our Gaussian 85 | # Naive Bayes results from earlier. You'll create two sklearn.gmm.GMM() 86 | # classifier instances, named `clf_0` and `clf_1`. Each should be 87 | # initialized with a single component, and diagonal covariance. 88 | # (hint: look at the doc string for sklearn.gmm.GMM to see how to set 89 | # this up). The results should be compared to Gaussian Naive Bayes 90 | # to check if they're correct. 91 | # 92 | # Objects to create: 93 | # - clf_0 : trained on the portion of the training data with y == 0 94 | # - clf_1 : trained on the portion of the training data with y == 1 95 | 96 | #{{{ compute clf_0, clf_1 97 | clf_0 = gmm.GMM(1, 'diag') 98 | i0 = (y_train == 0) 99 | clf_0.fit(X_train[i0]) 100 | 101 | clf_1 = gmm.GMM(1, 'diag') 102 | i1 = (y_train == 1) 103 | clf_1.fit(X_train[i1]) 104 | #}}} 105 | 106 | # next we must construct the prior. The prior is the fraction of training 107 | # points of each type. 108 | # 109 | # variables to compute: 110 | # - prior0 : fraction of training points with y == 0 111 | # - prior1 : fraction of training points with y == 1 112 | 113 | #{{{ compute prior0, prior1 114 | num0 = i0.sum() 115 | num1 = i1.sum() 116 | 117 | prior0 = num0 / float(Ntrain) 118 | prior1 = num1 / float(Ntrain) 119 | #}}} 120 | 121 | # Now we use the prior and the classifiation to compute the log-likelihoods 122 | # of the cross-validation points. 
The log likelihood is given by 123 | # 124 | # logL(x) = clf.score(x) + log(prior) 125 | # 126 | # You can use the function np.log() to compute the logarithm of the prior. 127 | # variables to compute: 128 | # logL : array, shape = (2, Ncrossval) 129 | # logL[0] is the log-likelihood for y == 0 130 | # logL[1] is the log-likelihood for y == 1 131 | logL = None 132 | 133 | #{{{ compute logL 134 | logL = np.zeros((2, Ncrossval)) 135 | logL[0] = clf_0.score(X_crossval) + np.log(prior0) 136 | logL[1] = clf_1.score(X_crossval) + np.log(prior1) 137 | #}}} 138 | 139 | # the predicted value for each sample is the index with the largest 140 | # log-likelihood. 141 | y_pred = np.argmax(logL, 0) 142 | 143 | # now we print the results. We'll use the built-in classification 144 | # report function in sklearn.metrics. This computes the precision, 145 | # recall, and f1-score for each class. 146 | 147 | print "------------------------------------------------------------" 148 | print "One-component Gaussian Mixture:" 149 | print " results for cross-validation set:" 150 | print metrics.classification_report(y_crossval, y_pred, 151 | target_names=['stars', 'QSOs']) 152 | 153 | 154 | 155 | #---------------------------------------------------------------------- 156 | # Run Gaussian Naive Bayes to double-check that our results are correct. 157 | # Because of rounding errors, it will not be exact, but the results should 158 | # be very close. 159 | from sklearn.naive_bayes import GaussianNB 160 | gnb = GaussianNB() 161 | gnb.fit(X_train, y_train) 162 | y_pred = gnb.predict(X_crossval) 163 | 164 | print "------------------------------------------------------------" 165 | print "Gaussian Naive Bayes" 166 | print " results for cross-validation set:" 167 | print " (results should be within ~0.01 of above results)" 168 | print metrics.classification_report(y_crossval, y_pred, 169 | target_names=['stars', 'QSOs']) 170 | 171 | #====================================================================== 172 | # Parameter optimization: 173 | # 174 | # Now take some time to experiment with the covariance type and the 175 | # number of components, to see if you can optimize the F1 score 176 | # 177 | # Note that for a large number of components, the fit can take a long 178 | # time, and will be dependent on the starting position. Use the 179 | # documentation string of GMM to determine the options for covariance. 180 | # 181 | # It may be helpful to use only a subset of the training data while 182 | # experimenting with these parameter values. This is called 183 | # "Meta-parameter optimization". It can be accomplished automatically, 184 | # but here we are doing it by hand for learning purposes. 
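# A minimal sketch of such a by-hand search (not part of the original
# solution; the parameter ranges below are arbitrary examples):
for covariance_type in ['spherical', 'diag', 'tied', 'full']:
    for n_components in [1, 2, 4, 8]:
        clf_0 = gmm.GMM(n_components, covariance_type, random_state=0)
        clf_0.fit(X_train[i0])
        clf_1 = gmm.GMM(n_components, covariance_type, random_state=0)
        clf_1.fit(X_train[i1])

        logL = np.zeros((2, Ncrossval))
        logL[0] = clf_0.score(X_crossval) + np.log(prior0)
        logL[1] = clf_1.score(X_crossval) + np.log(prior1)
        f1 = metrics.f1_score(y_crossval, np.argmax(logL, 0))
        print "%10s, n_components=%i: F1 = %.3f" % (covariance_type,
                                                    n_components, f1)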
185 | y_pred = None 186 | 187 | #{{{ compute y_pred for cross-validation data 188 | clf_0 = gmm.GMM(5, 'full', random_state=0) 189 | i0 = (y_train == 0) 190 | clf_0.fit(X_train[i0]) 191 | 192 | clf_1 = gmm.GMM(5, 'full', random_state=0) 193 | i1 = (y_train == 1) 194 | clf_1.fit(X_train[i1]) 195 | 196 | logL = np.zeros((2, Ncrossval)) 197 | logL[0] = clf_0.score(X_crossval) + np.log(prior0) 198 | logL[1] = clf_1.score(X_crossval) + np.log(prior1) 199 | 200 | y_pred = np.argmax(logL, 0) 201 | #}}} 202 | 203 | print "------------------------------------------------------------" 204 | print "GMM with tweaked parameters:" 205 | print " results for cross-validation set" 206 | print metrics.classification_report(y_crossval, y_pred, 207 | target_names=['stars', 'QSOs']) 208 | 209 | #---------------------------------------------------------------------- 210 | # Test Data 211 | # once you have maximized the cross-validation, you can apply the estimator 212 | # to your test data, and check how it compares to the predicted results 213 | # from the researcher who compiled it. 214 | 215 | X_test = np.zeros((test_data.size, 4), dtype=float) 216 | X_test[:, 0] = test_data['u-g'] 217 | X_test[:, 1] = test_data['g-r'] 218 | X_test[:, 2] = test_data['r-i'] 219 | X_test[:, 3] = test_data['i-z'] 220 | y_pred_literature = (test_data['label'] == 0).astype(int) 221 | Ntest = len(y_pred_literature) 222 | 223 | # here you should compute y_pred for the test data, using the classifiers 224 | # clf_0 and clf_1 which you already trained above. 225 | 226 | y_pred = None 227 | 228 | #{{{ compute y_pred for test data 229 | logL = np.zeros((2, Ntest)) 230 | logL[0] = clf_0.score(X_test) + np.log(prior0) 231 | logL[1] = clf_1.score(X_test) + np.log(prior1) 232 | y_pred = np.argmax(logL, 0) 233 | #}}} 234 | 235 | print "------------------------------------------------------------" 236 | print "Comparison of current results with published results" 237 | print " results for test set" 238 | print " (treating published results as the 'true' result)" 239 | print metrics.classification_report(y_pred_literature, y_pred, 240 | target_names=['stars', 'QSOs']) 241 | -------------------------------------------------------------------------------- /doc/solutions/exercise_03.py: -------------------------------------------------------------------------------- 1 | """ 2 | Astronomy Tutorial: exercise 3 3 | 4 | Dimensionality reduction of stellar spectra 5 | 6 | Usage: python exercise_03.py datadir [-m method] [-k n_neigbors] 7 | [-n norm_type] [-N n_samples] 8 | [-s] 9 | 10 | - datadir is $TUTORIAL_DIR/data/sdss_photoz 11 | This directory should contain the file sdss_photoz.npy 12 | 13 | - method is one of [pca | lle | mlle | isomap]. If not specified, 14 | PCA will be performed 15 | 16 | - n_neighbors is an integer number of neighbors to use with manifold methods 17 | 18 | - norm_type is one of [none | l1 | l2]. It specifies how the data should 19 | be normalized. 20 | 21 | - n_samples is the number of samples used for the projection. Only 1000 22 | of the 4000 samples are used by default. 23 | 24 | - specifying -s shuffles the data. This can help test for stability of 25 | the reconstruction. 26 | 27 | Description: 28 | In this tutorial, we explore manifold learning techniques to visualize 4000 29 | SDSS spectral data. This is a much more exploratory exercise than the previous 30 | two. The goal is to determine how to best visualize this high-dimensional 31 | space. 
You will implement PCA, LLE, Modified LLE, and Isomap, for various 32 | data normalizations. The goal is to find the best visualization of the 33 | data, where "best" in this case is a qualitative measure of how well the 34 | different classes of points are separated in the projected space. 35 | 36 | To make experimentation more streamlined 37 | 38 | There are several places in this file with code to be filled-in as part of 39 | the exercise. Each of these is labeled TODO below. 40 | """ 41 | 42 | import os, sys 43 | import numpy as np 44 | 45 | import pylab as pl 46 | from matplotlib import ticker 47 | 48 | from sklearn import preprocessing 49 | from sklearn.decomposition import RandomizedPCA 50 | from sklearn.manifold import LocallyLinearEmbedding, Isomap 51 | 52 | #---------------------------------------------------------------------- 53 | # set up command-line option parser 54 | from optparse import OptionParser 55 | parser = OptionParser(usage=__doc__, 56 | version="%prog 1.0") 57 | parser.add_option("-m", "--method", 58 | dest="method", 59 | default='pca', 60 | help="Specify method to use: [pca | lle | mlle | isomap]") 61 | 62 | parser.add_option("-k", "--neighbors", 63 | dest="n_neighbors", 64 | type="int", 65 | default=15, 66 | help='Specify number of neighbors for manifold learning') 67 | 68 | parser.add_option("-N", "--normalization", 69 | dest="normalization", 70 | default="none", 71 | help="Specify normalization: [none | l1 | l2]") 72 | 73 | parser.add_option("-n", "--n_samples", 74 | dest="n_samples", 75 | type="int", 76 | default=1000, 77 | help="Specify number of samples to use, up to 4000 (default 1000)") 78 | 79 | parser.add_option("-s", "--shuffle", 80 | dest="shuffle", 81 | action="store_true", 82 | default=False, 83 | help="shuffle the data") 84 | 85 | 86 | options, args = parser.parse_args() 87 | 88 | if len(args) == 0: 89 | parser.error("Must specify a data directory") 90 | elif len(args) > 1: 91 | parser.error("Must specify a single data directory") 92 | 93 | datadir = args[0] 94 | 95 | print "data directory: %s" % datadir 96 | print " method = %s" % options.method 97 | print " n_neighbors = %i" % options.n_neighbors 98 | print " normalization = %s" % options.normalization 99 | print " n_samples: %i" % options.n_samples 100 | print " shuffle: %s" % options.shuffle 101 | 102 | 103 | def three_component_plot(c1, c2, c3, color, labels): 104 | pl.figure(figsize=(8,8)) 105 | kwargs = dict(s=4, lw=0, c=color, vmin=2, vmax=6) 106 | ax1 = pl.subplot(221) 107 | pl.scatter(c1, c2, **kwargs) 108 | pl.ylabel('component 2') 109 | 110 | ax2 = pl.subplot(223, sharex=ax1) 111 | pl.scatter(c1, c3, **kwargs) 112 | pl.xlabel('component 1') 113 | pl.ylabel('component 3') 114 | 115 | ax3 = pl.subplot(224, sharey=ax2) 116 | pl.scatter(c2, c3, **kwargs) 117 | pl.xlabel('component 2') 118 | 119 | for ax in (ax1, ax2, ax3): 120 | ax.xaxis.set_major_formatter(ticker.NullFormatter()) 121 | ax.yaxis.set_major_formatter(ticker.NullFormatter()) 122 | 123 | pl.subplots_adjust(hspace=0.05, wspace=0.05) 124 | 125 | format = ticker.FuncFormatter(lambda i, *args: labels[i]) 126 | pl.colorbar(ticks = range(2, 7), format=format, 127 | cax = pl.axes((0.52, 0.51, 0.02, 0.39))) 128 | pl.clim(1.5, 6.5) 129 | 130 | 131 | #---------------------------------------------------------------------- 132 | # Load data files 133 | data = np.load(os.path.join(datadir, 'spec4000_corrected.npz')) 134 | 135 | X = data['X'] 136 | y = data['y'] 137 | labels = data['labels'] 138 | 139 | if options.shuffle: 140 | i = 
np.arange(y.shape[0], dtype=int) 141 | np.random.shuffle(i) 142 | X = X[i] 143 | y = y[i] 144 | 145 | #---------------------------------------------------------------------- 146 | # truncate the data for experimentation 147 | # 148 | # There are 4000 points, which can take a long time to run. By default, 149 | # it is truncated to 1000 samples. This can be changed using the -n 150 | # command-line argument. 151 | 152 | X = X[:options.n_samples] 153 | y = y[:options.n_samples] 154 | 155 | #---------------------------------------------------------------------- 156 | # Normalization: 157 | # 158 | # The results of the dimensionality reduction can depend heavily on the 159 | # data normalization. These can be commented or un-commented to try 160 | # l1 or l2 normalization. 161 | 162 | if options.normalization.lower() == 'none': 163 | pass 164 | elif options.normalization.lower() == 'l2': 165 | X = preprocessing.normalize(X, 'l2') 166 | elif options.normalization.lower() == 'l1': 167 | X = preprocessing.normalize(X, 'l1') 168 | else: 169 | raise ValueError("Unrecognized normalization: '%s'" % options.normalization) 170 | 171 | #====================================================================== 172 | # TODO: compute X_proj for each method. 173 | # In each of the below cases, you should compute a projection of the 174 | # data and store that projection in the matrix X_proj. 175 | # X_proj should have the same number of rows as X, and should have 176 | # at least 3 features. 177 | 178 | X_proj = None 179 | 180 | if options.method == 'pca': 181 | print "Performing PCA" 182 | #{{{ compute a RandomizedPCA projection of X with n_components >= 3 183 | rpca = RandomizedPCA(n_components=3, random_state=0) 184 | X_proj = rpca.fit_transform(X) 185 | #}}} 186 | 187 | elif options.method == 'lle': 188 | print "Performing LLE" 189 | #{{{ compute LLE on X with method='standard', and out_dim >= 3 190 | lle = LocallyLinearEmbedding(n_neighbors=options.n_neighbors, 191 | out_dim=3, method='standard') 192 | X_proj = lle.fit_transform(X) 193 | #}}} 194 | 195 | 196 | elif options.method == 'mlle': 197 | print "Performing MLLE" 198 | #{{{ compute LLE on X with method='modified' and out_dim >= 3 199 | lle = LocallyLinearEmbedding(n_neighbors=options.n_neighbors, 200 | out_dim=3, method='modified') 201 | X_proj = lle.fit_transform(X) 202 | #}}} 203 | 204 | elif options.method == 'isomap': 205 | print "Performing Isomap" 206 | #{{{ compute Isomap on X with out_dim >= 3 207 | iso = Isomap(n_neighbors=options.n_neighbors, 208 | out_dim=3) 209 | X_proj = iso.fit_transform(X) 210 | #}}} 211 | 212 | else: 213 | raise ValueError("Unrecognized method: '%s'" % options.method) 214 | 215 | three_component_plot(X_proj[:, 0], X_proj[:, 1], X_proj[:, 2], y, labels) 216 | pl.show() 217 | 218 | -------------------------------------------------------------------------------- /doc/solutions/generate_skeletons.py: -------------------------------------------------------------------------------- 1 | """Generate skeletons from the example code""" 2 | import os 3 | 4 | exercise_dir = os.path.dirname(__file__) 5 | if exercise_dir == '': 6 | exercise_dir = '.' 
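# The convention this script relies on: solution code that students must fill
# in is fenced by marker comments, for example (adapted from exercise_01.py
# above)
#
#     #{{{ compute prior0, prior1
#     prior0 = num0 / float(Ntrain)
#     prior1 = num1 / float(Ntrain)
#     #}}}
#
# and the loop below replaces each fenced region in the generated skeleton
# with a single placeholder line, e.g. '# TODO: compute prior0, prior1'.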
7 | 8 | skeleton_dir = os.path.abspath(os.path.join(exercise_dir,'../skeletons/')) 9 | if not os.path.exists(skeleton_dir): 10 | os.makedirs(skeleton_dir) 11 | 12 | L = os.listdir(exercise_dir) 13 | 14 | for f in L: 15 | if not f.endswith('.py'): 16 | continue 17 | 18 | if f == os.path.basename(__file__): 19 | continue 20 | 21 | print "parsing %s" % f 22 | 23 | input_file = open(os.path.join(exercise_dir, f)) 24 | output_file = open(os.path.join(skeleton_dir, f), 'w') 25 | 26 | in_exercise_region = False 27 | 28 | for line in input_file: 29 | linestrip = line.strip() 30 | if linestrip.startswith('#{{{'): 31 | in_exercise_region = True 32 | message = linestrip.lstrip('#{{{') 33 | output_file.write(line.split('#')[0] + '# TODO: %s\n' % message) 34 | elif in_exercise_region: 35 | if '#}}}' in line: 36 | in_exercise_region = False 37 | else: 38 | output_file.write(line) 39 | 40 | output_file.close() 41 | -------------------------------------------------------------------------------- /doc/sphinxext/LICENSE.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------- 2 | The files 3 | - numpydoc.py 4 | - autosummary.py 5 | - autosummary_generate.py 6 | - docscrape.py 7 | - docscrape_sphinx.py 8 | - phantom_import.py 9 | have the following license: 10 | 11 | Copyright (C) 2008 Stefan van der Walt , Pauli Virtanen 12 | 13 | Redistribution and use in source and binary forms, with or without 14 | modification, are permitted provided that the following conditions are 15 | met: 16 | 17 | 1. Redistributions of source code must retain the above copyright 18 | notice, this list of conditions and the following disclaimer. 19 | 2. Redistributions in binary form must reproduce the above copyright 20 | notice, this list of conditions and the following disclaimer in 21 | the documentation and/or other materials provided with the 22 | distribution. 23 | 24 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 25 | IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 26 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 27 | DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, 28 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 29 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 30 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 32 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 33 | IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 34 | POSSIBILITY OF SUCH DAMAGE. 35 | 36 | ------------------------------------------------------------------------------- 37 | The files 38 | - compiler_unparse.py 39 | - comment_eater.py 40 | - traitsdoc.py 41 | have the following license: 42 | 43 | This software is OSI Certified Open Source Software. 44 | OSI Certified is a certification mark of the Open Source Initiative. 45 | 46 | Copyright (c) 2006, Enthought, Inc. 47 | All rights reserved. 48 | 49 | Redistribution and use in source and binary forms, with or without 50 | modification, are permitted provided that the following conditions are met: 51 | 52 | * Redistributions of source code must retain the above copyright notice, this 53 | list of conditions and the following disclaimer. 
54 | * Redistributions in binary form must reproduce the above copyright notice, 55 | this list of conditions and the following disclaimer in the documentation 56 | and/or other materials provided with the distribution. 57 | * Neither the name of Enthought, Inc. nor the names of its contributors may 58 | be used to endorse or promote products derived from this software without 59 | specific prior written permission. 60 | 61 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 62 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 63 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 64 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 65 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 66 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 67 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 68 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 69 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 70 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 71 | 72 | 73 | ------------------------------------------------------------------------------- 74 | The files 75 | - only_directives.py 76 | - plot_directive.py 77 | originate from Matplotlib (http://matplotlib.sf.net/) which has 78 | the following license: 79 | 80 | Copyright (c) 2002-2008 John D. Hunter; All Rights Reserved. 81 | 82 | 1. This LICENSE AGREEMENT is between John D. Hunter (“JDH”), and the Individual or Organization (“Licensee”) accessing and otherwise using matplotlib software in source or binary form and its associated documentation. 83 | 84 | 2. Subject to the terms and conditions of this License Agreement, JDH hereby grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, analyze, test, perform and/or display publicly, prepare derivative works, distribute, and otherwise use matplotlib 0.98.3 alone or in any derivative version, provided, however, that JDH’s License Agreement and JDH’s notice of copyright, i.e., “Copyright (c) 2002-2008 John D. Hunter; All Rights Reserved” are retained in matplotlib 0.98.3 alone or in any derivative version prepared by Licensee. 85 | 86 | 3. In the event Licensee prepares a derivative work that is based on or incorporates matplotlib 0.98.3 or any part thereof, and wants to make the derivative work available to others as provided herein, then Licensee hereby agrees to include in any such work a brief summary of the changes made to matplotlib 0.98.3. 87 | 88 | 4. JDH is making matplotlib 0.98.3 available to Licensee on an “AS IS” basis. JDH MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, JDH MAKES NO AND DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF MATPLOTLIB 0.98.3 WILL NOT INFRINGE ANY THIRD PARTY RIGHTS. 89 | 90 | 5. JDH SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF MATPLOTLIB 0.98.3 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING MATPLOTLIB 0.98.3, OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. 91 | 92 | 6. This License Agreement will automatically terminate upon a material breach of its terms and conditions. 93 | 94 | 7. 
Nothing in this License Agreement shall be deemed to create any relationship of agency, partnership, or joint venture between JDH and Licensee. This License Agreement does not grant permission to use JDH trademarks or trade name in a trademark sense to endorse or promote products or services of Licensee, or any third party. 95 | 96 | 8. By copying, installing or otherwise using matplotlib 0.98.3, Licensee agrees to be bound by the terms and conditions of this License Agreement. 97 | 98 | -------------------------------------------------------------------------------- /doc/sphinxext/MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include tests *.py 2 | include *.txt 3 | -------------------------------------------------------------------------------- /doc/sphinxext/README.txt: -------------------------------------------------------------------------------- 1 | ===================================== 2 | numpydoc -- Numpy's Sphinx extensions 3 | ===================================== 4 | 5 | Numpy's documentation uses several custom extensions to Sphinx. These 6 | are shipped in this ``numpydoc`` package, in case you want to make use 7 | of them in third-party projects. 8 | 9 | The following extensions are available: 10 | 11 | - ``numpydoc``: support for the Numpy docstring format in Sphinx, and add 12 | the code description directives ``np-function``, ``np-cfunction``, etc. 13 | that support the Numpy docstring syntax. 14 | 15 | - ``numpydoc.traitsdoc``: For gathering documentation about Traits attributes. 16 | 17 | - ``numpydoc.plot_directives``: Adaptation of Matplotlib's ``plot::`` 18 | directive. Note that this implementation may still undergo severe 19 | changes or eventually be deprecated. 20 | 21 | - ``numpydoc.only_directives``: (DEPRECATED) 22 | 23 | - ``numpydoc.autosummary``: (DEPRECATED) An ``autosummary::`` directive. 24 | Available in Sphinx 0.6.2 and (to-be) 1.0 as ``sphinx.ext.autosummary``, 25 | and it the Sphinx 1.0 version is recommended over that included in 26 | Numpydoc. 27 | 28 | 29 | numpydoc 30 | ======== 31 | 32 | Numpydoc inserts a hook into Sphinx's autodoc that converts docstrings 33 | following the Numpy/Scipy format to a form palatable to Sphinx. 34 | 35 | Options 36 | ------- 37 | 38 | The following options can be set in conf.py: 39 | 40 | - numpydoc_use_plots: bool 41 | 42 | Whether to produce ``plot::`` directives for Examples sections that 43 | contain ``import matplotlib``. 44 | 45 | - numpydoc_show_class_members: bool 46 | 47 | Whether to show all members of a class in the Methods and Attributes 48 | sections automatically. 49 | 50 | - numpydoc_edit_link: bool (DEPRECATED -- edit your HTML template instead) 51 | 52 | Whether to insert an edit link after docstrings. 
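For example, a project's ``conf.py`` might set these options as follows
(an illustrative sketch only -- the extension name and path depend on how
numpydoc is installed or vendored in the project)::

    import os, sys
    sys.path.insert(0, os.path.abspath('sphinxext'))
    extensions = ['numpy_ext.numpydoc']

    numpydoc_use_plots = False          # no automatic plot:: directives
    numpydoc_show_class_members = True  # auto-list Methods and Attributes
    numpydoc_edit_link = None           # deprecated -- edit the HTML template instead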
53 | -------------------------------------------------------------------------------- /doc/sphinxext/numpy_ext/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astroML/sklearn_tutorial/0909361122d5b96379007516b55c6248afa86cfc/doc/sphinxext/numpy_ext/__init__.py -------------------------------------------------------------------------------- /doc/sphinxext/numpy_ext/docscrape_sphinx.py: -------------------------------------------------------------------------------- 1 | import re 2 | import inspect 3 | import textwrap 4 | import pydoc 5 | import sphinx 6 | from docscrape import NumpyDocString 7 | from docscrape import FunctionDoc 8 | from docscrape import ClassDoc 9 | 10 | 11 | class SphinxDocString(NumpyDocString): 12 | def __init__(self, docstring, config=None): 13 | config = {} if config is None else config 14 | self.use_plots = config.get('use_plots', False) 15 | NumpyDocString.__init__(self, docstring, config=config) 16 | 17 | # string conversion routines 18 | def _str_header(self, name, symbol='`'): 19 | return ['.. rubric:: ' + name, ''] 20 | 21 | def _str_field_list(self, name): 22 | return [':' + name + ':'] 23 | 24 | def _str_indent(self, doc, indent=4): 25 | out = [] 26 | for line in doc: 27 | out += [' ' * indent + line] 28 | return out 29 | 30 | def _str_signature(self): 31 | return [''] 32 | if self['Signature']: 33 | return ['``%s``' % self['Signature']] + [''] 34 | else: 35 | return [''] 36 | 37 | def _str_summary(self): 38 | return self['Summary'] + [''] 39 | 40 | def _str_extended_summary(self): 41 | return self['Extended Summary'] + [''] 42 | 43 | def _str_param_list(self, name): 44 | out = [] 45 | if self[name]: 46 | out += self._str_field_list(name) 47 | out += [''] 48 | for param, param_type, desc in self[name]: 49 | out += self._str_indent(['**%s** : %s' % (param.strip(), 50 | param_type)]) 51 | out += [''] 52 | out += self._str_indent(desc, 8) 53 | out += [''] 54 | return out 55 | 56 | @property 57 | def _obj(self): 58 | if hasattr(self, '_cls'): 59 | return self._cls 60 | elif hasattr(self, '_f'): 61 | return self._f 62 | return None 63 | 64 | def _str_member_list(self, name): 65 | """ 66 | Generate a member listing, autosummary:: table where possible, 67 | and a table where not. 68 | 69 | """ 70 | out = [] 71 | if self[name]: 72 | out += ['.. rubric:: %s' % name, ''] 73 | prefix = getattr(self, '_name', '') 74 | 75 | if prefix: 76 | prefix = '~%s.' % prefix 77 | 78 | autosum = [] 79 | others = [] 80 | for param, param_type, desc in self[name]: 81 | param = param.strip() 82 | if not self._obj or hasattr(self._obj, param): 83 | autosum += [" %s%s" % (prefix, param)] 84 | else: 85 | others.append((param, param_type, desc)) 86 | 87 | if autosum: 88 | # GAEL: Toctree commented out below because it creates 89 | # hundreds of sphinx warnings 90 | # out += ['.. autosummary::', ' :toctree:', ''] 91 | out += ['.. 
autosummary::', ''] 92 | out += autosum 93 | 94 | if others: 95 | maxlen_0 = max([len(x[0]) for x in others]) 96 | maxlen_1 = max([len(x[1]) for x in others]) 97 | hdr = "=" * maxlen_0 + " " + "=" * maxlen_1 + " " + "=" * 10 98 | fmt = '%%%ds %%%ds ' % (maxlen_0, maxlen_1) 99 | n_indent = maxlen_0 + maxlen_1 + 4 100 | out += [hdr] 101 | for param, param_type, desc in others: 102 | out += [fmt % (param.strip(), param_type)] 103 | out += self._str_indent(desc, n_indent) 104 | out += [hdr] 105 | out += [''] 106 | return out 107 | 108 | def _str_section(self, name): 109 | out = [] 110 | if self[name]: 111 | out += self._str_header(name) 112 | out += [''] 113 | content = textwrap.dedent("\n".join(self[name])).split("\n") 114 | out += content 115 | out += [''] 116 | return out 117 | 118 | def _str_see_also(self, func_role): 119 | out = [] 120 | if self['See Also']: 121 | see_also = super(SphinxDocString, self)._str_see_also(func_role) 122 | out = ['.. seealso::', ''] 123 | out += self._str_indent(see_also[2:]) 124 | return out 125 | 126 | def _str_warnings(self): 127 | out = [] 128 | if self['Warnings']: 129 | out = ['.. warning::', ''] 130 | out += self._str_indent(self['Warnings']) 131 | return out 132 | 133 | def _str_index(self): 134 | idx = self['index'] 135 | out = [] 136 | if len(idx) == 0: 137 | return out 138 | 139 | out += ['.. index:: %s' % idx.get('default', '')] 140 | for section, references in idx.iteritems(): 141 | if section == 'default': 142 | continue 143 | elif section == 'refguide': 144 | out += [' single: %s' % (', '.join(references))] 145 | else: 146 | out += [' %s: %s' % (section, ','.join(references))] 147 | return out 148 | 149 | def _str_references(self): 150 | out = [] 151 | if self['References']: 152 | out += self._str_header('References') 153 | if isinstance(self['References'], str): 154 | self['References'] = [self['References']] 155 | out.extend(self['References']) 156 | out += [''] 157 | # Latex collects all references to a separate bibliography, 158 | # so we need to insert links to it 159 | if sphinx.__version__ >= "0.6": 160 | out += ['.. only:: latex', ''] 161 | else: 162 | out += ['.. latexonly::', ''] 163 | items = [] 164 | for line in self['References']: 165 | m = re.match(r'.. \[([a-z0-9._-]+)\]', line, re.I) 166 | if m: 167 | items.append(m.group(1)) 168 | out += [' ' + ", ".join(["[%s]_" % item for item in items]), ''] 169 | return out 170 | 171 | def _str_examples(self): 172 | examples_str = "\n".join(self['Examples']) 173 | 174 | if (self.use_plots and 'import matplotlib' in examples_str 175 | and 'plot::' not in examples_str): 176 | out = [] 177 | out += self._str_header('Examples') 178 | out += ['.. 
plot::', ''] 179 | out += self._str_indent(self['Examples']) 180 | out += [''] 181 | return out 182 | else: 183 | return self._str_section('Examples') 184 | 185 | def __str__(self, indent=0, func_role="obj"): 186 | out = [] 187 | out += self._str_signature() 188 | out += self._str_index() + [''] 189 | out += self._str_summary() 190 | out += self._str_extended_summary() 191 | for param_list in ('Parameters', 'Returns', 'Raises'): 192 | out += self._str_param_list(param_list) 193 | out += self._str_warnings() 194 | out += self._str_see_also(func_role) 195 | out += self._str_section('Notes') 196 | out += self._str_references() 197 | out += self._str_examples() 198 | for param_list in ('Attributes', 'Methods'): 199 | out += self._str_member_list(param_list) 200 | out = self._str_indent(out, indent) 201 | return '\n'.join(out) 202 | 203 | 204 | class SphinxFunctionDoc(SphinxDocString, FunctionDoc): 205 | def __init__(self, obj, doc=None, config={}): 206 | self.use_plots = config.get('use_plots', False) 207 | FunctionDoc.__init__(self, obj, doc=doc, config=config) 208 | 209 | 210 | class SphinxClassDoc(SphinxDocString, ClassDoc): 211 | def __init__(self, obj, doc=None, func_doc=None, config={}): 212 | self.use_plots = config.get('use_plots', False) 213 | ClassDoc.__init__(self, obj, doc=doc, func_doc=None, config=config) 214 | 215 | 216 | class SphinxObjDoc(SphinxDocString): 217 | def __init__(self, obj, doc=None, config=None): 218 | self._f = obj 219 | SphinxDocString.__init__(self, doc, config=config) 220 | 221 | 222 | def get_doc_object(obj, what=None, doc=None, config={}): 223 | if what is None: 224 | if inspect.isclass(obj): 225 | what = 'class' 226 | elif inspect.ismodule(obj): 227 | what = 'module' 228 | elif callable(obj): 229 | what = 'function' 230 | else: 231 | what = 'object' 232 | if what == 'class': 233 | return SphinxClassDoc(obj, func_doc=SphinxFunctionDoc, doc=doc, 234 | config=config) 235 | elif what in ('function', 'method'): 236 | return SphinxFunctionDoc(obj, doc=doc, config=config) 237 | else: 238 | if doc is None: 239 | doc = pydoc.getdoc(obj) 240 | return SphinxObjDoc(obj, doc, config=config) 241 | -------------------------------------------------------------------------------- /doc/sphinxext/numpy_ext/numpydoc.py: -------------------------------------------------------------------------------- 1 | """ 2 | ======== 3 | numpydoc 4 | ======== 5 | 6 | Sphinx extension that handles docstrings in the Numpy standard format. [1] 7 | 8 | It will: 9 | 10 | - Convert Parameters etc. sections to field lists. 11 | - Convert See Also section to a See also entry. 12 | - Renumber references. 13 | - Extract the signature from the docstring, if it can't be determined 14 | otherwise. 15 | 16 | .. 
[1] http://projects.scipy.org/numpy/wiki/CodingStyleGuidelines#docstring-standard 17 | 18 | """ 19 | 20 | import os 21 | import re 22 | import pydoc 23 | from docscrape_sphinx import get_doc_object 24 | from docscrape_sphinx import SphinxDocString 25 | from sphinx.util.compat import Directive 26 | import inspect 27 | 28 | 29 | def mangle_docstrings(app, what, name, obj, options, lines, 30 | reference_offset=[0]): 31 | 32 | cfg = dict(use_plots=app.config.numpydoc_use_plots, 33 | show_class_members=app.config.numpydoc_show_class_members) 34 | 35 | if what == 'module': 36 | # Strip top title 37 | title_re = re.compile(ur'^\s*[#*=]{4,}\n[a-z0-9 -]+\n[#*=]{4,}\s*', 38 | re.I | re.S) 39 | lines[:] = title_re.sub(u'', u"\n".join(lines)).split(u"\n") 40 | else: 41 | doc = get_doc_object(obj, what, u"\n".join(lines), config=cfg) 42 | lines[:] = unicode(doc).split(u"\n") 43 | 44 | if app.config.numpydoc_edit_link and hasattr(obj, '__name__') and \ 45 | obj.__name__: 46 | if hasattr(obj, '__module__'): 47 | v = dict(full_name=u"%s.%s" % (obj.__module__, obj.__name__)) 48 | else: 49 | v = dict(full_name=obj.__name__) 50 | lines += [u'', u'.. htmlonly::', ''] 51 | lines += [u' %s' % x for x in 52 | (app.config.numpydoc_edit_link % v).split("\n")] 53 | 54 | # replace reference numbers so that there are no duplicates 55 | references = [] 56 | for line in lines: 57 | line = line.strip() 58 | m = re.match(ur'^.. \[([a-z0-9_.-])\]', line, re.I) 59 | if m: 60 | references.append(m.group(1)) 61 | 62 | # start renaming from the longest string, to avoid overwriting parts 63 | references.sort(key=lambda x: -len(x)) 64 | if references: 65 | for i, line in enumerate(lines): 66 | for r in references: 67 | if re.match(ur'^\d+$', r): 68 | new_r = u"R%d" % (reference_offset[0] + int(r)) 69 | else: 70 | new_r = u"%s%d" % (r, reference_offset[0]) 71 | lines[i] = lines[i].replace(u'[%s]_' % r, 72 | u'[%s]_' % new_r) 73 | lines[i] = lines[i].replace(u'.. [%s]' % r, 74 | u'.. 
[%s]' % new_r) 75 | 76 | reference_offset[0] += len(references) 77 | 78 | 79 | def mangle_signature(app, what, name, obj, 80 | options, sig, retann): 81 | # Do not try to inspect classes that don't define `__init__` 82 | if (inspect.isclass(obj) and 83 | (not hasattr(obj, '__init__') or 84 | 'initializes x; see ' in pydoc.getdoc(obj.__init__))): 85 | return '', '' 86 | 87 | if not (callable(obj) or hasattr(obj, '__argspec_is_invalid_')): 88 | return 89 | if not hasattr(obj, '__doc__'): 90 | return 91 | 92 | doc = SphinxDocString(pydoc.getdoc(obj)) 93 | if doc['Signature']: 94 | sig = re.sub(u"^[^(]*", u"", doc['Signature']) 95 | return sig, u'' 96 | 97 | 98 | def setup(app, get_doc_object_=get_doc_object): 99 | global get_doc_object 100 | get_doc_object = get_doc_object_ 101 | 102 | app.connect('autodoc-process-docstring', mangle_docstrings) 103 | app.connect('autodoc-process-signature', mangle_signature) 104 | app.add_config_value('numpydoc_edit_link', None, False) 105 | app.add_config_value('numpydoc_use_plots', None, False) 106 | app.add_config_value('numpydoc_show_class_members', True, True) 107 | 108 | # Extra mangling domains 109 | app.add_domain(NumpyPythonDomain) 110 | app.add_domain(NumpyCDomain) 111 | 112 | #----------------------------------------------------------------------------- 113 | # Docstring-mangling domains 114 | #----------------------------------------------------------------------------- 115 | 116 | from docutils.statemachine import ViewList 117 | from sphinx.domains.c import CDomain 118 | from sphinx.domains.python import PythonDomain 119 | 120 | 121 | class ManglingDomainBase(object): 122 | directive_mangling_map = {} 123 | 124 | def __init__(self, *a, **kw): 125 | super(ManglingDomainBase, self).__init__(*a, **kw) 126 | self.wrap_mangling_directives() 127 | 128 | def wrap_mangling_directives(self): 129 | for name, objtype in self.directive_mangling_map.items(): 130 | self.directives[name] = wrap_mangling_directive( 131 | self.directives[name], objtype) 132 | 133 | 134 | class NumpyPythonDomain(ManglingDomainBase, PythonDomain): 135 | name = 'np' 136 | directive_mangling_map = { 137 | 'function': 'function', 138 | 'class': 'class', 139 | 'exception': 'class', 140 | 'method': 'function', 141 | 'classmethod': 'function', 142 | 'staticmethod': 'function', 143 | 'attribute': 'attribute', 144 | } 145 | 146 | 147 | class NumpyCDomain(ManglingDomainBase, CDomain): 148 | name = 'np-c' 149 | directive_mangling_map = { 150 | 'function': 'function', 151 | 'member': 'attribute', 152 | 'macro': 'function', 153 | 'type': 'class', 154 | 'var': 'object', 155 | } 156 | 157 | 158 | def wrap_mangling_directive(base_directive, objtype): 159 | class directive(base_directive): 160 | def run(self): 161 | env = self.state.document.settings.env 162 | 163 | name = None 164 | if self.arguments: 165 | m = re.match(r'^(.*\s+)?(.*?)(\(.*)?', self.arguments[0]) 166 | name = m.group(2).strip() 167 | 168 | if not name: 169 | name = self.arguments[0] 170 | 171 | lines = list(self.content) 172 | mangle_docstrings(env.app, objtype, name, None, None, lines) 173 | self.content = ViewList(lines, self.content.parent) 174 | 175 | return base_directive.run(self) 176 | 177 | return directive 178 | -------------------------------------------------------------------------------- /doc/sphinxext/numpy_ext_old/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/astroML/sklearn_tutorial/0909361122d5b96379007516b55c6248afa86cfc/doc/sphinxext/numpy_ext_old/__init__.py -------------------------------------------------------------------------------- /doc/sphinxext/numpy_ext_old/docscrape_sphinx.py: -------------------------------------------------------------------------------- 1 | import re 2 | import inspect 3 | import textwrap 4 | import pydoc 5 | 6 | from docscrape import NumpyDocString 7 | from docscrape FunctionDoc 8 | from docscrape ClassDoc 9 | 10 | 11 | class SphinxDocString(NumpyDocString): 12 | # string conversion routines 13 | def _str_header(self, name, symbol='`'): 14 | return ['.. rubric:: ' + name, ''] 15 | 16 | def _str_field_list(self, name): 17 | return [':' + name + ':'] 18 | 19 | def _str_indent(self, doc, indent=4): 20 | out = [] 21 | for line in doc: 22 | out += [' ' * indent + line] 23 | return out 24 | 25 | def _str_signature(self): 26 | return [''] 27 | if self['Signature']: 28 | return ['``%s``' % self['Signature']] + [''] 29 | else: 30 | return [''] 31 | 32 | def _str_summary(self): 33 | return self['Summary'] + [''] 34 | 35 | def _str_extended_summary(self): 36 | return self['Extended Summary'] + [''] 37 | 38 | def _str_param_list(self, name): 39 | out = [] 40 | if self[name]: 41 | out += self._str_field_list(name) 42 | out += [''] 43 | for param, param_type, desc in self[name]: 44 | out += self._str_indent(['**%s** : %s' % (param.strip(), 45 | param_type)]) 46 | out += [''] 47 | out += self._str_indent(desc, 8) 48 | out += [''] 49 | return out 50 | 51 | def _str_section(self, name): 52 | out = [] 53 | if self[name]: 54 | out += self._str_header(name) 55 | out += [''] 56 | content = textwrap.dedent("\n".join(self[name])).split("\n") 57 | out += content 58 | out += [''] 59 | return out 60 | 61 | def _str_see_also(self, func_role): 62 | out = [] 63 | if self['See Also']: 64 | see_also = super(SphinxDocString, self)._str_see_also(func_role) 65 | out = ['.. seealso::', ''] 66 | out += self._str_indent(see_also[2:]) 67 | return out 68 | 69 | def _str_warnings(self): 70 | out = [] 71 | if self['Warnings']: 72 | out = ['.. warning::', ''] 73 | out += self._str_indent(self['Warnings']) 74 | return out 75 | 76 | def _str_index(self): 77 | idx = self['index'] 78 | out = [] 79 | if len(idx) == 0: 80 | return out 81 | 82 | out += ['.. 
index:: %s' % idx.get('default', '')] 83 | for section, references in idx.iteritems(): 84 | if section == 'default': 85 | continue 86 | elif section == 'refguide': 87 | out += [' single: %s' % (', '.join(references))] 88 | else: 89 | out += [' %s: %s' % (section, ','.join(references))] 90 | return out 91 | 92 | def _str_references(self): 93 | out = [] 94 | if self['References']: 95 | out += self._str_header('References') 96 | if isinstance(self['References'], str): 97 | self['References'] = [self['References']] 98 | out.extend(self['References']) 99 | out += [''] 100 | return out 101 | 102 | def __str__(self, indent=0, func_role="obj"): 103 | out = [] 104 | out += self._str_signature() 105 | out += self._str_index() + [''] 106 | out += self._str_summary() 107 | out += self._str_extended_summary() 108 | for param_list in ('Parameters', 'Attributes', 'Methods', 109 | 'Returns', 'Raises'): 110 | out += self._str_param_list(param_list) 111 | out += self._str_warnings() 112 | out += self._str_see_also(func_role) 113 | out += self._str_section('Notes') 114 | out += self._str_references() 115 | out += self._str_section('Examples') 116 | out = self._str_indent(out, indent) 117 | return '\n'.join(out) 118 | 119 | 120 | class SphinxFunctionDoc(SphinxDocString, FunctionDoc): 121 | pass 122 | 123 | 124 | class SphinxClassDoc(SphinxDocString, ClassDoc): 125 | pass 126 | 127 | 128 | def get_doc_object(obj, what=None): 129 | if what is None: 130 | if inspect.isclass(obj): 131 | what = 'class' 132 | elif inspect.ismodule(obj): 133 | what = 'module' 134 | elif callable(obj): 135 | what = 'function' 136 | else: 137 | what = 'object' 138 | if what == 'class': 139 | return SphinxClassDoc(obj, '', func_doc=SphinxFunctionDoc) 140 | elif what in ('function', 'method'): 141 | return SphinxFunctionDoc(obj, '') 142 | else: 143 | return SphinxDocString(pydoc.getdoc(obj)) 144 | -------------------------------------------------------------------------------- /doc/sphinxext/numpy_ext_old/numpydoc.py: -------------------------------------------------------------------------------- 1 | """ 2 | ======== 3 | numpydoc 4 | ======== 5 | 6 | Sphinx extension that handles docstrings in the Numpy standard format. [1] 7 | 8 | It will: 9 | 10 | - Convert Parameters etc. sections to field lists. 11 | - Convert See Also section to a See also entry. 12 | - Renumber references. 13 | - Extract the signature from the docstring, if it can't be determined otherwise. 14 | 15 | .. [1] http://projects.scipy.org/scipy/numpy/wiki/CodingStyleGuidelines#docstring-standard 16 | 17 | """ 18 | 19 | import os 20 | import re 21 | import pydoc 22 | import inspect 23 | 24 | from docscrape_sphinx import get_doc_object 25 | from docscrape_sphinx import SphinxDocString 26 | 27 | 28 | def mangle_docstrings(app, what, name, obj, options, lines, 29 | reference_offset=[0]): 30 | if what == 'module': 31 | # Strip top title 32 | title_re = re.compile(r'^\s*[#*=]{4,}\n[a-z0-9 -]+\n[#*=]{4,}\s*', 33 | re.I | re.S) 34 | lines[:] = title_re.sub('', "\n".join(lines)).split("\n") 35 | else: 36 | doc = get_doc_object(obj, what) 37 | lines[:] = str(doc).split("\n") 38 | 39 | if app.config.numpydoc_edit_link and hasattr(obj, '__name__') and \ 40 | obj.__name__: 41 | v = dict(full_name=obj.__name__) 42 | lines += [''] + (app.config.numpydoc_edit_link % v).split("\n") 43 | 44 | # replace reference numbers so that there are no duplicates 45 | references = [] 46 | for l in lines: 47 | l = l.strip() 48 | if l.startswith('.. ['): 49 | try: 50 | references.append(int(l[len('.. 
['):l.index(']')])) 51 | except ValueError: 52 | print "WARNING: invalid reference in %s docstring" % name 53 | 54 | # Start renaming from the biggest number, otherwise we may 55 | # overwrite references. 56 | references.sort() 57 | if references: 58 | for i, line in enumerate(lines): 59 | for r in references: 60 | new_r = reference_offset[0] + r 61 | lines[i] = lines[i].replace('[%d]_' % r, 62 | '[%d]_' % new_r) 63 | lines[i] = lines[i].replace('.. [%d]' % r, 64 | '.. [%d]' % new_r) 65 | 66 | reference_offset[0] += len(references) 67 | 68 | 69 | def mangle_signature(app, what, name, obj, options, sig, retann): 70 | # Do not try to inspect classes that don't define `__init__` 71 | if (inspect.isclass(obj) and 72 | 'initializes x; see ' in pydoc.getdoc(obj.__init__)): 73 | return '', '' 74 | 75 | if not (callable(obj) or hasattr(obj, '__argspec_is_invalid_')): 76 | return 77 | if not hasattr(obj, '__doc__'): 78 | return 79 | 80 | doc = SphinxDocString(pydoc.getdoc(obj)) 81 | if doc['Signature']: 82 | sig = re.sub("^[^(]*", "", doc['Signature']) 83 | return sig, '' 84 | 85 | 86 | def initialize(app): 87 | try: 88 | app.connect('autodoc-process-signature', mangle_signature) 89 | except: 90 | monkeypatch_sphinx_ext_autodoc() 91 | 92 | 93 | def setup(app, get_doc_object_=get_doc_object): 94 | global get_doc_object 95 | get_doc_object = get_doc_object_ 96 | 97 | app.connect('autodoc-process-docstring', mangle_docstrings) 98 | app.connect('builder-inited', initialize) 99 | app.add_config_value('numpydoc_edit_link', None, True) 100 | 101 | #------------------------------------------------------------------------------ 102 | # Monkeypatch sphinx.ext.autodoc to accept argspecless autodocs (Sphinx < 0.5) 103 | #------------------------------------------------------------------------------ 104 | 105 | 106 | def monkeypatch_sphinx_ext_autodoc(): 107 | global _original_format_signature 108 | import sphinx.ext.autodoc 109 | 110 | if sphinx.ext.autodoc.format_signature is our_format_signature: 111 | return 112 | 113 | print "[numpydoc] Monkeypatching sphinx.ext.autodoc ..." 114 | _original_format_signature = sphinx.ext.autodoc.format_signature 115 | sphinx.ext.autodoc.format_signature = our_format_signature 116 | 117 | 118 | def our_format_signature(what, obj): 119 | r = mangle_signature(None, what, None, obj, None, None, None) 120 | if r is not None: 121 | return r[0] 122 | else: 123 | return _original_format_signature(what, obj) 124 | -------------------------------------------------------------------------------- /doc/templates/class.rst: -------------------------------------------------------------------------------- 1 | {{ fullname }} 2 | {{ underline }} 3 | 4 | .. currentmodule:: {{ module }} 5 | 6 | .. autoclass:: {{ objname }} 7 | 8 | {% block methods %} 9 | .. automethod:: __init__ 10 | {% endblock %} 11 | 12 | 13 | -------------------------------------------------------------------------------- /doc/templates/function.rst: -------------------------------------------------------------------------------- 1 | {{ fullname }} 2 | {{ underline }} 3 | 4 | .. currentmodule:: {{ module }} 5 | 6 | .. 
autofunction:: {{ objname }} 7 | 8 | 9 | -------------------------------------------------------------------------------- /doc/themes/scikit-learn/layout.html: -------------------------------------------------------------------------------- 1 | {# 2 | scikit-learn/layout.html 3 | ~~~~~~~~~~~~~~~~~ 4 | 5 | Layout for scikit-learn, after a design made by Angel Soler 6 | (http://webylimonada.org) 7 | 8 | Update: Collapsable sidebar added - 13/03/2012 - Jaques Grobler 9 | Update: Next-page button added - 16/03/2012 - Jaques Grobler 10 | 11 | 12 | :copyright: Fabian Pedregosa 13 | :license: BSD 14 | #} 15 | {% extends "basic/layout.html" %} 16 | 17 | {% if theme_collapsiblesidebar|tobool %} 18 | {% set script_files = script_files + ['_static/sidebar.js'] %} 19 | {% endif %} 20 | 21 | {% block extrahead %} 22 | 23 | 24 | 37 | {% endblock %} 38 | 39 | {%- if pagename == 'index' %} 40 | {% set title = 'Machine Learning for Astronomy with Scikit-learn' %} 41 | {%- endif %} 42 | 43 | {% block header %} 44 | {%- if theme_oldversion == true %} 45 |
 [layout.html lines 46-247: the HTML markup was lost in extraction; the surviving template text and Jinja directives are collected below]
 - version banner (inside the {%- if theme_oldversion == true %} branch): "Warning: This documentation is for {{project}} version {{ release|e }}. — Latest stable version"
 - {% block header %}: the logo ({%- if logo %} ... {%- endif %}) and {%- block navbar -%} ... {%- endblock -%}
 - {% block content %}: on inner pages ({%- if pagename != 'index' %}) a "This page" sidebar box containing {{ toc }}; on the index page the sidebar boxes are:
     News: "scikit-learn 0.12 was released September 2012. Find out more at http://scikit-learn.org." / "astroML 0.1 was released October 2012. Find out more at http://astroML.github.com."
     Video Links: "PyData 2012: 75-minute version of this tutorial" / "Scipy 2012: a 3-hour version of this tutorial" / "PyData NYC 2012: 45-minute version of this tutorial"
     Licensing: "All material Open source: BSD license (3 clause)."
     About: "Authors"
   and, on every page, Giving credit: "Please consider citing the scikit-learn if you use it."
   The page body itself is rendered by {%- block document %} {{ super() }} {%- endblock %}.
 - {% block relbar1 %}{% endblock %} and {% block relbar2 %}{% endblock %} are overridden to be empty.
 - {%- block footer %}: footer text plus previous/next page buttons built from {%- for rellink in rellinks|reverse %} ... {%- endfor %}
248 | 258 | {%- endblock %} 259 | 260 | 261 | -------------------------------------------------------------------------------- /doc/themes/scikit-learn/static/sidebar.js: -------------------------------------------------------------------------------- 1 | /* 2 | * sidebar.js 3 | * ~~~~~~~~~~ 4 | * 5 | * This script makes the Sphinx sidebar collapsible. 6 | * 7 | * .sphinxsidebar contains .sphinxsidebarwrapper. This script adds 8 | * in .sphixsidebar, after .sphinxsidebarwrapper, the #sidebarbutton 9 | * used to collapse and expand the sidebar. 10 | * 11 | * When the sidebar is collapsed the .sphinxsidebarwrapper is hidden 12 | * and the width of the sidebar and the margin-left of the document 13 | * are decreased. When the sidebar is expanded the opposite happens. 14 | * This script saves a per-browser/per-session cookie used to 15 | * remember the position of the sidebar among the pages. 16 | * Once the browser is closed the cookie is deleted and the position 17 | * reset to the default (expanded). 18 | * 19 | * :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS. 20 | * :license: BSD, see LICENSE for details. 21 | * 22 | */ 23 | 24 | $(function() { 25 | // global elements used by the functions. 26 | // the 'sidebarbutton' element is defined as global after its 27 | // creation, in the add_sidebar_button function 28 | var bodywrapper = $('.bodywrapper'); 29 | var sidebar = $('.sphinxsidebar'); 30 | var sidebarwrapper = $('.sphinxsidebarwrapper'); 31 | 32 | // for some reason, the document has no sidebar; do not run into errors 33 | if (!sidebar.length) return; 34 | 35 | // original margin-left of the bodywrapper and width of the sidebar 36 | // with the sidebar expanded 37 | var bw_margin_expanded = bodywrapper.css('margin-left'); 38 | var ssb_width_expanded = sidebar.width(); 39 | 40 | // margin-left of the bodywrapper and width of the sidebar 41 | // with the sidebar collapsed 42 | var bw_margin_collapsed = '-190px'; 43 | var ssb_width_collapsed = '1.0em'; 44 | 45 | // colors used by the current theme 46 | var dark_color = $('.related').css('background-color'); 47 | var light_color = $('.footer').css('color'); 48 | 49 | function sidebar_is_collapsed() { 50 | return sidebarwrapper.is(':not(:visible)'); 51 | } 52 | 53 | function toggle_sidebar() { 54 | if (sidebar_is_collapsed()) 55 | expand_sidebar(); 56 | else 57 | collapse_sidebar(); 58 | } 59 | 60 | function collapse_sidebar() { 61 | sidebarwrapper.hide(); 62 | sidebar.css('width', ssb_width_collapsed); 63 | bodywrapper.css('margin-left', bw_margin_collapsed); 64 | sidebarbutton.css({ 65 | 'margin-left': '0', 66 | 'height': bodywrapper.height() 67 | }); 68 | sidebarbutton.find('span').text('»'); 69 | sidebarbutton.attr('title', _('Expand sidebar')); 70 | document.cookie = 'sidebar=collapsed'; 71 | } 72 | 73 | function expand_sidebar() { 74 | bodywrapper.css('margin-left', bw_margin_expanded); 75 | sidebar.css('width', ssb_width_expanded); 76 | sidebarwrapper.show(); 77 | sidebarbutton.css({ 78 | 'margin-left': ssb_width_expanded-13, 79 | 'height': bodywrapper.height() 80 | }); 81 | sidebarbutton.find('span').text('«'); 82 | sidebarbutton.attr('title', _('Collapse sidebar')); 83 | document.cookie = 'sidebar=expanded'; 84 | } 85 | 86 | function add_sidebar_button() { 87 | sidebarwrapper.css({ 88 | 'float': 'left' , 89 | 'margin-right': '0', 90 | 'width': ssb_width_expanded - 13 91 | }); 92 | // create the button 93 | sidebar.append( 94 | '
<div id="sidebarbutton"><span>«</span></div>
' 95 | ); 96 | var sidebarbutton = $('#sidebarbutton'); 97 | light_color = sidebarbutton.css('background-color'); 98 | // find the height of the viewport to center the '<<' in the page 99 | var viewport_height; 100 | if (window.innerHeight) 101 | viewport_height = window.innerHeight; 102 | else 103 | viewport_height = $(window).height(); 104 | sidebarbutton.find('span').css({ 105 | 'display': 'block', 106 | 'margin-top': (viewport_height - sidebar.position().top + 60) / 2 107 | }); 108 | 109 | sidebarbutton.click(toggle_sidebar); 110 | sidebarbutton.attr('title', _('Collapse sidebar')); 111 | sidebarbutton.css({ 112 | 'border-left': '1px solid ' + dark_color, 113 | 'border-top-left-radius' : '15px', 114 | 'font-size': '1.2em', 115 | 'cursor': 'pointer', 116 | 'height': bodywrapper.height(), 117 | 'padding-top': '1px', 118 | 'margin-left': ssb_width_expanded - 12 119 | }); 120 | 121 | sidebarbutton.hover( 122 | function () { 123 | $(this).css('background-color', '#D0D0D0'); 124 | }, 125 | function () { 126 | $(this).css('background-color', '#F0F0F0'); 127 | } 128 | ); 129 | } 130 | 131 | function set_position_from_cookie() { 132 | if (!document.cookie) 133 | return; 134 | var items = document.cookie.split(';'); 135 | for(var k=0; k 20 33 | radius[far_pts] *= 1.2 34 | radius[~far_pts] *= 1.1 35 | 36 | theta = np.random.random(Npts) * np.pi * 2 37 | 38 | data = np.empty((Npts, 2)) 39 | data[:, 0] = radius * np.cos(theta) 40 | data[:, 1] = radius * np.sin(theta) 41 | 42 | labels = np.ones(Npts) 43 | labels[far_pts] = -1 44 | 45 | return data, labels 46 | 47 | #------------------------------------------------------------ 48 | # Linear model 49 | X, y = linear_model() 50 | clf = svm.SVC(kernel='linear', 51 | gamma=0.01, coef0=0, degree=3) 52 | clf.fit(X, y) 53 | 54 | fig = pl.figure() 55 | ax = pl.subplot(111, xticks=[], yticks=[]) 56 | ax.scatter(X[:, 0], X[:, 1], c=y, cmap=pl.cm.bone) 57 | 58 | ax.scatter(clf.support_vectors_[:, 0], 59 | clf.support_vectors_[:, 1], 60 | s=80, edgecolors="k", facecolors="none") 61 | 62 | delta = 1 63 | y_min, y_max = -50, 50 64 | x_min, x_max = -50, 50 65 | x = np.arange(x_min, x_max + delta, delta) 66 | y = np.arange(y_min, y_max + delta, delta) 67 | X1, X2 = np.meshgrid(x, y) 68 | Z = clf.decision_function(np.c_[X1.ravel(), X2.ravel()]) 69 | Z = Z.reshape(X1.shape) 70 | 71 | levels = [-1.0, 0.0, 1.0] 72 | linestyles = ['dashed', 'solid', 'dashed'] 73 | colors = 'k' 74 | ax.contour(X1, X2, Z, levels, 75 | colors=colors, 76 | linestyles=linestyles) 77 | 78 | 79 | #------------------------------------------------------------ 80 | # RBF model 81 | X, y = nonlinear_model() 82 | clf = svm.SVC(kernel='rbf', 83 | gamma=0.001, coef0=0, degree=3) 84 | clf.fit(X, y) 85 | 86 | fig = pl.figure() 87 | ax = pl.subplot(111, xticks=[], yticks=[]) 88 | ax.scatter(X[:, 0], X[:, 1], c=y, cmap=pl.cm.bone, zorder=2) 89 | 90 | ax.scatter(clf.support_vectors_[:, 0], 91 | clf.support_vectors_[:, 1], 92 | s=80, edgecolors="k", facecolors="none") 93 | 94 | delta = 1 95 | y_min, y_max = -50, 50 96 | x_min, x_max = -50, 50 97 | x = np.arange(x_min, x_max + delta, delta) 98 | y = np.arange(y_min, y_max + delta, delta) 99 | X1, X2 = np.meshgrid(x, y) 100 | Z = clf.decision_function(np.c_[X1.ravel(), X2.ravel()]) 101 | Z = Z.reshape(X1.shape) 102 | 103 | levels = [-1.0, 0.0, 1.0] 104 | linestyles = ['dashed', 'solid', 'dashed'] 105 | colors = 'k' 106 | 107 | ax.contourf(X1, X2, Z, 10, 108 | cmap=matplotlib.cm.bone, 109 | origin='lower', 110 | alpha=0.85, zorder=1) 111 | ax.contour(X1, X2, 
Z, [0.0], 112 | colors='k', 113 | linestyles=['solid'], zorder=1) 114 | 115 | pl.show() 116 | 117 | -------------------------------------------------------------------------------- /examples/plot_iris_projections.py: -------------------------------------------------------------------------------- 1 | """ 2 | Iris Projections 3 | ---------------- 4 | 5 | This code generates the Iris projection example plots found in the tutorial 6 | """ 7 | 8 | from itertools import cycle 9 | import pylab as pl 10 | 11 | from sklearn.datasets import load_iris 12 | from sklearn.decomposition import PCA 13 | 14 | 15 | def plot_2D(data, target, target_names): 16 | colors = cycle('rgbcmykw') 17 | target_ids = range(len(target_names)) 18 | pl.figure() 19 | for i, c, label in zip(target_ids, colors, target_names): 20 | pl.plot(data[target == i, 0], 21 | data[target == i, 1], 'o', 22 | c=c, label=label) 23 | pl.legend(target_names) 24 | 25 | #---------------------------------------------------------------------- 26 | # Load iris data 27 | iris = load_iris() 28 | X, y = iris.data, iris.target 29 | 30 | 31 | #---------------------------------------------------------------------- 32 | # First figure: PCA 33 | pca = PCA(n_components=2, whiten=True).fit(X) 34 | X_pca = pca.transform(X) 35 | plot_2D(X_pca, iris.target, iris.target_names) 36 | 37 | 38 | #---------------------------------------------------------------------- 39 | # Second figure: Kmeans labels 40 | from sklearn.cluster import KMeans 41 | from numpy.random import RandomState 42 | rng = RandomState(42) 43 | kmeans = KMeans(3, random_state=rng).fit(X_pca) 44 | plot_2D(X_pca, kmeans.labels_, ["c0", "c1", "c2"]) 45 | 46 | 47 | pl.show() 48 | -------------------------------------------------------------------------------- /examples/plot_python_101.py: -------------------------------------------------------------------------------- 1 | """ 2 | Basic numerics and plotting with Python 3 | ======================================== 4 | 5 | """ 6 | 7 | # import numpy: the module providing numerical arrays 8 | import numpy as np 9 | t = np.linspace(1, 10, 2000) 10 | 11 | # import pylab: the module for scientific plotting 12 | import pylab as pl 13 | pl.plot(t, np.cos(t)) 14 | -------------------------------------------------------------------------------- /examples/plot_sdss_filters.py: -------------------------------------------------------------------------------- 1 | """ 2 | SDSS Filters 3 | ------------ 4 | 5 | This example downloads and plots the filters from the Sloan Digital Sky 6 | Survey, along with a reference spectrum. 
7 | """ 8 | import os 9 | import urllib2 10 | 11 | import numpy as np 12 | import pylab as pl 13 | from matplotlib.patches import Arrow 14 | 15 | REFSPEC_URL = 'ftp://ftp.stsci.edu/cdbs/current_calspec/1732526_nic_002.ascii' 16 | URL = 'http://www.sdss.org/dr7/instruments/imager/filters/%s.dat' 17 | 18 | def fetch_filter(filt): 19 | assert filt in 'ugriz' 20 | url = URL % filt 21 | 22 | if not os.path.exists('downloads'): 23 | os.makedirs('downloads') 24 | 25 | loc = os.path.join('downloads', '%s.dat' % filt) 26 | if not os.path.exists(loc): 27 | print "downloading from %s" % url 28 | F = urllib2.urlopen(url) 29 | open(loc, 'w').write(F.read()) 30 | 31 | F = open(loc) 32 | 33 | data = np.loadtxt(F) 34 | return data 35 | 36 | def fetch_vega_spectrum(): 37 | if not os.path.exists('downloads'): 38 | os.makedirs('downloads') 39 | 40 | refspec_file = os.path.join('downloads', REFSPEC_URL.split('/')[-1]) 41 | 42 | if not os.path.exists(refspec_file): 43 | print "downloading from %s" % REFSPEC_URL 44 | F = urllib2.urlopen(REFSPEC_URL) 45 | open(refspec_file, 'w').write(F.read()) 46 | 47 | F = open(refspec_file) 48 | 49 | data = np.loadtxt(F) 50 | return data 51 | 52 | 53 | Xref = fetch_vega_spectrum() 54 | Xref[:, 1] /= 2.1 * Xref[:, 1].max() 55 | 56 | #---------------------------------------------------------------------- 57 | # Plot filters in color with a single spectrum 58 | pl.figure() 59 | pl.plot(Xref[:, 0], Xref[:, 1], '-k', lw=2) 60 | 61 | for f,c in zip('ugriz', 'bgrmk'): 62 | X = fetch_filter(f) 63 | pl.fill(X[:, 0], X[:, 1], ec=c, fc=c, alpha=0.4) 64 | 65 | kwargs = dict(fontsize=20, ha='center', va='center', alpha=0.5) 66 | pl.text(3500, 0.02, 'u', color='b', **kwargs) 67 | pl.text(4600, 0.02, 'g', color='g', **kwargs) 68 | pl.text(6100, 0.02, 'r', color='r', **kwargs) 69 | pl.text(7500, 0.02, 'i', color='m', **kwargs) 70 | pl.text(8800, 0.02, 'z', color='k', **kwargs) 71 | 72 | pl.xlim(3000, 11000) 73 | 74 | pl.title('SDSS Filters and Reference Spectrum') 75 | pl.xlabel('Wavelength (Angstroms)') 76 | pl.ylabel('normalized flux / filter transmission') 77 | 78 | #---------------------------------------------------------------------- 79 | # Plot filters in gray with several redshifted spectra 80 | pl.figure() 81 | 82 | redshifts = [0.0, 0.4, 0.8] 83 | colors = 'bgr' 84 | 85 | for z, c in zip(redshifts, colors): 86 | pl.plot((1. 
+ z) * Xref[:, 0], Xref[:, 1], color=c) 87 | 88 | pl.gca().add_patch(Arrow(4200, 0.47, 1300, 0, lw=0, width=0.05, color='r')) 89 | pl.gca().add_patch(Arrow(5800, 0.47, 1250, 0, lw=0, width=0.05, color='r')) 90 | 91 | pl.text(3800, 0.49, 'z = 0.0', fontsize=14, color=colors[0]) 92 | pl.text(5500, 0.49, 'z = 0.4', fontsize=14, color=colors[1]) 93 | pl.text(7300, 0.49, 'z = 0.8', fontsize=14, color=colors[2]) 94 | 95 | for f in 'ugriz': 96 | X = fetch_filter(f) 97 | pl.fill(X[:, 0], X[:, 1], ec='k', fc='k', alpha=0.2) 98 | 99 | kwargs = dict(fontsize=20, color='gray', ha='center', va='center') 100 | pl.text(3500, 0.02, 'u', **kwargs) 101 | pl.text(4600, 0.02, 'g', **kwargs) 102 | pl.text(6100, 0.02, 'r', **kwargs) 103 | pl.text(7500, 0.02, 'i', **kwargs) 104 | pl.text(8800, 0.02, 'z', **kwargs) 105 | 106 | pl.xlim(3000, 11000) 107 | pl.ylim(0, 0.55) 108 | 109 | pl.title('Redshifting of a Spectrum') 110 | pl.xlabel('Observed Wavelength (Angstroms)') 111 | pl.ylabel('normalized flux / filter transmission') 112 | 113 | pl.show() 114 | -------------------------------------------------------------------------------- /examples/plot_sdss_images.py: -------------------------------------------------------------------------------- 1 | """ 2 | SDSS Images 3 | ----------- 4 | 5 | This script plots an example quasar, star, and galaxy image for use in 6 | the tutorial. 7 | """ 8 | import os 9 | import urllib2 10 | 11 | import pylab as pl 12 | from matplotlib import image 13 | 14 | def _fetch(outfile, RA, DEC, scale=0.2, width=400, height=400): 15 | """Fetch the image at the given RA, DEC from the SDSS server""" 16 | url = ("http://casjobs.sdss.org/ImgCutoutDR7/" 17 | "getjpeg.aspx?ra=%.8f&dec=%.8f&scale=%.2f&width=%i&height=%i" 18 | % (RA, DEC, scale, width, height)) 19 | print "downloading %s" % url 20 | print " -> %s" % outfile 21 | fhandle = urllib2.urlopen(url) 22 | open(outfile, 'w').write(fhandle.read()) 23 | 24 | 25 | def fetch_image(object_type): 26 | """Return the data array for the image of object type""" 27 | if not os.path.exists('downloads'): 28 | os.makedirs('downloads') 29 | 30 | filename = os.path.join('downloads', '%s_image.jpg' % object_type) 31 | if not os.path.exists(filename): 32 | RA = image_locations[object_type]['RA'] 33 | DEC = image_locations[object_type]['DEC'] 34 | _fetch(filename, RA, DEC) 35 | 36 | return image.imread(filename) 37 | 38 | 39 | image_locations = dict(star=dict(RA=180.63040108, 40 | DEC=64.96767375), 41 | galaxy=dict(RA=197.51943983, 42 | DEC=0.94881436), 43 | quasar=dict(RA=226.18451462, 44 | DEC=4.07456639)) 45 | 46 | 47 | # Plot the images 48 | fig = pl.figure(figsize=(9, 3)) 49 | 50 | # Check that PIL is installed for jpg support 51 | if 'jpg' not in fig.canvas.get_supported_filetypes(): 52 | raise ValueError("PIL required to load SDSS jpeg images") 53 | 54 | object_types = ['star', 'galaxy', 'quasar'] 55 | 56 | for i, object_type in enumerate(object_types): 57 | ax = pl.subplot(131 + i, xticks=[], yticks=[]) 58 | I = fetch_image(object_type) 59 | ax.imshow(I) 60 | if object_type != 'galaxy': 61 | pl.arrow(0.65, 0.65, -0.1, -0.1, width=0.005, head_width=0.03, 62 | length_includes_head=True, 63 | color='w', transform=ax.transAxes) 64 | pl.text(0.99, 0.01, object_type, fontsize='large', color='w', ha='right', 65 | transform=ax.transAxes) 66 | 67 | pl.subplots_adjust(bottom=0.04, top=0.94, left=0.02, right=0.98, wspace=0.04) 68 | 69 | pl.show() 70 | -------------------------------------------------------------------------------- /examples/plot_sdss_photoz.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | SDSS Photometric Redshifts 3 | -------------------------- 4 | 5 | This example shows how a Decision tree can be used to learn redshifts 6 | of galaxies in the Sloan Digital Sky Survey. 7 | """ 8 | 9 | import os 10 | import urllib2 11 | import numpy as np 12 | import pylab as pl 13 | 14 | from sklearn.datasets import get_data_home 15 | from sklearn.tree import DecisionTreeRegressor 16 | 17 | DATA_URL = ('http://www.astro.washington.edu/users/' 18 | 'vanderplas/pydata/sdss_photoz.npy') 19 | LOCAL_FILE = 'sdss_photoz.npy' 20 | 21 | def fetch_photoz_data(): 22 | if not os.path.exists('downloads'): 23 | os.makedirs('downloads') 24 | 25 | local_file = os.path.join('downloads', LOCAL_FILE) 26 | 27 | if not os.path.exists(local_file): 28 | # data directory is password protected so the public can't access it 29 | password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm() 30 | password_mgr.add_password(None, DATA_URL, 'pydata', 'astroML') 31 | handler = urllib2.HTTPBasicAuthHandler(password_mgr) 32 | opener = urllib2.build_opener(handler) 33 | fhandle = opener.open(DATA_URL) 34 | open(local_file, 'w').write(fhandle.read()) 35 | 36 | return np.load(local_file) 37 | 38 | data = fetch_photoz_data() 39 | 40 | N = len(data) 41 | 42 | # put colors in a matrix 43 | X = np.zeros((N, 4)) 44 | X[:, 0] = data['u'] - data['g'] 45 | X[:, 1] = data['g'] - data['r'] 46 | X[:, 2] = data['r'] - data['i'] 47 | X[:, 3] = data['i'] - data['z'] 48 | z = data['redshift'] 49 | 50 | # divide into training and testing data 51 | Ntrain = 3 * N / 4 52 | Xtrain = X[:Ntrain] 53 | ztrain = z[:Ntrain] 54 | 55 | Xtest = X[Ntrain:] 56 | ztest = z[Ntrain:] 57 | 58 | 59 | clf = DecisionTreeRegressor(max_depth=20) 60 | clf.fit(Xtrain, ztrain) 61 | zpred = clf.predict(Xtest) 62 | 63 | axis_lim = np.array([-0.1, 2.5]) 64 | 65 | rms = np.sqrt(np.mean((ztest - zpred) ** 2)) 66 | print rms 67 | print len(ztest) 68 | print np.sum(abs(ztest - zpred) > 1) 69 | 70 | ax = pl.axes() 71 | pl.scatter(ztest, zpred, c='k', lw=0, s=4) 72 | pl.plot(axis_lim, axis_lim, '--k') 73 | pl.plot(axis_lim, axis_lim + rms, ':k') 74 | pl.plot(axis_lim, axis_lim - rms, ':k') 75 | pl.xlim(axis_lim) 76 | pl.ylim(axis_lim) 77 | 78 | pl.text(0.99, 0.02, "RMS error = %.2g" % rms, 79 | ha='right', va='bottom', transform=ax.transAxes, 80 | bbox=dict(ec='w', fc='w'), fontsize=16) 81 | 82 | pl.title('Photo-z: Decision Tree Regression') 83 | pl.xlabel(r'$\mathrm{z_{true}}$', fontsize=14) 84 | pl.ylabel(r'$\mathrm{z_{phot}}$', fontsize=14) 85 | pl.show() 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /examples/plot_sdss_specPCA.py: -------------------------------------------------------------------------------- 1 | """ 2 | SDSS Spectra Plots 3 | ------------------ 4 | 5 | This plots some of the SDSS spectra examples for the astronomy tutorial 6 | """ 7 | import os 8 | import urllib2 9 | 10 | import numpy as np 11 | import pylab as pl 12 | 13 | from sklearn import preprocessing 14 | from sklearn.decomposition import RandomizedPCA 15 | 16 | DATA_URL = ('http://www.astro.washington.edu/users/' 17 | 'vanderplas/pydata/spec4000_corrected.npz') 18 | 19 | def fetch_sdss_spec_data(): 20 | if not os.path.exists('downloads'): 21 | os.makedirs('downloads') 22 | 23 | local_file = os.path.join('downloads', os.path.basename(DATA_URL)) 24 | 25 | # data directory is password protected so the public can't access it 26 | password_mgr = 
urllib2.HTTPPasswordMgrWithDefaultRealm() 27 | password_mgr.add_password(None, DATA_URL, 'pydata', 'astroML') 28 | handler = urllib2.HTTPBasicAuthHandler(password_mgr) 29 | opener = urllib2.build_opener(handler) 30 | 31 | # download training data 32 | if not os.path.exists(local_file): 33 | fhandle = opener.open(DATA_URL) 34 | open(local_file, 'w').write(fhandle.read()) 35 | 36 | return np.load(local_file) 37 | 38 | #---------------------------------------------------------------------- 39 | # 40 | # Load the data 41 | data = fetch_sdss_spec_data() 42 | 43 | wavelengths = data['wavelengths'] 44 | X = data['X'] 45 | y = data['y'] 46 | labels = data['labels'] 47 | 48 | from matplotlib.ticker import FuncFormatter 49 | format = FuncFormatter(lambda i, *args: labels[i].replace(' ', '\n')) 50 | 51 | #---------------------------------------------------------------------- 52 | # 53 | # Plot the first few spectra, offset so they don't overlap 54 | # 55 | pl.figure() 56 | 57 | for i_class in (2, 3, 4, 5, 6): 58 | i = np.where(y == i_class)[0][0] 59 | l = pl.plot(wavelengths, X[i] + 20 * i_class) 60 | c = l[0].get_color() 61 | pl.text(6800, 2 + 20 * i_class, labels[i_class], color=c) 62 | 63 | pl.subplots_adjust(hspace=0) 64 | pl.xlabel('wavelength (Angstroms)') 65 | pl.ylabel('flux + offset') 66 | pl.title('Sample of Spectra') 67 | 68 | #---------------------------------------------------------------------- 69 | # 70 | # Plot the mean spectrum 71 | # 72 | X = preprocessing.normalize(X, 'l2') 73 | 74 | pl.figure() 75 | 76 | mu = X.mean(0) 77 | std = X.std(0) 78 | 79 | pl.plot(wavelengths, mu, color='black') 80 | pl.fill_between(wavelengths, mu - std, mu + std, color='#CCCCCC') 81 | pl.xlim(wavelengths[0], wavelengths[-1]) 82 | pl.ylim(0, 0.06) 83 | pl.xlabel('wavelength (Angstroms)') 84 | pl.ylabel('scaled flux') 85 | pl.title('Mean Spectrum + Variance') 86 | 87 | #---------------------------------------------------------------------- 88 | # 89 | # Plot a random pair of digits 90 | # 91 | pl.figure() 92 | np.random.seed(25255) 93 | i1, i2 = np.random.randint(1000, size=2) 94 | 95 | pl.scatter(X[:, i1], X[:, i2], c=y, s=4, lw=0, 96 | vmin=2, vmax=6, cmap=pl.cm.jet) 97 | pl.colorbar(ticks = range(2, 7), format=format) 98 | pl.xlabel('wavelength = %.1f' % wavelengths[i1]) 99 | pl.ylabel('wavelength = %.1f' % wavelengths[i2]) 100 | pl.title('Random Pair of Spectra Bins') 101 | 102 | #---------------------------------------------------------------------- 103 | # 104 | # Perform PCA 105 | # 106 | 107 | rpca = RandomizedPCA(n_components=4, random_state=0) 108 | X_proj = rpca.fit_transform(X) 109 | 110 | #---------------------------------------------------------------------- 111 | # 112 | # Plot PCA components 113 | # 114 | 115 | pl.figure() 116 | pl.scatter(X_proj[:, 0], X_proj[:, 1], c=y, s=4, lw=0, 117 | vmin=2, vmax=6, cmap=pl.cm.jet) 118 | pl.colorbar(ticks = range(2, 7), format=format) 119 | pl.xlabel('coefficient 1') 120 | pl.ylabel('coefficient 2') 121 | pl.title('PCA projection of Spectra') 122 | 123 | #---------------------------------------------------------------------- 124 | # 125 | # Plot PCA eigenspectra 126 | # 127 | 128 | pl.figure() 129 | 130 | l = pl.plot(wavelengths, rpca.mean_ - 0.15) 131 | c = l[0].get_color() 132 | pl.text(7000, -0.16, "mean" % i, color=c) 133 | 134 | for i in range(4): 135 | l = pl.plot(wavelengths, rpca.components_[i] + 0.15 * i) 136 | c = l[0].get_color() 137 | pl.text(7000, -0.01 + 0.15 * i, "component %i" % (i + 1), color=c) 138 | pl.ylim(-0.2, 0.6) 139 | 
pl.xlabel('wavelength (Angstroms)') 140 | pl.ylabel('scaled flux + offset') 141 | pl.title('Mean Spectrum and Eigen-spectra') 142 | 143 | pl.show() 144 | -------------------------------------------------------------------------------- /examples/plot_sgd_separating_hyperplane.py: -------------------------------------------------------------------------------- 1 | """ 2 | ========================================= 3 | SGD: Maximum margin separating hyperplane 4 | ========================================= 5 | 6 | Plot the maximum margin separating hyperplane within a two-class 7 | separable dataset using a linear Support Vector Machines classifier 8 | trained using SGD. 9 | """ 10 | print __doc__ 11 | 12 | import numpy as np 13 | import pylab as pl 14 | from sklearn.linear_model import SGDClassifier 15 | from sklearn.datasets.samples_generator import make_blobs 16 | 17 | # we create 50 separable points 18 | X, Y = make_blobs(n_samples=50, centers=2, random_state=0, cluster_std=0.60) 19 | 20 | # fit the model 21 | clf = SGDClassifier(loss="hinge", alpha=0.01, n_iter=200, fit_intercept=True) 22 | clf.fit(X, Y) 23 | 24 | # plot the line, the points, and the nearest vectors to the plane 25 | xx = np.linspace(-1, 5, 10) 26 | yy = np.linspace(-1, 5, 10) 27 | 28 | X1, X2 = np.meshgrid(xx, yy) 29 | Z = np.empty(X1.shape) 30 | for (i, j), val in np.ndenumerate(X1): 31 | x1 = val 32 | x2 = X2[i, j] 33 | p = clf.decision_function([x1, x2]) 34 | Z[i, j] = p[0] 35 | levels = [-1.0, 0.0, 1.0] 36 | linestyles = ['dashed', 'solid', 'dashed'] 37 | colors = 'k' 38 | pl.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles) 39 | pl.scatter(X[:, 0], X[:, 1], c=Y, cmap=pl.cm.Paired) 40 | 41 | pl.axis('tight') 42 | pl.show() 43 | --------------------------------------------------------------------------------
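A note on plot_sgd_separating_hyperplane.py above: it evaluates the decision function one grid point at a time in a Python loop. The SVC example earlier in this collection evaluates the whole grid with a single decision_function call, and the same pattern applies here. The code below is a minimal sketch of that variant, not part of the original examples; it keeps the era-specific samples_generator import and n_iter parameter used above (newer scikit-learn releases moved and renamed both).

import numpy as np
import pylab as pl
from sklearn.linear_model import SGDClassifier
from sklearn.datasets.samples_generator import make_blobs

# same data and model as in plot_sgd_separating_hyperplane.py
X, Y = make_blobs(n_samples=50, centers=2, random_state=0, cluster_std=0.60)
clf = SGDClassifier(loss="hinge", alpha=0.01, n_iter=200, fit_intercept=True)
clf.fit(X, Y)

# evaluate the decision function on the whole grid in one call instead of a loop
xx = np.linspace(-1, 5, 50)
yy = np.linspace(-1, 5, 50)
X1, X2 = np.meshgrid(xx, yy)
Z = clf.decision_function(np.c_[X1.ravel(), X2.ravel()]).reshape(X1.shape)

# solid line: decision boundary; dashed lines: margins at distance +/- 1
pl.contour(X1, X2, Z, [-1.0, 0.0, 1.0], colors='k',
           linestyles=['dashed', 'solid', 'dashed'])
pl.scatter(X[:, 0], X[:, 1], c=Y, cmap=pl.cm.Paired)
pl.axis('tight')
pl.show()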
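The photometric-redshift example (plot_sdss_photoz.py) splits its catalog by taking the first three quarters of the rows for training; a shuffled split avoids depending on the row order of the data file. The helper below is an illustrative sketch, not part of that script; it assumes scikit-learn's train_test_split is available (found in sklearn.cross_validation in releases of this era and in sklearn.model_selection today).

import numpy as np
from sklearn.cross_validation import train_test_split  # sklearn.model_selection in newer releases
from sklearn.tree import DecisionTreeRegressor


def rms_error_for_depth(X, z, max_depth=20, test_size=0.25, random_state=0):
    """Hold out a random fraction of the catalog and return the test RMS error.

    X and z are the color matrix and redshift array built in plot_sdss_photoz.py;
    this helper (and its name) is illustrative, not part of that script.
    """
    Xtrain, Xtest, ztrain, ztest = train_test_split(
        X, z, test_size=test_size, random_state=random_state)
    clf = DecisionTreeRegressor(max_depth=max_depth)
    clf.fit(Xtrain, ztrain)
    zpred = clf.predict(Xtest)
    return np.sqrt(np.mean((ztest - zpred) ** 2))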
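Along the same lines, plot_sdss_specPCA.py projects the spectra onto four principal components but does not report how much variance they retain. The sketch below is an addition, assuming the same l2-normalized matrix X built in that script; it uses the plain PCA estimator, which exposes explained_variance_ratio_ (whether the randomized variant of that era does is not verified here).

from sklearn.decomposition import PCA


def explained_variance(X, n_components=4):
    """Return the fraction of variance captured by each of the leading components."""
    pca = PCA(n_components=n_components).fit(X)
    return pca.explained_variance_ratio_

# usage sketch: ratios = explained_variance(X); ratios.sum() is the total fraction retained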