├── .gitignore ├── LICENSE ├── README.md ├── abstract.rst ├── check_env.py ├── fetch_data.py ├── notebooks ├── 01.1 Introduction to Machine Learning.ipynb ├── 01.2 IPython Numpy and Matplotlib Refresher.ipynb ├── 01.3 Data Representation for Machine Learning.ipynb ├── 01.4 Training and Testing Data.ipynb ├── 02.1 Supervised Learning - Classification.ipynb ├── 02.2 Supervised Learning - Regression.ipynb ├── 02.3 Unsupervised Learning - Transformations and Dimensionality Reduction.ipynb ├── 02.4 Unsupervised Learning - Clustering.ipynb ├── 02.5 Review of Scikit-learn API.ipynb ├── 03.1 Case Study - Supervised Classification of Handwritten Digits.ipynb ├── 03.2 Methods - Unsupervised Preprocessing.ipynb ├── 03.3 Case Study - Face Recognition with Eigenfaces.ipynb ├── 03.4 Methods - Text Feature Extraction.ipynb ├── 03.5 Case Study - SMS Spam Detection.ipynb ├── 03.6 Case Study - Titanic Survival.ipynb ├── 04.1 Cross Validation.ipynb ├── 04.2 Model Complexity and GridSearchCV.ipynb ├── 04.3 Analyzing Model Capacity.ipynb ├── 04.4 Model Evaluation and Scoring Metrics.ipynb ├── 05.1 In Depth - Linear Models.ipynb ├── 05.2 In Depth - Support Vector Machines.ipynb ├── 05.3 In Depth - Trees and Forests.ipynb ├── 06.1 Pipelining Estimators.ipynb ├── 07.1 Case Study - Large Scale Text Classification.ipynb ├── datasets │ ├── smsspam │ │ ├── SMSSpamCollection │ │ └── readme │ └── titanic │ │ └── titanic3.csv ├── figures │ ├── ML_flow_chart.py │ ├── __init__.py │ ├── bag_of_words.svg │ ├── cluster_comparison.png │ ├── cross_validation.svg │ ├── data_representation.svg │ ├── feature_union.svg │ ├── grid_search_cross_validation.svg │ ├── hashing_vectorizer.svg │ ├── iris_setosa.jpg │ ├── iris_versicolor.jpg │ ├── iris_virginica.jpg │ ├── overfitting_underfitting_cartoon.svg │ ├── petal_sepal.jpg │ ├── pipeline_cross_validation.svg │ ├── plot_2d_separator.py │ ├── plot_digits_datasets.py │ ├── plot_interactive_forest.py │ ├── plot_interactive_tree.py │ ├── plot_kneighbors_regularization.py │ ├── plot_linear_svc_regularization.py │ ├── plot_rbf_svm_parameters.py │ ├── randomized_search.png │ ├── supervised_scikit_learn.png │ ├── supervised_workflow.svg │ ├── train_test_split.svg │ ├── train_validation_test2.svg │ └── unsupervised_workflow.svg ├── helpers.py ├── images │ ├── parallel_text_clf.png │ └── parallel_text_clf_average.png └── solutions │ ├── 02A_faces_plot.py │ ├── 04B_houses_regression.py │ ├── 04C_validation_exercise.py │ ├── 05B_strip_headers.py │ ├── 06B_basic_grid_search.py │ ├── 06B_learning_curves.py │ ├── 07B_grid_search.py │ ├── 08A_digits_projection.py │ └── 08B_digits_clustering.py ├── overfitting plots-checkpoint.ipynb └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | notebooks/.ipynb_checkpoints/ 3 | notebooks/datasets 4 | notebooks/joblib/ 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | CC0 1.0 Universal 2 | 3 | Statement of Purpose 4 | 5 | The laws of most jurisdictions throughout the world automatically confer 6 | exclusive Copyright and Related Rights (defined below) upon the creator and 7 | subsequent owner(s) (each and all, an "owner") of an original work of 8 | authorship and/or a database (each, a "Work"). 
9 | 10 | Certain owners wish to permanently relinquish those rights to a Work for the 11 | purpose of contributing to a commons of creative, cultural and scientific 12 | works ("Commons") that the public can reliably and without fear of later 13 | claims of infringement build upon, modify, incorporate in other works, reuse 14 | and redistribute as freely as possible in any form whatsoever and for any 15 | purposes, including without limitation commercial purposes. These owners may 16 | contribute to the Commons to promote the ideal of a free culture and the 17 | further production of creative, cultural and scientific works, or to gain 18 | reputation or greater distribution for their Work in part through the use and 19 | efforts of others. 20 | 21 | For these and/or other purposes and motivations, and without any expectation 22 | of additional consideration or compensation, the person associating CC0 with a 23 | Work (the "Affirmer"), to the extent that he or she is an owner of Copyright 24 | and Related Rights in the Work, voluntarily elects to apply CC0 to the Work 25 | and publicly distribute the Work under its terms, with knowledge of his or her 26 | Copyright and Related Rights in the Work and the meaning and intended legal 27 | effect of CC0 on those rights. 28 | 29 | 1. Copyright and Related Rights. A Work made available under CC0 may be 30 | protected by copyright and related or neighboring rights ("Copyright and 31 | Related Rights"). Copyright and Related Rights include, but are not limited 32 | to, the following: 33 | 34 | i. the right to reproduce, adapt, distribute, perform, display, communicate, 35 | and translate a Work; 36 | 37 | ii. moral rights retained by the original author(s) and/or performer(s); 38 | 39 | iii. publicity and privacy rights pertaining to a person's image or likeness 40 | depicted in a Work; 41 | 42 | iv. rights protecting against unfair competition in regards to a Work, 43 | subject to the limitations in paragraph 4(a), below; 44 | 45 | v. rights protecting the extraction, dissemination, use and reuse of data in 46 | a Work; 47 | 48 | vi. database rights (such as those arising under Directive 96/9/EC of the 49 | European Parliament and of the Council of 11 March 1996 on the legal 50 | protection of databases, and under any national implementation thereof, 51 | including any amended or successor version of such directive); and 52 | 53 | vii. other similar, equivalent or corresponding rights throughout the world 54 | based on applicable law or treaty, and any national implementations thereof. 55 | 56 | 2. Waiver. To the greatest extent permitted by, but not in contravention of, 57 | applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and 58 | unconditionally waives, abandons, and surrenders all of Affirmer's Copyright 59 | and Related Rights and associated claims and causes of action, whether now 60 | known or unknown (including existing as well as future claims and causes of 61 | action), in the Work (i) in all territories worldwide, (ii) for the maximum 62 | duration provided by applicable law or treaty (including future time 63 | extensions), (iii) in any current or future medium and for any number of 64 | copies, and (iv) for any purpose whatsoever, including without limitation 65 | commercial, advertising or promotional purposes (the "Waiver"). 
Affirmer makes 66 | the Waiver for the benefit of each member of the public at large and to the 67 | detriment of Affirmer's heirs and successors, fully intending that such Waiver 68 | shall not be subject to revocation, rescission, cancellation, termination, or 69 | any other legal or equitable action to disrupt the quiet enjoyment of the Work 70 | by the public as contemplated by Affirmer's express Statement of Purpose. 71 | 72 | 3. Public License Fallback. Should any part of the Waiver for any reason be 73 | judged legally invalid or ineffective under applicable law, then the Waiver 74 | shall be preserved to the maximum extent permitted taking into account 75 | Affirmer's express Statement of Purpose. In addition, to the extent the Waiver 76 | is so judged Affirmer hereby grants to each affected person a royalty-free, 77 | non transferable, non sublicensable, non exclusive, irrevocable and 78 | unconditional license to exercise Affirmer's Copyright and Related Rights in 79 | the Work (i) in all territories worldwide, (ii) for the maximum duration 80 | provided by applicable law or treaty (including future time extensions), (iii) 81 | in any current or future medium and for any number of copies, and (iv) for any 82 | purpose whatsoever, including without limitation commercial, advertising or 83 | promotional purposes (the "License"). The License shall be deemed effective as 84 | of the date CC0 was applied by Affirmer to the Work. Should any part of the 85 | License for any reason be judged legally invalid or ineffective under 86 | applicable law, such partial invalidity or ineffectiveness shall not 87 | invalidate the remainder of the License, and in such case Affirmer hereby 88 | affirms that he or she will not (i) exercise any of his or her remaining 89 | Copyright and Related Rights in the Work or (ii) assert any associated claims 90 | and causes of action with respect to the Work, in either case contrary to 91 | Affirmer's express Statement of Purpose. 92 | 93 | 4. Limitations and Disclaimers. 94 | 95 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 96 | surrendered, licensed or otherwise affected by this document. 97 | 98 | b. Affirmer offers the Work as-is and makes no representations or warranties 99 | of any kind concerning the Work, express, implied, statutory or otherwise, 100 | including without limitation warranties of title, merchantability, fitness 101 | for a particular purpose, non infringement, or the absence of latent or 102 | other defects, accuracy, or the present or absence of errors, whether or not 103 | discoverable, all to the greatest extent permissible under applicable law. 104 | 105 | c. Affirmer disclaims responsibility for clearing rights of other persons 106 | that may apply to the Work or any use thereof, including without limitation 107 | any person's Copyright and Related Rights in the Work. Further, Affirmer 108 | disclaims responsibility for obtaining any necessary consents, permissions 109 | or other rights required for any use of the Work. 110 | 111 | d. Affirmer understands and acknowledges that Creative Commons is not a 112 | party to this document and has no duty or obligation with respect to this 113 | CC0 or use of the Work. 
114 | 115 | For more information, please see 116 | 117 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | SciPy 2015 Scikit-learn Tutorial 2 | ================================ 3 | 4 | You can find the video recordings on youtube: 5 | 6 | - [Part 1](https://www.youtube.com/watch?v=80fZrVMurPM) 7 | - [Part 2](https://www.youtube.com/watch?v=Ud-FsEWegmA) 8 | 9 | 10 | Based on the SciPy [2013 tutorial](https://github.com/jakevdp/sklearn_scipy2013) by [Gael Varoquaux](http://gael-varoquaux.info), [Olivier Grisel](http://ogrisel.com) and [Jake VanderPlas](http://jakevdp.github.com 11 | ). 12 | 13 | 14 | Instructors 15 | ----------- 16 | - [Kyle Kastner](https://kastnerkyle.github.io/) [@kastnerkyle](https://twitter.com/kastnerkyle)- Université de Montréal 17 | - [Andreas Mueller](http://amuller.github.io) [@t3kcit](https://twitter.com/t3kcit) - NYU Center for Data Science 18 | 19 | 20 | This repository will contain files and other info associated with our Scipy 21 | 2015 scikit-learn tutorial. 22 | 23 | Parts 1 to 5 make up the morning session, while 24 | parts 6 to 9 will be presented in the afternoon. 25 | 26 | Installation Notes 27 | ------------------ 28 | 29 | This tutorial will require recent installations of *numpy*, *scipy*, 30 | *matplotlib*, *scikit-learn* and *ipython* with ipython 31 | notebook. 32 | 33 | The last one is important, you should be able to type: 34 | 35 | ipython notebook 36 | 37 | in your terminal window and see the notebook panel load in your web browser. 38 | Try opening and running a notebook from the material to see check that it works. 39 | 40 | For users who do not yet have these packages installed, a relatively 41 | painless way to install all the requirements is to use a package such as 42 | [Anaconda CE](http://store.continuum.io/ "Anaconda CE"), which can be 43 | downloaded and installed for free. 44 | Python2.7 and 3.4 should both work fine for this tutorial. 45 | 46 | After getting the material, you should run ``python check_env.py`` to verify 47 | your environment. 48 | 49 | Downloading the Tutorial Materials 50 | ---------------------------------- 51 | I would highly recommend using git, not only for this tutorial, but for the 52 | general betterment of your life. Once git is installed, you can clone the 53 | material in this tutorial by using the git address shown above: 54 | 55 | git clone git://github.com/amueller/scipy_2015_sklearn_tutorial.git 56 | 57 | If you can't or don't want to install git, there is a link above to download 58 | the contents of this repository as a zip file. We may make minor changes to 59 | the repository in the days before the tutorial, however, so cloning the 60 | repository is a much better option. 61 | 62 | Data Downloads 63 | -------------- 64 | The data for this tutorial is not included in the repository. We will be 65 | using several data sets during the tutorial: most are built-in to 66 | scikit-learn, which 67 | includes code which automatically downloads and caches these 68 | data. Because the wireless network 69 | at conferences can often be spotty, it would be a good idea to download these 70 | data sets before arriving at the conference. 71 | Run ``fetch_data.py`` to download all necessary data beforehand. 72 | 73 | Outline 74 | ======= 75 | 76 | Morning Session 77 | ---------------- 78 | - What is machine learning? 
(Sample applications) 79 | - Kinds of machine learning: unsupervised vs supervised. 80 | - Data formats and preparation. 81 | - Supervised learning 82 | - Interface 83 | - Training and test data 84 | - Classification 85 | - Regression 86 | - Unsupervised Learning 87 | - Unsupervised transformers 88 | - Preprocessing and scaling 89 | - Dimensionality reduction 90 | - Clustering 91 | - Summary : Estimator interface 92 | - Application : Classification of digits 93 | - Application : Eigenfaces 94 | - Methods: Text feature extraction, bag of words 95 | - Application : SMS spam detection 96 | - Summary : Model building and generalization 97 | 98 | 99 | Afternoon Session 100 | ------------------ 101 | - Cross-Validation 102 | - Model Complexity: Overfitting and underfitting 103 | - Complexity of various model types 104 | - Grid search for adjusting hyperparameters 105 | - Basic regression with cross-validation 106 | - Application : Titanic survival with Random Forest 107 | - Building Pipelines 108 | - Motivation and Basics 109 | - Preprocessing and Classification 110 | - Grid-searching Parameters of the feature extraction 111 | - Application : Image classification 112 | - Model complexity, learning curves and validation curves 113 | - In-Depth supervised models 114 | - Linear Models 115 | - Kernel SVMs 116 | - Trees and Forests 117 | - Learning with Big Data 118 | - Out-Of-Core learning 119 | - The hashing trick for large text corpora 120 | -------------------------------------------------------------------------------- /abstract.rst: -------------------------------------------------------------------------------- 1 | Tutorial Topic 2 | -------------- 3 | 4 | This tutorial aims to provide an introduction to machine learning and 5 | scikit-learn "from the ground up". We will start with core concepts of machine 6 | learning, some example uses of machine learning, and how to implement them 7 | using scikit-learn. Going in detail through the characteristics of several 8 | methods, we will discuss how to pick an algorithm for your application, how to 9 | set its parameters, and how to evaluate performance. 10 | 11 | Please provide a more detailed abstract of your tutorial (again, see last year's tutorials). 12 | --------------------------------------------------------------------------------------------- 13 | 14 | Machine learning is the task of extracting knowledge from data, often with the 15 | goal of generalizing to new and unseen data. Applications of machine learning 16 | now touch nearly every aspect of everyday life, from the face detection in our 17 | phones and the streams of social media we consume to picking restaurants, 18 | partners, and movies. It has also become indispensable to many empirical 19 | sciences, including physics, astronomy, biology, and the social sciences. 20 | 21 | Scikit-learn has emerged as one of the most popular toolkits for machine 22 | learning, and is now widely used in industry and academia. 23 | The goal of this tutorial is to enable participants to use the wide variety of 24 | machine learning algorithms available in scikit-learn on their own data sets, 25 | for their own domains. 26 | 27 | This tutorial will comprise an introductory morning session and an advanced 28 | afternoon session. The morning part of the tutorial will cover basic concepts 29 | of machine learning, data representation, and preprocessing. We will explain 30 | different problem settings and which algorithms to use in each situation.
31 | We will then go through some sample applications using algorithms implemented 32 | in scikit-learn, including SVMs, Random Forests, K-Means, PCA, t-SNE, and 33 | others. 34 | 35 | In the afternoon session, we will discuss setting hyper-parameters and how to 36 | prevent overfitting. We will go in-depth into the trade-off of model complexity 37 | and dataset size, as well as discussing complexity of learning algorithms and 38 | how to cope with very large datasets. The session will conclude by stepping 39 | through the process of building machine learning pipelines consisting of 40 | feature extraction, preprocessing and supervised learning. 41 | 42 | 43 | Outline 44 | ======== 45 | 46 | Morning Session 47 | ---------------- 48 | - What is machine learning? (Sample applications) 49 | - Kinds of machine learning: unsupervised vs supervised. 50 | - Data formats and preparation. 51 | 52 | - Supervised learning: Interface 53 | - Supervised learning: Training and test data 54 | - Supervised learning: Classification 55 | - Supervised learning: Regression 56 | - Unsupervised Learning: Unsupervised transformers 57 | - Unsupervised Learning: Preprocessing and scaling 58 | - Unsupervised Learning: Dimensionality reduction 59 | - Unsupervised Learning: Clustering 60 | - Summary : Estimator interface 61 | 62 | - Application: Classification of digits 63 | - Methods: Unsupervised learning 64 | - Application : Eigenfaces 65 | - Methods: Text feature abstraction, bag of words 66 | - Application : Insult detection 67 | - Summary : Model building and generalization 68 | 69 | Afternoon Session 70 | ------------------ 71 | - Cross-Validation 72 | - Model Complexity: Overfitting and underfitting 73 | - Complexity of various model types 74 | - Grid search for adjusting hyperparameters 75 | 76 | - Basic regression with cross-validation 77 | - Application : Titanic survival with Random Forest 78 | 79 | - Building Pipelines: Motivation and Basics 80 | - Building Pipelines: Preprocessing and Classification 81 | - Building Pipelines: Grid-searching Parameters of the feature extraction 82 | - Application : Image classification 83 | 84 | - Model complexity, learning curves and validation curves 85 | - In-Depth: Linear Models 86 | - In-Depth: Kernel SVMs 87 | - In-Depth: trees and Forests 88 | 89 | - Learning with Big Data: Out-Of-Core learning 90 | - Learning with Big Data: The hashing trick for large text corpuses 91 | -------------------------------------------------------------------------------- /check_env.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | try: 3 | import curses 4 | curses.setupterm() 5 | assert curses.tigetnum("colors") > 2 6 | OK = "\x1b[1;%dm[ OK ]\x1b[0m" % (30 + curses.COLOR_GREEN) 7 | FAIL = "\x1b[1;%dm[FAIL]\x1b[0m" % (30 + curses.COLOR_RED) 8 | except: 9 | OK = '[ OK ]' 10 | FAIL = '[FAIL]' 11 | 12 | 13 | import sys 14 | try: 15 | import importlib 16 | except ImportError: 17 | print(FAIL, "Python version 3.4 (or 2.7) is required, but %s is installed." % sys.version) 18 | from distutils.version import LooseVersion as Version 19 | 20 | 21 | def import_version(pkg, min_ver, fail_msg=""): 22 | mod = None 23 | try: 24 | mod = importlib.import_module(pkg) 25 | # workaround for Image not having __version__ 26 | version = getattr(mod, "__version__", 0) or getattr(mod, "VERSION", 0) 27 | if Version(version) < min_ver: 28 | print(FAIL, "%s version %s or higher required, but %s installed." 
29 | % (lib, min_ver, version)) 30 | else: 31 | print(OK, '%s version %s' % (pkg, version)) 32 | except ImportError: 33 | print(FAIL, '%s not installed. %s' % (pkg, fail_msg)) 34 | return mod 35 | 36 | 37 | # first check the python version 38 | print('Using python in', sys.prefix) 39 | print(sys.version) 40 | pyversion = Version(sys.version) 41 | if pyversion >= "3": 42 | if pyversion < "3.4": 43 | print(FAIL, "Python version 3.4 (or 2.7) is required, but %s is installed." % sys.version) 44 | elif pyversion >= "2": 45 | if pyversion < "2.7": 46 | print(FAIL, "Python version 2.7 is required, but %s is installed." % sys.version) 47 | else: 48 | print(FAIL, "Unknown Python version: %s" % sys.version) 49 | 50 | print() 51 | requirements = {'numpy': "1.6.1", 'scipy': "0.9", 'matplotlib': "1.0", 52 | 'IPython': "3.0", 'sklearn': "0.15"} 53 | 54 | # now the dependencies 55 | for lib, required_version in list(requirements.items()): 56 | import_version(lib, required_version) 57 | 58 | # pydot is a bit different 59 | import_version("pydot", "0", fail_msg="pydot is not installed. It is not required " 60 | "but you will miss out on some plots. \nYou can install it using " 61 | "'pip install pydot' on python2, and 'pip install " 62 | "git+https://github.com/nlhepler/pydot.git' on python3.") 63 | 64 | import_version("Image", "0", fail_msg="The Image module is not installed." 65 | " Please install the Pillow package, which provides it.") 66 | -------------------------------------------------------------------------------- /fetch_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | try: 3 | from urllib.request import urlopen 4 | except ImportError: 5 | from urllib import urlopen 6 | 7 | import zipfile 8 | 9 | 10 | SENTIMENT140_URL = ("http://cs.stanford.edu/people/alecmgo/" 11 | "trainingandtestdata.zip") 12 | SENTIMENT140_ARCHIVE_NAME = "trainingandtestdata.zip" 13 | 14 | 15 | def get_datasets_folder(): 16 | here = os.path.dirname(__file__) 17 | notebooks = os.path.join(here, 'notebooks') 18 | datasets_folder = os.path.abspath(os.path.join(notebooks, 'datasets')) 19 | datasets_archive = os.path.abspath(os.path.join(notebooks, 'datasets.zip')) 20 | 21 | if not os.path.exists(datasets_folder): 22 | if os.path.exists(datasets_archive): 23 | print("Extracting " + datasets_archive) 24 | zf = zipfile.ZipFile(datasets_archive) 25 | zf.extractall('.') 26 | assert os.path.exists(datasets_folder) 27 | else: 28 | print("Creating datasets folder: " + datasets_folder) 29 | os.makedirs(datasets_folder) 30 | else: 31 | print("Using existing dataset folder:" + datasets_folder) 32 | return datasets_folder 33 | 34 | 35 | def check_sentiment140(datasets_folder): 36 | print("Checking availability of the sentiment 140 dataset") 37 | archive_path = os.path.join(datasets_folder, SENTIMENT140_ARCHIVE_NAME) 38 | sentiment140_path = os.path.join(datasets_folder, 'sentiment140') 39 | train_path = os.path.join(sentiment140_path, 40 | 'training.1600000.processed.noemoticon.csv') 41 | test_path = os.path.join(sentiment140_path, 42 | 'testdata.manual.2009.06.14.csv') 43 | 44 | if not os.path.exists(sentiment140_path): 45 | if not os.path.exists(archive_path): 46 | print("Downloading dataset from %s (77MB)" % SENTIMENT140_URL) 47 | opener = urlopen(SENTIMENT140_URL) 48 | open(archive_path, 'wb').write(opener.read()) 49 | else: 50 | print("Found archive: " + archive_path) 51 | 52 | print("Extracting %s to %s" % (archive_path, sentiment140_path)) 53 | zf = 
zipfile.ZipFile(archive_path) 54 | zf.extractall(sentiment140_path) 55 | print("Checking that the sentiment 140 CSV files exist...") 56 | assert os.path.exists(train_path) 57 | assert os.path.exists(test_path) 58 | print("=> Success!") 59 | 60 | 61 | if __name__ == "__main__": 62 | datasets_folder = get_datasets_folder() 63 | check_sentiment140(datasets_folder) 64 | 65 | print("Loading Labeled Faces Data (~200MB)") 66 | from sklearn.datasets import fetch_lfw_people 67 | fetch_lfw_people(min_faces_per_person=70, resize=0.4, 68 | data_home=datasets_folder) 69 | print("=> Success!") 70 | -------------------------------------------------------------------------------- /notebooks/01.1 Introduction to Machine Learning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction to Machine Learning in Python" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## What is Machine Learning?" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "Machine learning is the process to automatically extract knowledge from data, usually with the goal of making predictions on new, unseen data. A classical example is a spam filter, for which the user keeps labeling incoming mails as either spam or not spam. A machine learning algorithm then \"learns\" what distinguishes spam from normal emails, and can predict for new emails whether they are spam or not.\n", 22 | "\n", 23 | "Central to machine learning is the concept of **making decision automatically** from data, **without the user specifying explicit rules** how this decision should be made.\n", 24 | "\n", 25 | "For the case of emails, the user doesn't provide a list of words or characteristics that make an email spam. Instead, the user provides examples of spam and non-spam emails.\n", 26 | "\n", 27 | "The second central concept is **generalization**. The goal of a machine learning algorithm is to predict on new, previously unseen data. We are not interested in marking an email as spam or not, that the human already labeled. Instead, we want to make the users life easier by making an automatic decision for new incoming mail." 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "The data is presented to the algorithm usually as an array of numbers. Each data point (also known as sample) that we want to either learn from or make a decision on is represented as a list of numbers, called features, that reflect properties of this point." 
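As a small added illustration (not part of the original notebook), the samples-by-features layout described above looks like this for scikit-learn's built-in iris dataset, which reappears throughout the tutorial:

```python
# Added sketch: the "array of numbers" representation described above,
# using scikit-learn's built-in iris dataset.
from sklearn.datasets import load_iris

iris = load_iris()
X = iris.data      # 2D array: one row per sample, one column per feature
y = iris.target    # 1D array: one label per sample

print(X.shape)     # (150, 4) -- 150 samples with 4 features each
print(X[0])        # the first sample as a list of feature values
print(y[0])        # the label of that sample
```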
42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "There are two kinds of machine learning we will talk about today: Supervised learning and unsupervised learning" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "### Supervised Learning: Classification and regression\n", 56 | "\n", 57 | "In **Supervised Learning**, we have a dataset consisting of both input features and a desired output, such as in the spam / no-spam example.\n", 58 | "The task is to construct a model (or program) which is able to predict the desired output of an unseen object\n", 59 | "given the set of features.\n", 60 | "\n", 61 | "Some more complicated examples are:\n", 62 | "\n", 63 | "- given a multicolor image of an object through a telescope, determine\n", 64 | " whether that object is a star, a quasar, or a galaxy.\n", 65 | "- given a photograph of a person, identify the person in the photo.\n", 66 | "- given a list of movies a person has watched and their personal rating\n", 67 | " of the movie, recommend a list of movies they would like.\n", 68 | "- given a persons age, education and position, infer their salary\n", 69 | "\n", 70 | "What these tasks have in common is that there is one or more unknown\n", 71 | "quantities associated with the object which needs to be determined from other\n", 72 | "observed quantities.\n", 73 | "\n", 74 | "Supervised learning is further broken down into two categories, **classification** and **regression**.\n", 75 | "In classification, the label is discrete, such as \"spam\" or \"no spam\". In other words, it provides a clear-cut distinction between categories. In regression, the label is continuous, that is a float output. For example,\n", 76 | "in astronomy, the task of determining whether an object is a star, a galaxy, or a quasar is a\n", 77 | "classification problem: the label is from three distinct categories. On the other hand, we might\n", 78 | "wish to estimate the age of an object based on such observations: this would be a regression problem,\n", 79 | "because the label (age) is a continuous quantity.\n", 80 | "\n", 81 | "In supervised learning, there is always a distinction between a **training set** for which the desired outcome is given, and a **test set** for which the desired outcome needs to be inferred.\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "### Unsupervised Learning\n", 89 | "\n", 90 | "In **Unsupervised Learning** there is no desired output associated with the data.\n", 91 | "Instead, we are interested in extracting some form of knowledge or model from the given data.\n", 92 | "In a sense, you can think of unsupervised learning as a means of discovering labels from the data itself.\n", 93 | "Unsupervised learning is often harder to understand and to evaluate.\n", 94 | "\n", 95 | "Unsupervised learning comprises tasks such as *dimensionality reduction*, *clustering*, and\n", 96 | "*density estimation*. For example, in the iris data discussed above, we can used unsupervised\n", 97 | "methods to determine combinations of the measurements which best display the structure of the\n", 98 | "data. As we’ll see below, such a projection of the data can be used to visualize the\n", 99 | "four-dimensional dataset in two dimensions. 
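As a rough added sketch of the projection idea just mentioned (PCA itself is introduced later in the tutorial), the four iris measurements can be reduced to two dimensions purely for plotting:

```python
# Added sketch: visualize the 4-dimensional iris data in 2 dimensions with PCA.
# This is an unsupervised transformation: the labels are only used to color the plot.
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA

iris = load_iris()
X_2d = PCA(n_components=2).fit_transform(iris.data)

plt.scatter(X_2d[:, 0], X_2d[:, 1], c=iris.target)
plt.xlabel("first principal component")
plt.ylabel("second principal component")
plt.show()
```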
Some more involved unsupervised learning problems are:\n", 100 | "\n", 101 | "- given detailed observations of distant galaxies, determine which features or combinations of\n", 102 | " features summarize best the information.\n", 103 | "- given a mixture of two sound sources (for example, a person talking over some music),\n", 104 | " separate the two (this is called the [blind source separation](http://en.wikipedia.org/wiki/Blind_signal_separation) problem).\n", 105 | "- given a video, isolate a moving object and categorize in relation to other moving objects which have been seen.\n", 106 | "- given a large collection of news articles, find recurring topics inside these articles.\n", 107 | "- given a collection of images, cluster similar images together (for example to group them when visualizing a collection)\n", 108 | "\n", 109 | "Sometimes the two may even be combined: e.g. Unsupervised learning can be used to find useful\n", 110 | "features in heterogeneous data, and then these features can be used within a supervised\n", 111 | "framework." 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": { 118 | "collapsed": true 119 | }, 120 | "outputs": [], 121 | "source": [] 122 | } 123 | ], 124 | "metadata": { 125 | "kernelspec": { 126 | "display_name": "Python 2", 127 | "language": "python", 128 | "name": "python2" 129 | }, 130 | "language_info": { 131 | "codemirror_mode": { 132 | "name": "ipython", 133 | "version": 2 134 | }, 135 | "file_extension": ".py", 136 | "mimetype": "text/x-python", 137 | "name": "python", 138 | "nbconvert_exporter": "python", 139 | "pygments_lexer": "ipython2", 140 | "version": "2.7.6" 141 | } 142 | }, 143 | "nbformat": 4, 144 | "nbformat_minor": 0 145 | } 146 | -------------------------------------------------------------------------------- /notebooks/01.4 Training and Testing Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Training and Testing Data\n", 8 | "=====================================\n", 9 | "To evaluate how well our supervised models generalize, we can split our data into a training and a test set:\n", 10 | "\n", 11 | "\n" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 15, 17 | "metadata": { 18 | "collapsed": false 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "from sklearn.datasets import load_iris\n", 23 | "from sklearn.neighbors import KNeighborsClassifier\n", 24 | "\n", 25 | "iris = load_iris()\n", 26 | "X, y = iris.data, iris.target\n", 27 | "\n", 28 | "classifier = KNeighborsClassifier()" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "Thinking about how machine learning is normally performed, the idea of a train/test split makes sense. Real world systems train on the data they have, and as other data comes in (from customers, sensors, or other sources) the classifier that was trained must predict on fundamentally *new* data. We can simulate this during training using a train/test split - the test data is a simulation of \"future data\" which will come into the system during production. \n", 36 | "\n", 37 | "Specifically for iris, the labels in iris are sorted, which means that if we split the data using a proportional split, we will get all of specific labels (0 and 1) and very little of another (2). 
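To make that concrete, here is a small added check (not in the original notebook) showing that a contiguous 50% split of the sorted iris labels contains no examples of class 2 at all:

```python
# Added sketch: the iris labels are sorted, so a contiguous split is badly unbalanced.
import numpy as np
from sklearn.datasets import load_iris

y = load_iris().target
print(np.bincount(y[:75]))   # [50 25]    -> only classes 0 and 1 in the first half
print(np.bincount(y[75:]))   # [ 0 25 50] -> no class 0 at all in the second half
```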
We want to split as illustrated above, but *after* the data has been randomly shuffled." 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "y" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "To get an accurate simulation of the real world, we will shuffle our data then split." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": { 62 | "collapsed": false 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "import numpy as np\n", 67 | "rng = np.random.RandomState(0)\n", 68 | "\n", 69 | "permutation = rng.permutation(len(X))\n", 70 | "X, y = X[permutation], y[permutation]\n", 71 | "print(y)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "Now we need to split the data into training and testing. Luckily, this is a common pattern in machine learning and scikit-learn has a prebuilt function to split data into training and testing for you. Here we use 50% of the data as training, and 50% testing. 80% and 20% is another common split, but there are no hard and fast rules. The most important thing is to fairly evaluate your system on data it *has not* seen during training!" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": { 85 | "collapsed": false 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "from sklearn.cross_validation import train_test_split\n", 90 | "train_X, test_X, train_y, test_y = train_test_split(X, y, train_size=0.5, random_state=1999)\n", 91 | "print(\"Labels for training and testing data\")\n", 92 | "print(train_y)\n", 93 | "print(test_y)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "By evaluating our classifier performance on data that has been seen during training, we could get false confidence in the power of our system. This might lead to putting a system into production which *fails* at predicting new data! It is much better to use a train/test split in order to properly see how your trained model is doing on new data." 
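The following added sketch makes that point concrete: a 1-nearest-neighbor classifier looks perfect when scored on its own training data, while the held-out test set gives a more honest estimate (it reuses ``train_X``, ``train_y``, ``test_X`` and ``test_y`` from the split above):

```python
# Added sketch: scoring on the training data is misleadingly optimistic.
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(train_X, train_y)
print(knn.score(train_X, train_y))   # 1.0 -- every training point is its own nearest neighbor
print(knn.score(test_X, test_y))     # lower -- a fairer estimate of generalization
```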
101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "collapsed": false 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "classifier.fit(train_X, train_y)\n", 112 | "pred_y = classifier.predict(test_X)\n", 113 | "print(\"Fraction Correct\")\n", 114 | "print(np.sum(pred_y == test_y) / float(len(test_y)))" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "We can also visualize the correct and failed predictions" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": { 128 | "collapsed": false 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "%matplotlib inline\n", 133 | "import matplotlib.pyplot as plt\n", 134 | "import numpy as np\n", 135 | "correct_idx = np.where(pred_y == test_y)[0]\n", 136 | "print(correct_idx)\n", 137 | "incorrect_idx = np.where(pred_y != test_y)[0]\n", 138 | "print(incorrect_idx)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": { 145 | "collapsed": false 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "# Plot two dimensions\n", 150 | "colors = [\"darkblue\", \"darkgreen\", \"gray\"]\n", 151 | "for n, color in enumerate(colors):\n", 152 | " idx = np.where(test_y == n)[0]\n", 153 | " plt.scatter(test_X[idx, 0], test_X[idx, 1], color=color, label=\"Class %s\" % str(n))\n", 154 | "plt.scatter(test_X[incorrect_idx, 0], test_X[incorrect_idx, 1], color=\"darkred\")\n", 155 | "# Make xlim larger to accommodate legend\n", 156 | "plt.xlim(3, 9)\n", 157 | "plt.legend(loc=3)\n", 158 | "plt.title(\"Iris Classification results\")\n", 159 | "plt.show()" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "We can see that the errors occur in the area where green (class 1) and gray (class 2) overlap. This gives us insight about what features to add - any feature which helps separate class 1 and class 2 should improve classifier performance." 167 | ] 168 | } 169 | ], 170 | "metadata": { 171 | "kernelspec": { 172 | "display_name": "Python 2", 173 | "language": "python", 174 | "name": "python2" 175 | }, 176 | "language_info": { 177 | "codemirror_mode": { 178 | "name": "ipython", 179 | "version": 2 180 | }, 181 | "file_extension": ".py", 182 | "mimetype": "text/x-python", 183 | "name": "python", 184 | "nbconvert_exporter": "python", 185 | "pygments_lexer": "ipython2", 186 | "version": "2.7.6" 187 | } 188 | }, 189 | "nbformat": 4, 190 | "nbformat_minor": 0 191 | } 192 | -------------------------------------------------------------------------------- /notebooks/02.1 Supervised Learning - Classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import numpy as np" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "To visualize the workings of machine learning algorithms, it is often helpful to study two-dimensional or one-dimensional data, that is data with only one or two features. 
While in practice, datasets usually have many more features, it is hard to plot high-dimensional data on two-dimensional screens.\n", 21 | "\n", 22 | "We will illustrate some very simple examples before we move on to more \"real world\" data sets." 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "Classification\n", 30 | "========\n", 31 | "First, we will look at a two class classification problem in two dimensions. We use the synthetic data generated by the ``make_blobs`` function." 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "from sklearn.datasets import make_blobs\n", 43 | "X, y = make_blobs(centers=2, random_state=0)\n", 44 | "print(X.shape)\n", 45 | "print(y.shape)\n", 46 | "print(X[:5, :])\n", 47 | "print(y[:5])" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "As the data is two-dimensional, we can plot each sample as a point in two-dimensional space, with the first feature being the x-axis and the second feature being the y-axis." 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "collapsed": false 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "plt.scatter(X[:, 0], X[:, 1], c=y, s=40)\n", 66 | "plt.xlabel(\"first feature\")\n", 67 | "plt.ylabel(\"second feature\")" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "As classification is a supervised task, and we are interested in how well the model generalizes, we split our data into a training set,\n", 75 | "to built the model from, and a test-set, to evaluate how well our model performs on new data. The ``train_test_split`` function form the ``cross_validation`` module does that for us, by randomly splitting of 25% of the data for testing.\n", 76 | "\n" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": { 83 | "collapsed": true 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "from sklearn.cross_validation import train_test_split\n", 88 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "### The scikit-learn estimator API\n", 96 | "\n" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "Every algorithm is exposed in scikit-learn via an ''Estimator'' object. For instance a logistic regression is:" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "collapsed": false 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "from sklearn.linear_model import LogisticRegression" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "All models in scikit-learn have a very consistent interface.\n", 122 | "First, we instantiate the estimator object." 
123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": { 129 | "collapsed": false 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "classifier = LogisticRegression()" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "collapsed": false 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "X_train.shape" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": { 151 | "collapsed": false 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "y_train.shape" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "To built the model from our data, that is to learn how to classify new points, we call the ``fit`` function with the training data, and the corresponding training labels (the desired output for the training data point):" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": { 169 | "collapsed": false 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "classifier.fit(X_train, y_train)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "We can then apply the model to unseen data and use the model to predict the estimated outcome using the ``predict`` method:" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": { 187 | "collapsed": false 188 | }, 189 | "outputs": [], 190 | "source": [ 191 | "prediction = classifier.predict(X_test)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "We can compare these against the true labels:" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": { 205 | "collapsed": false 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "print(prediction)\n", 210 | "print(y_test)" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "We can evaluate our classifier quantitatively by measuring what fraction of predictions is correct. 
This is called **accuracy**:" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": { 224 | "collapsed": false 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "np.mean(prediction == y_test)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "There is also a convenience function , ``score``, that all scikit-learn classifiers have to compute this directly from the test data:\n", 236 | " " 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": { 243 | "collapsed": false 244 | }, 245 | "outputs": [], 246 | "source": [ 247 | "classifier.score(X_test, y_test)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "It is often helpful to compare the generalization performance (on the test set) to the performance on the training set:" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": { 261 | "collapsed": false 262 | }, 263 | "outputs": [], 264 | "source": [ 265 | "classifier.score(X_train, y_train)" 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": {}, 271 | "source": [ 272 | "LogisticRegression is a so-called linear model,\n", 273 | "that means it will create a decision that is linear in the input space. In 2d, this simply means it finds a line to separate the blue from the red:" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": { 280 | "collapsed": false 281 | }, 282 | "outputs": [], 283 | "source": [ 284 | "from figures import plot_2d_separator\n", 285 | "\n", 286 | "plt.scatter(X[:, 0], X[:, 1], c=y, s=40)\n", 287 | "plt.xlabel(\"first feature\")\n", 288 | "plt.ylabel(\"second feature\")\n", 289 | "plot_2d_separator(classifier, X)" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "**Estimated parameters**: All the estimated parameters are attributes of the estimator object ending by an underscore. Here, these are the coefficients and the offset of the line:" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": { 303 | "collapsed": false 304 | }, 305 | "outputs": [], 306 | "source": [ 307 | "print(classifier.coef_)\n", 308 | "print(classifier.intercept_)" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": {}, 314 | "source": [ 315 | "Another classifier: K Nearest Neighbors\n", 316 | "------------------------------------------------\n", 317 | "Another popular and easy to understand classifier is K nearest neighbors (kNN). It has one of the simplest learning strategies: given a new, unknown observation, look up in your reference database which ones have the closest features and assign the predominant class.\n", 318 | "\n", 319 | "The interface is exactly the same as for ``LogisticRegression above``." 
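To connect the estimated parameters of the LogisticRegression above to the plotted line: the decision boundary is the set of points where ``coef_[0] * x0 + coef_[1] * x1 + intercept_ = 0``, and the sign of that expression determines the predicted class. A small added sketch (for this two-class problem with labels 0 and 1) checks this against ``predict``:

```python
# Added sketch: reproduce the LogisticRegression predictions from coef_ and intercept_.
import numpy as np

scores = np.dot(X_test, classifier.coef_.ravel()) + classifier.intercept_
manual_prediction = (scores > 0).astype(int)   # positive side of the line -> class 1
print(np.all(manual_prediction == classifier.predict(X_test)))   # expected: True
```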
320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": { 326 | "collapsed": false 327 | }, 328 | "outputs": [], 329 | "source": [ 330 | "from sklearn.neighbors import KNeighborsClassifier" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": {}, 336 | "source": [ 337 | "This time we set a parameter of the KNeighborsClassifier to tell it we only want to look at one nearest neighbor:" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": { 344 | "collapsed": false 345 | }, 346 | "outputs": [], 347 | "source": [ 348 | "knn = KNeighborsClassifier(n_neighbors=1)" 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": {}, 354 | "source": [ 355 | "We fit the model with out training data" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": { 362 | "collapsed": false 363 | }, 364 | "outputs": [], 365 | "source": [ 366 | "knn.fit(X_train, y_train)" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": { 373 | "collapsed": false 374 | }, 375 | "outputs": [], 376 | "source": [ 377 | "plt.scatter(X[:, 0], X[:, 1], c=y, s=40)\n", 378 | "plt.xlabel(\"first feature\")\n", 379 | "plt.ylabel(\"second feature\")\n", 380 | "plot_2d_separator(knn, X)" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": null, 386 | "metadata": { 387 | "collapsed": false 388 | }, 389 | "outputs": [], 390 | "source": [ 391 | "knn.score(X_test, y_test)" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": {}, 397 | "source": [ 398 | "Exercise\n", 399 | "=========\n", 400 | "Apply the KNeighborsClassifier to the ``iris`` dataset. Play with different values of the ``n_neighbors`` and observe how training and test score change." 401 | ] 402 | } 403 | ], 404 | "metadata": { 405 | "kernelspec": { 406 | "display_name": "Python 2", 407 | "language": "python", 408 | "name": "python2" 409 | }, 410 | "language_info": { 411 | "codemirror_mode": { 412 | "name": "ipython", 413 | "version": 2 414 | }, 415 | "file_extension": ".py", 416 | "mimetype": "text/x-python", 417 | "name": "python", 418 | "nbconvert_exporter": "python", 419 | "pygments_lexer": "ipython2", 420 | "version": "2.7.6" 421 | } 422 | }, 423 | "nbformat": 4, 424 | "nbformat_minor": 0 425 | } 426 | -------------------------------------------------------------------------------- /notebooks/02.2 Supervised Learning - Regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import numpy as np" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "Regression\n", 21 | "==========\n", 22 | "In regression we try to predict a continuous output variable. This can be most easily visualized in one dimension.\n", 23 | "We will start with a very simple toy example. 
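Before moving on to regression, here is one possible (unofficial) sketch for the classification exercise above: loop over several ``n_neighbors`` values on the iris data and compare training and test accuracy.

```python
# Added sketch for the preceding exercise -- not an official solution.
# The import path matches the scikit-learn version used in this tutorial.
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)

for n_neighbors in [1, 3, 5, 10, 20]:
    knn = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)
    print(n_neighbors, knn.score(X_train, y_train), knn.score(X_test, y_test))
```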
We will create a dataset out of a sinus curve with some noise:" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "x = np.linspace(-3, 3, 100)\n", 35 | "print(x)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": { 42 | "collapsed": false 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "rng = np.random.RandomState(42)\n", 47 | "y = np.sin(4 * x) + x + rng.uniform(size=len(x))" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": { 54 | "collapsed": false 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "plt.plot(x, y, 'o')" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "Linear Regression\n", 66 | "=================\n", 67 | "One of the simplest models again is a linear one, that simply tries to predict the data as lying on a line. One way to find such a line is LinearRegression (also known as ordinary least squares).\n", 68 | "The interface for LinearRegression is exactly the same as for the classifiers before, only that ``y`` now contains float values, instead of classes." 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "To apply a scikit-learn model, we need to make X be a 2d-array:" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": { 82 | "collapsed": false 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "print(x.shape)\n", 87 | "X = x[:, np.newaxis]\n", 88 | "print(X.shape)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "We split our data in a training and a test set again:" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": { 102 | "collapsed": false 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "from sklearn.cross_validation import train_test_split\n", 107 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "Then we can built our regression model:" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": { 121 | "collapsed": false 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "from sklearn.linear_model import LinearRegression\n", 126 | "regressor = LinearRegression()\n", 127 | "regressor.fit(X_train, y_train)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "And predict. First let us try the training set:" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "collapsed": true 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "y_pred_train = regressor.predict(X_train)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": { 152 | "collapsed": false 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "plt.plot(X_train, y_train, 'o', label=\"data\")\n", 157 | "plt.plot(X_train, y_pred_train, 'o', label=\"prediction\")\n", 158 | "plt.legend(loc='best')" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "The line is able to capture the general slope of the data, but not many details." 
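To see exactly which line was fit, the learned slope and offset are stored on the estimator; a small added sketch (reusing ``regressor``, ``y_train`` and ``y_pred_train`` from above):

```python
# Added sketch: inspect the fitted line y = coef_ * x + intercept_ and
# quantify the training fit with the mean squared error.
import numpy as np

print(regressor.coef_, regressor.intercept_)    # slope and offset of the fitted line
print(np.mean((y_train - y_pred_train) ** 2))   # mean squared error on the training set
```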
166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "Let's try the test set:" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": { 179 | "collapsed": true 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "y_pred_test = regressor.predict(X_test)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": { 190 | "collapsed": false 191 | }, 192 | "outputs": [], 193 | "source": [ 194 | "plt.plot(X_test, y_test, 'o', label=\"data\")\n", 195 | "plt.plot(X_test, y_pred_test, 'o', label=\"prediction\")\n", 196 | "plt.legend(loc='best')" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "Again, scikit-learn provides an easy way to evaluate the prediction quantitatively using the ``score`` method. For regression tasks, this is the R2 score. Another popular way would be the mean squared error." 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": { 210 | "collapsed": false 211 | }, 212 | "outputs": [], 213 | "source": [ 214 | "regressor.score(X_test, y_test)" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "KNeighborsRegression\n", 222 | "=======================\n", 223 | "As for classification, we can also use a neighbor based method for regression. We can simply take the output of the nearest point, or we could average several nearest points. This method is less popular for regression than for classification, but still a good baseline." 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": { 230 | "collapsed": false 231 | }, 232 | "outputs": [], 233 | "source": [ 234 | "from sklearn.neighbors import KNeighborsRegressor\n", 235 | "kneighbor_regression = KNeighborsRegressor(n_neighbors=1)\n", 236 | "kneighbor_regression.fit(X_train, y_train)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "Again, let us look at the behavior on training and test set:" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": { 250 | "collapsed": false 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "y_pred_train = kneighbor_regression.predict(X_train)\n", 255 | "\n", 256 | "plt.plot(X_train, y_train, 'o', label=\"data\")\n", 257 | "plt.plot(X_train, y_pred_train, 'o', label=\"prediction\")\n", 258 | "plt.legend(loc='best')" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "On the training set, we do a perfect job: each point is its own nearest neighbor!" 
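That perfect training fit can be confirmed numerically with an added one-line check; the R2 score on the training set should come out as exactly 1.0 here:

```python
# Added check: 1-nearest-neighbor regression reproduces the training targets exactly,
# because each training point is its own nearest neighbor.
print(kneighbor_regression.score(X_train, y_train))   # expected: 1.0
```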
266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": { 272 | "collapsed": false 273 | }, 274 | "outputs": [], 275 | "source": [ 276 | "y_pred_test = kneighbor_regression.predict(X_test)\n", 277 | "\n", 278 | "plt.plot(X_test, y_test, 'o', label=\"data\")\n", 279 | "plt.plot(X_test, y_pred_test, 'o', label=\"prediction\")\n", 280 | "plt.legend(loc='best')" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": {}, 286 | "source": [ 287 | "On the test set, we also do a better job of capturing the variation, but our estimates look much more messy then before.\n", 288 | "Let us look at the R2 score:" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "metadata": { 295 | "collapsed": false 296 | }, 297 | "outputs": [], 298 | "source": [ 299 | "kneighbor_regression.score(X_test, y_test)" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "Much better then before! Here, the linear model was not a good fit for our problem." 307 | ] 308 | }, 309 | { 310 | "cell_type": "markdown", 311 | "metadata": {}, 312 | "source": [ 313 | "Exercise\n", 314 | "=========\n", 315 | "Compare the KNeighborsRegressor and LinearRegression on the boston housing dataset. You can load the dataset using ``sklearn.datasets.load_boston``." 316 | ] 317 | } 318 | ], 319 | "metadata": { 320 | "kernelspec": { 321 | "display_name": "Python 2", 322 | "language": "python", 323 | "name": "python2" 324 | }, 325 | "language_info": { 326 | "codemirror_mode": { 327 | "name": "ipython", 328 | "version": 2 329 | }, 330 | "file_extension": ".py", 331 | "mimetype": "text/x-python", 332 | "name": "python", 333 | "nbconvert_exporter": "python", 334 | "pygments_lexer": "ipython2", 335 | "version": "2.7.6" 336 | } 337 | }, 338 | "nbformat": 4, 339 | "nbformat_minor": 0 340 | } 341 | -------------------------------------------------------------------------------- /notebooks/02.3 Unsupervised Learning - Transformations and Dimensionality Reduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import numpy as np" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# Unsupervised Learning\n" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "Many instances of unsupervised learning, such as dimensionality reduction, manifold learning and feature extraction, find a new representation of the input data without any additional input.\n", 28 | "\n", 29 | "\n", 30 | "\n", 31 | "The most simple example of this, which can barely be called learning, is rescaling the data to have zero mean and unit variance. 
This is a helpful preprocessing step for many machine learning models.\n", 32 | "\n", 33 | "Applying such a preprocessing has a very similar interface to the supervised learning algorithms we saw so far.\n", 34 | "Let's load the iris dataset and rescale it:" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": { 41 | "collapsed": false 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "from sklearn.datasets import load_iris\n", 46 | "\n", 47 | "iris = load_iris()\n", 48 | "X, y = iris.data, iris.target\n", 49 | "print(X.shape)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "The iris dataset is not \"centered\" that is it has non-zero mean and the standard deviation is different for each component:\n" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "collapsed": false 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "print(\"mean : %s \" % X.mean(axis=0))\n", 68 | "print(\"standard deviation : %s \" % X.std(axis=0))" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "To use a preprocessing method, we first import the estimator, here StandardScaler and instantiate it:\n", 76 | " " 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": { 83 | "collapsed": true 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "from sklearn.preprocessing import StandardScaler\n", 88 | "scaler = StandardScaler()" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "As with the classification and regression algorithms, we call ``fit`` to learn the model from the data. As this is an unsupervised model, we only pass ``X``, not ``y``. This simply estimates mean and standard deviation." 
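Once ``fit`` has been called (as in the next cell), the estimated statistics are stored as attributes on the scaler, and ``fit_transform`` combines both steps. Here is a minimal sketch using a throwaway scaler (``scaler_demo`` is just an illustrative name, not part of the tutorial):

```python
# A minimal sketch: fit, inspect the learned statistics, and transform.
from sklearn.preprocessing import StandardScaler

scaler_demo = StandardScaler().fit(X)
print("estimated per-feature mean: %s" % scaler_demo.mean_)

X_demo = scaler_demo.transform(X)
print(np.allclose(X_demo.mean(axis=0), 0))  # mean is (numerically) zero
print(np.allclose(X_demo.std(axis=0), 1))   # unit standard deviation
```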
96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": { 102 | "collapsed": false 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "scaler.fit(X)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "Now we can rescale our data by applying the ``transform`` (not ``predict``) method:" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": { 120 | "collapsed": true 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "X_scaled = scaler.transform(X)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "``X_scaled`` has the same number of samples and features, but the mean was subtracted and all features were scaled to have unit standard deviation:" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": { 138 | "collapsed": false 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "print(X_scaled.shape)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": { 149 | "collapsed": false 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "print(\"mean : %s \" % X_scaled.mean(axis=0))\n", 154 | "print(\"standard deviation : %s \" % X_scaled.std(axis=0))" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "Principal Component Analysis\n", 162 | "============================" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "An unsupervised transformation that is somewhat more interesting is Principle Component Analysis (PCA).\n", 170 | "It is a technique to reduce the dimensionality of the data, by creating a linear projection.\n", 171 | "That is, we find new features to represent the data that are a linear combination of the old data (i.e. we rotate it).\n", 172 | "\n", 173 | "The way PCA finds these new directions is by looking for the directions of maximum variance.\n", 174 | "Usually only few components that explain most of the variance in the data are kept. To illustrate how a rotation might look like, we first show it on two dimensional data and keep both principal components.\n", 175 | "\n", 176 | "We create a Gaussian blob that is rotated:" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": { 183 | "collapsed": false 184 | }, 185 | "outputs": [], 186 | "source": [ 187 | "rnd = np.random.RandomState(5)\n", 188 | "X_ = rnd.normal(size=(300, 2))\n", 189 | "X_blob = np.dot(X_, rnd.normal(size=(2, 2))) + rnd.normal(size=2)\n", 190 | "y = X_[:, 0] > 0\n", 191 | "plt.scatter(X_blob[:, 0], X_blob[:, 1], c=y, linewidths=0, s=30)\n", 192 | "plt.xlabel(\"feature 1\")\n", 193 | "plt.ylabel(\"feature 2\")" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "As always, we instantiate our PCA model. By default all directions are kept." 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": { 207 | "collapsed": false 208 | }, 209 | "outputs": [], 210 | "source": [ 211 | "from sklearn.decomposition import PCA\n", 212 | "pca = PCA()" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "Then we fit the PCA model with our data. As PCA is an unsupervised algorithm, there is no output ``y``." 
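After fitting (done for the tutorial's ``pca`` object in the next cell), the learned directions and the variance they explain are exposed as ``components_`` and ``explained_variance_ratio_``. The sketch below fits a throwaway copy (``pca_demo`` is only an illustration) just to show what these attributes look like:

```python
# Inspect what PCA learns from the rotated blob
# (a sketch on a throwaway model; the tutorial fits `pca` next).
from sklearn.decomposition import PCA

pca_demo = PCA().fit(X_blob)
print("components (rows are directions):\n%s" % pca_demo.components_)
print("explained variance ratio: %s" % pca_demo.explained_variance_ratio_)
```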
220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": { 226 | "collapsed": false 227 | }, 228 | "outputs": [], 229 | "source": [ 230 | "pca.fit(X_blob)" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "Then we can transform the data, projected on the principal components:" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": { 244 | "collapsed": false 245 | }, 246 | "outputs": [], 247 | "source": [ 248 | "X_pca = pca.transform(X_blob)\n", 249 | "\n", 250 | "plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, linewidths=0, s=30)\n", 251 | "plt.xlabel(\"first principal component\")\n", 252 | "plt.ylabel(\"second principal component\")" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "On the left of the plot you can see the four points that were on the top right before. PCA found fit first component to be along the diagonal, and the second to be perpendicular to it. As PCA finds a rotation, the principal components are always at right angles to each other." 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "Dimensionality Reduction for Visualization with PCA\n", 267 | "-------------------------------------------------------------\n", 268 | "Consider the digits dataset. It cannot be visualized in a single 2D plot, as it has 64 features. We are going to extract 2 dimensions to visualize it in, using the example from the sklearn examples [here](http://scikit-learn.org/stable/auto_examples/manifold/plot_lle_digits.html)" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": { 275 | "collapsed": false 276 | }, 277 | "outputs": [], 278 | "source": [ 279 | "from figures.plot_digits_datasets import digits_plot\n", 280 | "\n", 281 | "digits_plot()" 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "metadata": {}, 287 | "source": [ 288 | "Note that this projection was determined *without* any information about the\n", 289 | "labels (represented by the colors): this is the sense in which the learning\n", 290 | "is **unsupervised**. Nevertheless, we see that the projection gives us insight\n", 291 | "into the distribution of the different digits in parameter space." 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": {}, 297 | "source": [ 298 | "## Manifold Learning\n", 299 | "\n", 300 | "One weakness of PCA is that it cannot detect non-linear features. A set\n", 301 | "of algorithms known as *Manifold Learning* have been developed to address\n", 302 | "this deficiency. 
A canonical dataset used in Manifold learning is the\n", 303 | "*S-curve*, which we briefly saw in an earlier section:" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": { 310 | "collapsed": false 311 | }, 312 | "outputs": [], 313 | "source": [ 314 | "from sklearn.datasets import make_s_curve\n", 315 | "X, y = make_s_curve(n_samples=1000)\n", 316 | "\n", 317 | "from mpl_toolkits.mplot3d import Axes3D\n", 318 | "ax = plt.axes(projection='3d')\n", 319 | "\n", 320 | "ax.scatter3D(X[:, 0], X[:, 1], X[:, 2], c=y)\n", 321 | "ax.view_init(10, -60)" 322 | ] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "metadata": {}, 327 | "source": [ 328 | "This is a 2-dimensional dataset embedded in three dimensions, but it is embedded\n", 329 | "in such a way that PCA cannot discover the underlying data orientation:" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": { 336 | "collapsed": false 337 | }, 338 | "outputs": [], 339 | "source": [ 340 | "X_pca = PCA(n_components=2).fit_transform(X)\n", 341 | "plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y)" 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "metadata": {}, 347 | "source": [ 348 | "Manifold learning algorithms, however, available in the ``sklearn.manifold``\n", 349 | "submodule, are able to recover the underlying 2-dimensional manifold:" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": null, 355 | "metadata": { 356 | "collapsed": false 357 | }, 358 | "outputs": [], 359 | "source": [ 360 | "from sklearn.manifold import Isomap\n", 361 | "\n", 362 | "iso = Isomap(n_neighbors=15, n_components=2)\n", 363 | "X_iso = iso.fit_transform(X)\n", 364 | "plt.scatter(X_iso[:, 0], X_iso[:, 1], c=y)" 365 | ] 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "metadata": {}, 370 | "source": [ 371 | "##Exercise\n", 372 | "Compare the results of Isomap and PCA on a 5-class subset of the digits dataset (``load_digits(5)``).\n", 373 | "\n", 374 | "__Bonus__: Also compare to TSNE, another popular manifold learning technique." 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "metadata": { 381 | "collapsed": true 382 | }, 383 | "outputs": [], 384 | "source": [ 385 | "from sklearn.datasets import load_digits\n", 386 | "\n", 387 | "digits = load_digits(5)\n", 388 | "\n", 389 | "X = digits.data\n", 390 | "# ..." 
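One possible way to approach this exercise is sketched below (it is not an official solution): project the 5-class digits data with PCA, Isomap and TSNE side by side and compare the resulting 2D embeddings. TSNE is assumed to be available in ``sklearn.manifold``, which is the case in recent scikit-learn versions.

```python
# A sketch for the exercise: compare PCA, Isomap and TSNE on 5-class digits.
from sklearn.decomposition import PCA
from sklearn.manifold import Isomap, TSNE

projections = [("PCA", PCA(n_components=2)),
               ("Isomap", Isomap(n_neighbors=15, n_components=2)),
               ("TSNE", TSNE(n_components=2, random_state=42))]

fig, axes = plt.subplots(1, 3, figsize=(12, 4))
for ax, (name, model) in zip(axes, projections):
    X_proj = model.fit_transform(X)
    ax.scatter(X_proj[:, 0], X_proj[:, 1], c=digits.target)
    ax.set_title(name)
```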
391 | ] 392 | } 393 | ], 394 | "metadata": { 395 | "kernelspec": { 396 | "display_name": "Python 2", 397 | "language": "python", 398 | "name": "python2" 399 | }, 400 | "language_info": { 401 | "codemirror_mode": { 402 | "name": "ipython", 403 | "version": 2 404 | }, 405 | "file_extension": ".py", 406 | "mimetype": "text/x-python", 407 | "name": "python", 408 | "nbconvert_exporter": "python", 409 | "pygments_lexer": "ipython2", 410 | "version": "2.7.10" 411 | } 412 | }, 413 | "nbformat": 4, 414 | "nbformat_minor": 0 415 | } 416 | -------------------------------------------------------------------------------- /notebooks/02.4 Unsupervised Learning - Clustering.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import numpy as np" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# Clustering" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "Clustering is the task of gathering samples into groups of similar\n", 28 | "samples according to some predefined similarity or dissimilarity\n", 29 | "measure (such as the Euclidean distance).\n", 30 | "In this section we will explore a basic clustering task on some synthetic and real datasets.\n", 31 | "\n", 32 | "Here are some common applications of clustering algorithms:\n", 33 | "\n", 34 | "- Compression, in a data reduction sens\n", 35 | "- Can be used as a preprocessing step for recommender systems\n", 36 | "- Similarly:\n", 37 | " - grouping related web news (e.g. Google News) and web search results\n", 38 | " - grouping related stock quotes for investment portfolio management\n", 39 | " - building customer profiles for market analysis\n", 40 | "- Building a code book of prototype samples for unsupervised feature extraction\n", 41 | "\n" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "Let's start of with a very simple and obvious example:" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "from sklearn.datasets import make_blobs\n", 60 | "X, y = make_blobs(random_state=42)\n", 61 | "X.shape" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": { 68 | "collapsed": false 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "plt.scatter(X[:, 0], X[:, 1])" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "There are clearly three separate groups of points in the data, and we would like to recover them using clustering.\n", 80 | "Even if the groups are obvious in the data, it is hard to find them when the data lives in a high-dimensional space." 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "Now we will use one of the simplest clustering algorithms, K-means.\n", 88 | "This is an iterative algorithm which searches for three cluster\n", 89 | "centers such that the distance from each point to its cluster is\n", 90 | "minimized.\n", 91 | "**Question:** what would you expect the output to look like?" 
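As a peek ahead (the tutorial builds this up step by step in the following cells), one way to answer the question is to fit K-means directly and overlay the learned cluster centers on the data. This sketch uses its own ``km`` object so it does not interfere with the cells below:

```python
# A peek ahead: fit K-means and show the learned cluster centers.
from sklearn.cluster import KMeans

km = KMeans(n_clusters=3, random_state=42)
km.fit(X)

plt.scatter(X[:, 0], X[:, 1], c=km.labels_)
plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
            marker='x', s=100, linewidths=3, color='black')
```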
92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": { 98 | "collapsed": false 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "from sklearn.cluster import KMeans\n", 103 | "\n", 104 | "kmeans = KMeans(n_clusters=3, random_state=42)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "We can get the cluster labels either by calling fit and then accessing the \n", 112 | "``labels_`` attribute of the K means estimator, or by calling ``fit_predict``.\n", 113 | "Either way, the result contains the ID of the cluster that each point is assigned to." 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": { 120 | "collapsed": false 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "labels = kmeans.fit_predict(X)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": { 131 | "collapsed": false 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "labels" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": { 142 | "collapsed": false 143 | }, 144 | "outputs": [], 145 | "source": [ 146 | "all(labels == kmeans.labels_)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "Let's visualize the assignments that have been found" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": { 160 | "collapsed": false 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "plt.scatter(X[:, 0], X[:, 1], c=labels)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "Here, we are probably satisfied with the clustering. But in general we might want to have a more quantitative evaluation. How about we compare our cluster labels with the ground truth we got when generating the blobs?" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": { 178 | "collapsed": false 179 | }, 180 | "outputs": [], 181 | "source": [ 182 | "from sklearn.metrics import confusion_matrix, accuracy_score\n", 183 | "print(accuracy_score(y, labels))\n", 184 | "print(confusion_matrix(y, labels))\n" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "collapsed": false 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "np.mean(y == labels)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "Even though we recovered the partitioning of the data into clusters perfectly, the cluster IDs we assigned were arbitrary,\n", 203 | "and we can not hope to recover them. Therefore, we must use a different scoring metric, such as ``adjusted_rand_score``, which is invariant to permutations of the labels:" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": { 210 | "collapsed": false 211 | }, 212 | "outputs": [], 213 | "source": [ 214 | "from sklearn.metrics import adjusted_rand_score\n", 215 | "adjusted_rand_score(y, labels)" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "**Clustering comes with assumptions**: A clustering algorithm finds clusters by making assumptions with samples should be grouped together. 
Each algorithm makes different assumptions and the quality and interpretability of your results will depend on whether the assumptions are satisfied for your goal. For K-means clustering, the model is that all clusters have equal, spherical variance.\n", 223 | "\n", 224 | "**In general, there is no guarantee that structure found by a clustering algorithm has anything to do with what you were interested in**." 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "We can easily create a dataset that has non-isotropic clusters, on which kmeans will fail:" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": { 238 | "collapsed": false 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "from sklearn.datasets import make_blobs\n", 243 | "\n", 244 | "X, y = make_blobs(random_state=170, n_samples=600)\n", 245 | "rng = np.random.RandomState(74)\n", 246 | "\n", 247 | "transformation = rng.normal(size=(2, 2))\n", 248 | "X = np.dot(X, transformation)\n", 249 | "\n", 250 | "y_pred = KMeans(n_clusters=3).fit_predict(X)\n", 251 | "\n", 252 | "plt.scatter(X[:, 0], X[:, 1], c=y_pred)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "## Some Notable Clustering Routines" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "The following are two well-known clustering algorithms. \n", 267 | "\n", 268 | "- `sklearn.cluster.KMeans`:
\n", 269 | " The simplest, yet effective clustering algorithm. Needs to be provided with the\n", 270 | " number of clusters in advance, and assumes that the data is normalized as input\n", 271 | " (but use a PCA model as preprocessor).\n", 272 | "- `sklearn.cluster.MeanShift`:
\n", 273 | " Can find better looking clusters than KMeans but is not scalable to high number of samples.\n", 274 | "- `sklearn.cluster.DBSCAN`:
\n", 275 | " Can detect irregularly shaped clusters based on density, i.e. sparse regions in\n", 276 | " the input space are likely to become inter-cluster boundaries. Can also detect\n", 277 | " outliers (samples that are not part of a cluster).\n", 278 | "- `sklearn.cluster.AffinityPropagation`:
\n", 279 | " Clustering algorithm based on message passing between data points.\n", 280 | "- `sklearn.cluster.SpectralClustering`:
\n", 281 | " KMeans applied to a projection of the normalized graph Laplacian: finds\n", 282 | " normalized graph cuts if the affinity matrix is interpreted as an adjacency matrix of a graph.\n", 283 | "- `sklearn.cluster.Ward`:
\n", 284 | " Ward implements hierarchical clustering based on the Ward algorithm,\n", 285 | " a variance-minimizing approach. At each step, it minimizes the sum of\n", 286 | " squared differences within all clusters (inertia criterion).\n", 287 | "\n", 288 | "Of these, Ward, SpectralClustering, DBSCAN and Affinity propagation can also work with precomputed similarity matrices." 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": {}, 294 | "source": [ 295 | "" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": {}, 301 | "source": [ 302 | "## Exercise: digits clustering" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "Perform K-means clustering on the digits data, searching for ten clusters.\n", 310 | "Visualize the cluster centers as images (i.e. reshape each to 8x8 and use\n", 311 | "``plt.imshow``) Do the clusters seem to be correlated with particular digits? What is the ``adjusted_rand_score``?\n", 312 | "\n", 313 | "Visualize the projected digits as in the last notebook, but this time use the\n", 314 | "cluster labels as the color. What do you notice?" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "metadata": { 321 | "collapsed": false 322 | }, 323 | "outputs": [], 324 | "source": [ 325 | "from sklearn.datasets import load_digits\n", 326 | "digits = load_digits()\n", 327 | "# ..." 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": { 334 | "collapsed": false 335 | }, 336 | "outputs": [], 337 | "source": [ 338 | "# %load solutions/08B_digits_clustering.py" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": null, 344 | "metadata": { 345 | "collapsed": true 346 | }, 347 | "outputs": [], 348 | "source": [] 349 | } 350 | ], 351 | "metadata": { 352 | "kernelspec": { 353 | "display_name": "Python 2", 354 | "language": "python", 355 | "name": "python2" 356 | }, 357 | "language_info": { 358 | "codemirror_mode": { 359 | "name": "ipython", 360 | "version": 2 361 | }, 362 | "file_extension": ".py", 363 | "mimetype": "text/x-python", 364 | "name": "python", 365 | "nbconvert_exporter": "python", 366 | "pygments_lexer": "ipython2", 367 | "version": "2.7.9" 368 | } 369 | }, 370 | "nbformat": 4, 371 | "nbformat_minor": 0 372 | } 373 | -------------------------------------------------------------------------------- /notebooks/02.5 Review of Scikit-learn API.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "\n", 10 | "### A recap on Scikit-learn's estimator interface\n", 11 | "Scikit-learn strives to have a uniform interface across all methods. Given a scikit-learn *estimator*\n", 12 | "object named `model`, the following methods are available (not all for each model):\n", 13 | "\n", 14 | "- Available in **all Estimators**\n", 15 | " + `model.fit()` : fit training data. For supervised learning applications,\n", 16 | " this accepts two arguments: the data `X` and the labels `y` (e.g. `model.fit(X, y)`).\n", 17 | " For unsupervised learning applications, ``fit`` takes only a single argument,\n", 18 | " the data `X` (e.g. 
`model.fit(X)`).\n", 19 | "- Available in **supervised estimators**\n", 20 | " + `model.predict()` : given a trained model, predict the label of a new set of data.\n", 21 | " This method accepts one argument, the new data `X_new` (e.g. `model.predict(X_new)`),\n", 22 | " and returns the learned label for each object in the array.\n", 23 | " + `model.predict_proba()` : For classification problems, some estimators also provide\n", 24 | " this method, which returns the probability that a new observation has each categorical label.\n", 25 | " In this case, the label with the highest probability is returned by `model.predict()`.\n", 26 | " + `model.decision_function()` : For classification problems, some estimators provide an uncertainty estimate that is not a probability. For binary classification, a decision_function >= 0 means the positive class will be predicted, while < 0 means the negative class.\n", 27 | " + `model.score()` : for classification or regression problems, most (all?) estimators implement\n", 28 | " a score method. Scores are between 0 and 1, with a larger score indicating a better fit.\n", 29 | " + `model.transform()` : For feature selection algorithms, this will reduce the dataset to the selected features. For some classification and regression models such as some linear models and random forests, this method reduces the dataset to the most informative features. These classification and regression models can therefor also be used as feature selection methods.\n", 30 | " \n", 31 | "- Available in **unsupervised estimators**\n", 32 | " + `model.transform()` : given an unsupervised model, transform new data into the new basis.\n", 33 | " This also accepts one argument `X_new`, and returns the new representation of the data based\n", 34 | " on the unsupervised model.\n", 35 | " + `model.fit_transform()` : some estimators implement this method,\n", 36 | " which more efficiently performs a fit and a transform on the same input data.\n", 37 | " + `model.predict()` : for clustering algorithms, the predict method will produce cluster labels for new data points. Not all clustering methods have this functionality.\n", 38 | " + `model.predict_proba()` : Gaussian mixture models (GMMs) provide the probability for each point to be generated by a given mixture component.\n", 39 | " + `model.score()` : Density models like KDE and GMMs provide the likelihood of the data under the model." 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "source": [ 48 | "Apart from ``fit``, the two most important functions are arguably ``predict`` to produce a target variable (a ``y``) ``transform``, which produces a new representation of the data (an ``X``).\n", 49 | "The following table shows for which class of models which function applies:\n", 50 | "\n" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "\n", 58 | "\n", 59 | "\n", 60 | "\n", 61 | "\n", 62 | "\n", 63 | "\n", 64 | "
| ``model.predict`` | ``model.transform`` |
|---|---|
| Classification | Preprocessing |
| Regression | Dimensionality Reduction |
| Clustering | Feature Extraction |
|   | Feature selection |
\n", 65 | "\n", 66 | "\n" 67 | ] 68 | } 69 | ], 70 | "metadata": { 71 | "kernelspec": { 72 | "display_name": "Python 2", 73 | "language": "python", 74 | "name": "python2" 75 | }, 76 | "language_info": { 77 | "codemirror_mode": { 78 | "name": "ipython", 79 | "version": 2 80 | }, 81 | "file_extension": ".py", 82 | "mimetype": "text/x-python", 83 | "name": "python", 84 | "nbconvert_exporter": "python", 85 | "pygments_lexer": "ipython2", 86 | "version": "2.7.9" 87 | } 88 | }, 89 | "nbformat": 4, 90 | "nbformat_minor": 0 91 | } 92 | -------------------------------------------------------------------------------- /notebooks/03.1 Case Study - Supervised Classification of Handwritten Digits.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Supervised Learning: Classification of Handwritten Digits" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In this section we'll apply scikit-learn to the classification of handwritten\n", 15 | "digits. This will go a bit beyond the iris classification we saw before: we'll\n", 16 | "discuss some of the metrics which can be used in evaluating the effectiveness\n", 17 | "of a classification model.\n", 18 | "\n", 19 | "We'll work with the handwritten digits dataset which we saw in an earlier\n", 20 | "section of the tutorial." 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "collapsed": false 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "from sklearn.datasets import load_digits\n", 32 | "digits = load_digits()" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "We'll re-use some of our code from before to visualize the data and remind us what\n", 40 | "we're looking at:" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": { 47 | "collapsed": false 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "%matplotlib inline\n", 52 | "import matplotlib.pyplot as plt" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": { 59 | "collapsed": false 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "fig = plt.figure(figsize=(6, 6)) # figure size in inches\n", 64 | "fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)\n", 65 | "\n", 66 | "# plot the digits: each image is 8x8 pixels\n", 67 | "for i in range(64):\n", 68 | " ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])\n", 69 | " ax.imshow(digits.images[i], cmap=plt.cm.binary, interpolation='nearest')\n", 70 | " \n", 71 | " # label the image with the target value\n", 72 | " ax.text(0, 7, str(digits.target[i]))" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## Visualizing the Data" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "A good first-step for many problems is to visualize the data using one of the\n", 87 | "*Dimensionality Reduction* techniques we saw earlier. We'll start with the\n", 88 | "most straightforward one, Principal Component Analysis (PCA).\n", 89 | "\n", 90 | "PCA seeks orthogonal linear combinations of the features which show the greatest\n", 91 | "variance, and as such, can help give you a good idea of the structure of the\n", 92 | "data set. Here we'll use `RandomizedPCA`, because it's faster for large `N`." 
93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": { 99 | "collapsed": false 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "from sklearn.decomposition import RandomizedPCA\n", 104 | "pca = RandomizedPCA(n_components=2, random_state=1999)\n", 105 | "proj = pca.fit_transform(digits.data)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": { 112 | "collapsed": false 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "plt.scatter(proj[:, 0], proj[:, 1], c=digits.target)\n", 117 | "plt.colorbar()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "Here we see that the digits do cluster fairly well, so we can expect even\n", 125 | "a fairly naive classification scheme to do a decent job separating them.\n", 126 | "\n", 127 | "A weakness of PCA is that it produces a linear dimensionality reduction:\n", 128 | "this may miss some interesting relationships in the data. If we want to\n", 129 | "see a nonlinear mapping of the data, we can use one of the several\n", 130 | "methods in the `manifold` module. Here we'll use Isomap (a concatenation\n", 131 | "of Isometric Mapping) which is a manifold learning method based on\n", 132 | "graph theory:" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": { 139 | "collapsed": false 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "from sklearn.manifold import Isomap\n", 144 | "iso = Isomap(n_neighbors=5, n_components=2)\n", 145 | "proj = iso.fit_transform(digits.data)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": { 152 | "collapsed": false 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "plt.scatter(proj[:, 0], proj[:, 1], c=digits.target)\n", 157 | "plt.colorbar()" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "It can be fun to explore the various manifold learning methods available,\n", 165 | "and how the output depends on the various parameters used to tune the\n", 166 | "projection.\n", 167 | "In any case, these visualizations show us that there is hope: even a simple\n", 168 | "classifier should be able to adequately identify the members of the various\n", 169 | "classes." 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "**Question: Given these projections of the data, which numbers do you think\n", 177 | "a classifier might have trouble distinguishing?**" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "## Gaussian Naive Bayes Classification" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "For most classification problems, it's nice to have a simple, fast, go-to\n", 192 | "method to provide a quick baseline classification. If the simple and fast\n", 193 | "method is sufficient, then we don't have to waste CPU cycles on more complex\n", 194 | "models. If not, we can use the results of the simple method to give us\n", 195 | "clues about our data.\n", 196 | "\n", 197 | "One good method to keep in mind is Gaussian Naive Bayes. It is a *generative*\n", 198 | "classifier which fits an axis-aligned multi-dimensional Gaussian distribution to\n", 199 | "each training label, and uses this to quickly give a rough classification. 
It\n", 200 | "is generally not sufficiently accurate for real-world data, but can perform surprisingly well." 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": { 207 | "collapsed": false 208 | }, 209 | "outputs": [], 210 | "source": [ 211 | "from sklearn.naive_bayes import GaussianNB\n", 212 | "from sklearn.cross_validation import train_test_split" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": { 219 | "collapsed": false 220 | }, 221 | "outputs": [], 222 | "source": [ 223 | "# split the data into training and validation sets\n", 224 | "X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, random_state=0)\n", 225 | "\n", 226 | "# train the model\n", 227 | "clf = GaussianNB()\n", 228 | "clf.fit(X_train, y_train)\n", 229 | "\n", 230 | "# use the model to predict the labels of the test data\n", 231 | "predicted = clf.predict(X_test)\n", 232 | "expected = y_test" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "**Question**: why did we split the data into training and validation sets?" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "Let's plot the digits again with the predicted labels to get an idea of\n", 247 | "how well the classification is working:" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": { 254 | "collapsed": false 255 | }, 256 | "outputs": [], 257 | "source": [ 258 | "fig = plt.figure(figsize=(6, 6)) # figure size in inches\n", 259 | "fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)\n", 260 | "\n", 261 | "# plot the digits: each image is 8x8 pixels\n", 262 | "for i in range(64):\n", 263 | " ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])\n", 264 | " ax.imshow(X_test.reshape(-1, 8, 8)[i], cmap=plt.cm.binary,\n", 265 | " interpolation='nearest')\n", 266 | " \n", 267 | " # label the image with the target value\n", 268 | " if predicted[i] == expected[i]:\n", 269 | " ax.text(0, 7, str(predicted[i]), color='green')\n", 270 | " else:\n", 271 | " ax.text(0, 7, str(predicted[i]), color='red')" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "## Quantitative Measurement of Performance" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "We'd like to measure the performance of our estimator without having to resort\n", 286 | "to plotting examples. A simple method might be to simply compare the number of\n", 287 | "matches:" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "metadata": { 294 | "collapsed": false 295 | }, 296 | "outputs": [], 297 | "source": [ 298 | "matches = (predicted == expected)\n", 299 | "print(matches.sum())\n", 300 | "print(len(matches))" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": { 307 | "collapsed": false 308 | }, 309 | "outputs": [], 310 | "source": [ 311 | "matches.sum() / float(len(matches))" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": {}, 317 | "source": [ 318 | "We see that nearly 1500 of the 1800 predictions match the input. 
But there are other\n", 319 | "more sophisticated metrics that can be used to judge the performance of a classifier:\n", 320 | "several are available in the ``sklearn.metrics`` submodule.\n", 321 | "\n", 322 | "We can also use ``clf.score`` as a helper method to calculate how well the classifier performs." 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": { 329 | "collapsed": false 330 | }, 331 | "outputs": [], 332 | "source": [ 333 | "print(clf.score(X_test, y_test))" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": { 340 | "collapsed": true 341 | }, 342 | "outputs": [], 343 | "source": [] 344 | } 345 | ], 346 | "metadata": { 347 | "kernelspec": { 348 | "display_name": "Python 2", 349 | "language": "python", 350 | "name": "python2" 351 | }, 352 | "language_info": { 353 | "codemirror_mode": { 354 | "name": "ipython", 355 | "version": 2 356 | }, 357 | "file_extension": ".py", 358 | "mimetype": "text/x-python", 359 | "name": "python", 360 | "nbconvert_exporter": "python", 361 | "pygments_lexer": "ipython2", 362 | "version": "2.7.10" 363 | } 364 | }, 365 | "nbformat": 4, 366 | "nbformat_minor": 0 367 | } 368 | -------------------------------------------------------------------------------- /notebooks/03.2 Methods - Unsupervised Preprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Example from Image Processing" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "%matplotlib inline\n", 19 | "import matplotlib.pyplot as plt" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## Using PCA to extract features" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "Now we'll take a look at unsupervised learning on a facial recognition example.\n", 34 | "This uses a dataset available within scikit-learn consisting of a\n", 35 | "subset of the [Labeled Faces in the Wild](http://vis-www.cs.umass.edu/lfw/)\n", 36 | "data. Note that this is a relatively large download (~200MB) so it may\n", 37 | "take a while to execute." 
38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 4, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/plain": [ 50 | "(1288, 1850)" 51 | ] 52 | }, 53 | "execution_count": 4, 54 | "metadata": {}, 55 | "output_type": "execute_result" 56 | } 57 | ], 58 | "source": [ 59 | "from sklearn import datasets\n", 60 | "lfw_people = datasets.fetch_lfw_people(min_faces_per_person=70, resize=0.4,\n", 61 | " data_home='datasets')\n", 62 | "lfw_people.data.shape" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "Let's visualize these faces to see what we're working with:" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": { 76 | "collapsed": false 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "fig = plt.figure(figsize=(8, 6))\n", 81 | "# plot several images\n", 82 | "for i in range(15):\n", 83 | " ax = fig.add_subplot(3, 5, i + 1, xticks=[], yticks=[])\n", 84 | " ax.imshow(lfw_people.images[i], cmap=plt.cm.bone)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "We'll do a typical train-test split on the images before performing unsupervised learning:" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": { 98 | "collapsed": false 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "from sklearn.cross_validation import train_test_split\n", 103 | "X_train, X_test, y_train, y_test = train_test_split(lfw_people.data, lfw_people.target, random_state=0)\n", 104 | "\n", 105 | "print(X_train.shape, X_test.shape)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "## Feature Reduction Using Principal Component Analysis" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "We can use PCA to reduce the original 1850 features of the face images to a manageable\n", 120 | "size, while maintaining most of the information in the dataset. Here it is useful to use a variant\n", 121 | "of PCA called ``RandomizedPCA``, which is an approximation of PCA that can be much faster for large\n", 122 | "datasets." 
123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": { 129 | "collapsed": false 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "from sklearn import decomposition\n", 134 | "pca = decomposition.RandomizedPCA(n_components=150, whiten=True)\n", 135 | "pca.fit(X_train)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "One interesting part of PCA is that it computes the \"mean\" face, which can be\n", 143 | "interesting to examine:" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": { 150 | "collapsed": false 151 | }, 152 | "outputs": [], 153 | "source": [ 154 | "plt.imshow(pca.mean_.reshape((50, 37)), cmap=plt.cm.bone)" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "The principal components measure deviations about this mean along orthogonal axes.\n", 162 | "It is also interesting to visualize these principal components:" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": { 169 | "collapsed": false 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "print(pca.components_.shape)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": { 180 | "collapsed": false 181 | }, 182 | "outputs": [], 183 | "source": [ 184 | "fig = plt.figure(figsize=(16, 6))\n", 185 | "for i in range(30):\n", 186 | " ax = fig.add_subplot(3, 10, i + 1, xticks=[], yticks=[])\n", 187 | " ax.imshow(pca.components_[i].reshape((50, 37)), cmap=plt.cm.bone)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "The components (\"eigenfaces\") are ordered by their importance from top-left to bottom-right.\n", 195 | "We see that the first few components seem to primarily take care of lighting\n", 196 | "conditions; the remaining components pull out certain identifying features:\n", 197 | "the nose, eyes, eyebrows, etc." 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "With this projection computed, we can now project our original training\n", 205 | "and test data onto the PCA basis:" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": { 212 | "collapsed": false 213 | }, 214 | "outputs": [], 215 | "source": [ 216 | "X_train_pca = pca.transform(X_train)\n", 217 | "X_test_pca = pca.transform(X_test)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": { 224 | "collapsed": false 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "print(X_train_pca.shape)\n", 229 | "print(X_test_pca.shape)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "These projected components correspond to factors in a linear combination of\n", 237 | "component images such that the combination approaches the original face." 
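How much of the original variance do the 150 retained components actually capture? The fitted ``pca`` object exposes ``explained_variance_ratio_``, which gives one simple way to check and to choose the number of components. A small optional sketch:

```python
# How much variance do the 150 components retain?
# (uses the `pca` object fitted above)
import numpy as np

print("total explained variance: %.3f" % pca.explained_variance_ratio_.sum())

plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel("number of components")
plt.ylabel("cumulative explained variance")
```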
238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": { 244 | "collapsed": true 245 | }, 246 | "outputs": [], 247 | "source": [] 248 | } 249 | ], 250 | "metadata": { 251 | "kernelspec": { 252 | "display_name": "Python 2", 253 | "language": "python", 254 | "name": "python2" 255 | }, 256 | "language_info": { 257 | "codemirror_mode": { 258 | "name": "ipython", 259 | "version": 2 260 | }, 261 | "file_extension": ".py", 262 | "mimetype": "text/x-python", 263 | "name": "python", 264 | "nbconvert_exporter": "python", 265 | "pygments_lexer": "ipython2", 266 | "version": "2.7.6" 267 | } 268 | }, 269 | "nbformat": 4, 270 | "nbformat_minor": 0 271 | } 272 | -------------------------------------------------------------------------------- /notebooks/03.3 Case Study - Face Recognition with Eigenfaces.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Example from Image Processing" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "%matplotlib inline\n", 19 | "import matplotlib.pyplot as plt" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "Here we'll take a look at a simple facial recognition example.\n", 27 | "This uses a dataset available within scikit-learn consisting of a\n", 28 | "subset of the [Labeled Faces in the Wild](http://vis-www.cs.umass.edu/lfw/)\n", 29 | "data. Note that this is a relatively large download (~200MB) so it may\n", 30 | "take a while to execute." 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": { 37 | "collapsed": false 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "from sklearn import datasets\n", 42 | "lfw_people = datasets.fetch_lfw_people(min_faces_per_person=70, resize=0.4,\n", 43 | " data_home='datasets')\n", 44 | "lfw_people.data.shape" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "If you're on a unix-based system such as linux or Mac OSX, these shell commands\n", 52 | "can be used to see the downloaded dataset:" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": { 59 | "collapsed": false 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "!ls datasets" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": { 70 | "collapsed": false 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "!du -sh datasets/lfw_home" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "Once again, let's visualize these faces to see what we're working with:" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": false 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "fig = plt.figure(figsize=(8, 6))\n", 93 | "# plot several images\n", 94 | "for i in range(15):\n", 95 | " ax = fig.add_subplot(3, 5, i + 1, xticks=[], yticks=[])\n", 96 | " ax.imshow(lfw_people.images[i], cmap=plt.cm.bone)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": { 103 | "collapsed": false 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "import numpy as np\n", 108 | "plt.figure(figsize=(10, 2))\n", 109 | "\n", 110 | "unique_targets = 
np.unique(lfw_people.target)\n", 111 | "counts = [(lfw_people.target == i).sum() for i in unique_targets]\n", 112 | "\n", 113 | "plt.xticks(unique_targets, lfw_people.target_names[unique_targets])\n", 114 | "locs, labels = plt.xticks()\n", 115 | "plt.setp(labels, rotation=45, size=14)\n", 116 | "_ = plt.bar(unique_targets, counts)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "One thing to note is that these faces have already been localized and scaled\n", 124 | "to a common size. This is an important preprocessing piece for facial\n", 125 | "recognition, and is a process that can require a large collection of training\n", 126 | "data. This can be done in scikit-learn, but the challenge is gathering a\n", 127 | "sufficient amount of training data for the algorithm to work\n", 128 | "\n", 129 | "Fortunately, this piece is common enough that it has been done. One good\n", 130 | "resource is [OpenCV](http://opencv.willowgarage.com/wiki/FaceRecognition), the\n", 131 | "*Open Computer Vision Library*." 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "We'll perform a Support Vector classification of the images. We'll\n", 139 | "do a typical train-test split on the images to make this happen:" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": { 146 | "collapsed": false 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "from sklearn.cross_validation import train_test_split\n", 151 | "X_train, X_test, y_train, y_test = train_test_split(\n", 152 | " lfw_people.data, lfw_people.target, random_state=0)\n", 153 | "\n", 154 | "print(X_train.shape, X_test.shape)" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "## Preprocessing: Principal Component Analysis" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "1850 dimensions is a lot for SVM. We can use PCA to reduce these 1850 features to a manageable\n", 169 | "size, while maintaining most of the information in the dataset. Here it is useful to use a variant\n", 170 | "of PCA called ``RandomizedPCA``, which is an approximation of PCA that can be much faster for large\n", 171 | "datasets. We saw this method in the previous notebook, and will use it again here:" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": { 178 | "collapsed": false 179 | }, 180 | "outputs": [], 181 | "source": [ 182 | "from sklearn import decomposition\n", 183 | "pca = decomposition.RandomizedPCA(n_components=150, whiten=True,\n", 184 | " random_state=1999)\n", 185 | "pca.fit(X_train)\n", 186 | "X_train_pca = pca.transform(X_train)\n", 187 | "X_test_pca = pca.transform(X_test)\n", 188 | "print(X_train_pca.shape)\n", 189 | "print(X_test_pca.shape)" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "These projected components correspond to factors in a linear combination of\n", 197 | "component images such that the combination approaches the original face. In general, PCA can be a powerful technique for preprocessing that can greatly improve classification performance." 
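The SVM hyperparameters used in the next section (``C`` and ``gamma``) were presumably chosen ahead of time; a common way to pick them yourself is a cross-validated grid search on the PCA-reduced training data. This is an optional sketch using ``sklearn.grid_search.GridSearchCV``, which matches the older scikit-learn API used in these notebooks (newer versions moved it to ``sklearn.model_selection``). It may take a little while to run.

```python
# Optional: choose C and gamma by cross-validated grid search.
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV  # sklearn.model_selection in newer versions

param_grid = {'C': [1., 5., 10.], 'gamma': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(SVC(), param_grid, cv=3)
grid.fit(X_train_pca, y_train)

print(grid.best_params_)
print(grid.best_score_)
```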
198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "## Doing the Learning: Support Vector Machines" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "Now we'll perform support-vector-machine classification on this reduced dataset:" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": { 218 | "collapsed": false 219 | }, 220 | "outputs": [], 221 | "source": [ 222 | "from sklearn import svm\n", 223 | "clf = svm.SVC(C=5., gamma=0.001)\n", 224 | "clf.fit(X_train_pca, y_train)" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "Finally, we can evaluate how well this classification did. First, we might plot a\n", 232 | "few of the test-cases with the labels learned from the training set:" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": { 239 | "collapsed": false 240 | }, 241 | "outputs": [], 242 | "source": [ 243 | "fig = plt.figure(figsize=(8, 6))\n", 244 | "for i in range(15):\n", 245 | " ax = fig.add_subplot(3, 5, i + 1, xticks=[], yticks=[])\n", 246 | " ax.imshow(X_test[i].reshape((50, 37)), cmap=plt.cm.bone)\n", 247 | " y_pred = clf.predict(X_test_pca[i])[0]\n", 248 | " color = 'black' if y_pred == y_test[i] else 'red'\n", 249 | " ax.set_title(lfw_people.target_names[y_pred], fontsize='small', color=color)" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": {}, 255 | "source": [ 256 | "The classifier is correct on an impressive number of images given the simplicity\n", 257 | "of its learning model! Using a linear classifier on 150 features derived from\n", 258 | "the pixel-level data, the algorithm correctly identifies a large number of the\n", 259 | "people in the images.\n", 260 | "\n", 261 | "Again, we can\n", 262 | "quantify this effectiveness using ``clf.score``" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": { 269 | "collapsed": false 270 | }, 271 | "outputs": [], 272 | "source": [ 273 | "print(clf.score(X_test_pca, y_test))" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": {}, 279 | "source": [ 280 | "## Final Note" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": {}, 286 | "source": [ 287 | "Here we have used PCA \"eigenfaces\" as a pre-processing step for facial recognition.\n", 288 | "The reason we chose this is because PCA is a broadly-applicable technique, which can\n", 289 | "be useful for a wide array of data types. For more details on the eigenfaces approach, see the original paper by [Turk and Penland, Eigenfaces for Recognition](http://www.face-rec.org/algorithms/PCA/jcn.pdf). Research in the field of facial recognition has moved much farther beyond this paper, and has shown specific feature extraction methods can be more effective. However, eigenfaces is a canonical example of machine learning \"in the wild\", and is a simple method with good results." 
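As a final check, the single accuracy number from ``clf.score`` can be broken down per person using the tools in ``sklearn.metrics``; this optional sketch prints precision and recall for each identity along with the confusion matrix:

```python
# Optional: per-class metrics for the face classifier above.
from sklearn import metrics

y_pred = clf.predict(X_test_pca)
print(metrics.classification_report(y_test, y_pred,
                                    target_names=lfw_people.target_names))
print(metrics.confusion_matrix(y_test, y_pred))
```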
290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": { 296 | "collapsed": true 297 | }, 298 | "outputs": [], 299 | "source": [] 300 | } 301 | ], 302 | "metadata": { 303 | "kernelspec": { 304 | "display_name": "Python 2", 305 | "language": "python", 306 | "name": "python2" 307 | }, 308 | "language_info": { 309 | "codemirror_mode": { 310 | "name": "ipython", 311 | "version": 2 312 | }, 313 | "file_extension": ".py", 314 | "mimetype": "text/x-python", 315 | "name": "python", 316 | "nbconvert_exporter": "python", 317 | "pygments_lexer": "ipython2", 318 | "version": "2.7.10" 319 | } 320 | }, 321 | "nbformat": 4, 322 | "nbformat_minor": 0 323 | } 324 | -------------------------------------------------------------------------------- /notebooks/03.4 Methods - Text Feature Extraction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import numpy as np" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# Text Feature Extraction with Bag-of-Words\n", 21 | "In many tasks, such as classical spam detection, your input data is text.\n", 22 | "Free text with variable length is very far from the fixed-length numeric representation that we need to do machine learning with scikit-learn.\n", 23 | "However, there is an easy and effective way to go from text data to a numeric representation that we can use with our models, called bag-of-words." 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "\n" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "Let's assume that each sample in your dataset is represented as one string, which could be just a sentence, an email, or a whole news article or book. To represent the sample, we first split the string into a list of tokens, which correspond to (somewhat normalized) words. A simple way to do this is to split by whitespace and then lowercase each word.\n", 38 | "Then, we build a vocabulary of all tokens (lowercased words) that appear in our whole dataset. This is usually a very large vocabulary.\n", 39 | "Finally, looking at a single sample, we count how often each word in the vocabulary appears.\n", 40 | "We represent our string by a vector, where each entry counts how often a given word in the vocabulary appears in the string.\n", 41 | "\n", 42 | "As each sample will only contain very few of the words, most entries will be zero, leading to a very high-dimensional but sparse representation.\n", 43 | "\n", 44 | "The method is called bag-of-words because the order of the words is lost entirely."
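Before handing this job to scikit-learn below, it can help to see the bag-of-words idea spelled out in plain Python. This is only an illustrative sketch on the two toy documents used in the next cells; note that ``CountVectorizer`` additionally strips punctuation via a token pattern, so its vocabulary will differ slightly.

```
# A hand-rolled bag-of-words, just to illustrate the idea.
docs = ["Some say the world will end in fire,",
        "Some say in ice."]

# 1. tokenize: split on whitespace and lowercase
tokenized = [doc.lower().split() for doc in docs]

# 2. build the vocabulary over the whole dataset
vocabulary = sorted(set(word for doc in tokenized for word in doc))

# 3. count how often each vocabulary word appears in each document
vectors = [[doc.count(word) for word in vocabulary] for doc in tokenized]

print(vocabulary)
print(vectors)
```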
45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "collapsed": true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "X = [\"Some say the world will end in fire,\",\n", 56 | " \"Some say in ice.\"]" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "collapsed": false 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "len(X)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "from sklearn.feature_extraction.text import CountVectorizer\n", 79 | "\n", 80 | "vectorizer = CountVectorizer()\n", 81 | "vectorizer.fit(X)\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": false 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "vectorizer.vocabulary_" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": { 99 | "collapsed": false 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "X_bag_of_words = vectorizer.transform(X)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "collapsed": false 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "X_bag_of_words.shape" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": { 121 | "collapsed": false 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "X_bag_of_words" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": { 132 | "collapsed": false 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "X_bag_of_words.toarray()" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": { 143 | "collapsed": false 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "vectorizer.get_feature_names()" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": { 154 | "collapsed": false 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "vectorizer.inverse_transform(X_bag_of_words)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "# Tfidf Encoding\n", 166 | "A useful transformation that is often applied to the bag-of-word encoding is the so-called term-frequency inverse-document-frequency (Tfidf) scaling, which is a non-linear transformation of the word counts.\n", 167 | "\n", 168 | "The Tfidf encoding rescales words that are common to have less weight:" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": { 175 | "collapsed": false 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 180 | "\n", 181 | "tfidf_vectorizer = TfidfVectorizer()\n", 182 | "tfidf_vectorizer.fit(X)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": { 189 | "collapsed": false 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "import numpy as np\n", 194 | "np.set_printoptions(precision=2)\n", 195 | "\n", 196 | "print(tfidf_vectorizer.transform(X).toarray())" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "# Bigrams and N-Grams\n", 204 | "Entirely discarding word order is not always a good idea, as composite phrases often have specific meaning, and 
modifiers like \"not\" can invert the meaning of words.\n", 205 | "A simple way to include some word order are n-grams, which don't only look at a single token, but at all pairs of neighborhing tokens:" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": { 212 | "collapsed": false 213 | }, 214 | "outputs": [], 215 | "source": [ 216 | "# look at sequences of tokens of minimum length 2 and maximum length 2\n", 217 | "bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))\n", 218 | "bigram_vectorizer.fit(X)" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": { 225 | "collapsed": false 226 | }, 227 | "outputs": [], 228 | "source": [ 229 | "bigram_vectorizer.get_feature_names()" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": { 236 | "collapsed": false 237 | }, 238 | "outputs": [], 239 | "source": [ 240 | "bigram_vectorizer.transform(X).toarray()" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "Often we want to include unigrams (sigle tokens) and bigrams:" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": { 254 | "collapsed": false 255 | }, 256 | "outputs": [], 257 | "source": [ 258 | "gram_vectorizer = CountVectorizer(ngram_range=(1, 2))\n", 259 | "gram_vectorizer.fit(X)" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": { 266 | "collapsed": false 267 | }, 268 | "outputs": [], 269 | "source": [ 270 | "gram_vectorizer.get_feature_names()" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": { 277 | "collapsed": false 278 | }, 279 | "outputs": [], 280 | "source": [ 281 | "gram_vectorizer.transform(X).toarray()" 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "metadata": {}, 287 | "source": [ 288 | "Character n-grams\n", 289 | "=================\n", 290 | "Sometimes it is also helpful to not look at words, but instead single character.\n", 291 | "That is particularly useful if you have very noisy data, want to identify the language, or we want to predict something about a single word.\n", 292 | "We can simply look at characters instead of words by setting ``analyzer=\"char\"``.\n", 293 | "Looking at single characters is usually not very informative, but looking at longer n-grams of characters can be:" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": { 300 | "collapsed": false 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "char_vectorizer = CountVectorizer(ngram_range=(2, 2), analyzer=\"char\")\n", 305 | "char_vectorizer.fit(X)" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": { 312 | "collapsed": false, 313 | "scrolled": true 314 | }, 315 | "outputs": [], 316 | "source": [ 317 | "print(char_vectorizer.get_feature_names())" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "metadata": { 324 | "collapsed": true 325 | }, 326 | "outputs": [], 327 | "source": [] 328 | } 329 | ], 330 | "metadata": { 331 | "kernelspec": { 332 | "display_name": "Python 2", 333 | "language": "python", 334 | "name": "python2" 335 | }, 336 | "language_info": { 337 | "codemirror_mode": { 338 | "name": "ipython", 339 | "version": 2 340 | }, 341 | "file_extension": ".py", 342 | "mimetype": "text/x-python", 343 | 
"name": "python", 344 | "nbconvert_exporter": "python", 345 | "pygments_lexer": "ipython2", 346 | "version": "2.7.6" 347 | } 348 | }, 349 | "nbformat": 4, 350 | "nbformat_minor": 0 351 | } 352 | -------------------------------------------------------------------------------- /notebooks/03.5 Case Study - SMS Spam Detection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import numpy as np\n" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# Text classification for SMS spam detection" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "collapsed": false 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "import os\n", 32 | "with open(os.path.join(\"datasets\", \"smsspam\", \"SMSSpamCollection\")) as f:\n", 33 | " lines = [line.strip().split(\"\\t\") for line in f.readlines()]\n", 34 | "text = [x[1] for x in lines]\n", 35 | "y = [x[0] == \"ham\" for x in lines]" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": { 42 | "collapsed": false, 43 | "scrolled": true 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "text[:10]" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": { 54 | "collapsed": false, 55 | "scrolled": true 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "y[:10]" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": { 66 | "collapsed": false 67 | }, 68 | "outputs": [], 69 | "source": [ 70 | "type(text)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": { 77 | "collapsed": false 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "type(y)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": false 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "from sklearn.cross_validation import train_test_split\n", 93 | "\n", 94 | "text_train, text_test, y_train, y_test = train_test_split(text, y, random_state=42)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": { 101 | "collapsed": true 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "from sklearn.feature_extraction.text import CountVectorizer\n", 106 | "\n", 107 | "vectorizer = CountVectorizer()\n", 108 | "vectorizer.fit(text_train)\n", 109 | "\n", 110 | "X_train = vectorizer.transform(text_train)\n", 111 | "X_test = vectorizer.transform(text_test)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": { 118 | "collapsed": false, 119 | "scrolled": true 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "print(len(vectorizer.vocabulary_))\n" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": { 130 | "collapsed": false 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "X_train.shape" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "collapsed": false 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "print(vectorizer.get_feature_names()[:20])\n" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 
null, 151 | "metadata": { 152 | "collapsed": false 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "print(vectorizer.get_feature_names()[3000:3020])" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": { 163 | "collapsed": false 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "print(X_train.shape)\n", 168 | "print(X_test.shape)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "### Training a Classifier on Text Features" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "We can now train a classifier, for instance a logistic regression classifier which is a fast baseline for text classification tasks:" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": { 189 | "collapsed": false 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "from sklearn.linear_model import LogisticRegression\n", 194 | "\n", 195 | "clf = LogisticRegression()\n", 196 | "clf" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": { 203 | "collapsed": false 204 | }, 205 | "outputs": [], 206 | "source": [ 207 | "clf.fit(X_train, y_train)" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "We can now evaluate the classifier on the testing set. Let's first use the builtin score function, which is the rate of correct classification in the test set:" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": { 221 | "collapsed": false 222 | }, 223 | "outputs": [], 224 | "source": [ 225 | "clf.score(X_test, y_test)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "We can also compute the score on the training set, to see how well we do there:" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": { 239 | "collapsed": false 240 | }, 241 | "outputs": [], 242 | "source": [ 243 | "clf.score(X_train, y_train)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "#Visualizing important features" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": { 257 | "collapsed": true 258 | }, 259 | "outputs": [], 260 | "source": [ 261 | "def visualize_coefficients(classifier, feature_names, n_top_features=25):\n", 262 | " # get coefficients with large absolute values \n", 263 | " coef = classifier.coef_.ravel()\n", 264 | " positive_coefficients = np.argsort(coef)[-n_top_features:]\n", 265 | " negative_coefficients = np.argsort(coef)[:n_top_features]\n", 266 | " interesting_coefficients = np.hstack([negative_coefficients, positive_coefficients])\n", 267 | " # plot them\n", 268 | " plt.figure(figsize=(15, 5))\n", 269 | " colors = [\"red\" if c < 0 else \"blue\" for c in coef[interesting_coefficients]]\n", 270 | " plt.bar(np.arange(50), coef[interesting_coefficients], color=colors)\n", 271 | " feature_names = np.array(feature_names)\n", 272 | " plt.xticks(np.arange(1, 51), feature_names[interesting_coefficients], rotation=60, ha=\"right\");" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "metadata": { 279 | "collapsed": false 280 | }, 281 | "outputs": [], 282 | "source": [ 283 | "visualize_coefficients(clf, vectorizer.get_feature_names())\n" 284 | ] 285 | }, 
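The next cell refits the vectorizer with ``min_df=2``, meaning a token must appear in at least two training documents to enter the vocabulary. This prunes very rare tokens (often typos or one-off strings) that the classifier could otherwise latch onto. A quick way to see the effect, sketched here under the assumption that ``text_train`` is still in scope, is to compare vocabulary sizes:

```
# Compare vocabulary sizes with and without the min_df cutoff.
from sklearn.feature_extraction.text import CountVectorizer

print(len(CountVectorizer().fit(text_train).vocabulary_))
print(len(CountVectorizer(min_df=2).fit(text_train).vocabulary_))
```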
286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": { 290 | "collapsed": false 291 | }, 292 | "outputs": [], 293 | "source": [ 294 | "vectorizer = CountVectorizer(min_df=2)\n", 295 | "vectorizer.fit(text_train)\n", 296 | "\n", 297 | "X_train = vectorizer.transform(text_train)\n", 298 | "X_test = vectorizer.transform(text_test)\n", 299 | "\n", 300 | "clf = LogisticRegression()\n", 301 | "clf.fit(X_train, y_train)\n", 302 | "\n", 303 | "print(clf.score(X_train, y_train))\n", 304 | "print(clf.score(X_test, y_test))" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": { 311 | "collapsed": false 312 | }, 313 | "outputs": [], 314 | "source": [ 315 | "visualize_coefficients(clf, vectorizer.get_feature_names())\n" 316 | ] 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "metadata": {}, 321 | "source": [ 322 | "\n" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "# Exercises\n", 330 | "\n", 331 | "Use TfidfVectorizer instead of CountVectorizer. Are the results better? How are the coefficients different?\n", 332 | "\n", 333 | "Change the parameters min_df and ngram_range of the TfidfVectorizer and CountVectorizer. How does that change the important features?\n", 334 | "\n" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": { 341 | "collapsed": true 342 | }, 343 | "outputs": [], 344 | "source": [] 345 | } 346 | ], 347 | "metadata": { 348 | "kernelspec": { 349 | "display_name": "Python 2", 350 | "language": "python", 351 | "name": "python2" 352 | }, 353 | "language_info": { 354 | "codemirror_mode": { 355 | "name": "ipython", 356 | "version": 2 357 | }, 358 | "file_extension": ".py", 359 | "mimetype": "text/x-python", 360 | "name": "python", 361 | "nbconvert_exporter": "python", 362 | "pygments_lexer": "ipython2", 363 | "version": "2.7.6" 364 | } 365 | }, 366 | "nbformat": 4, 367 | "nbformat_minor": 0 368 | } 369 | -------------------------------------------------------------------------------- /notebooks/03.6 Case Study - Titanic Survival.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Feature Extraction" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Here we will talk about an important piece of machine learning: the extraction of\n", 15 | "quantitative features from data. By the end of this section you will\n", 16 | "\n", 17 | "- Know how features are extracted from real-world data.\n", 18 | "- See an example of extracting numerical features from textual data\n", 19 | "\n", 20 | "In addition, we will go over several basic tools within scikit-learn which can be used to accomplish the above tasks." 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "## What Are Features?" 
28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "### Numerical Features" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "Recall that data in scikit-learn is expected to be in two-dimensional arrays, of size\n", 42 | "**n_samples** $\\times$ **n_features**.\n", 43 | "\n", 44 | "Previously, we looked at the iris dataset, which has 150 samples and 4 features" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "collapsed": false 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "from sklearn.datasets import load_iris\n", 56 | "iris = load_iris()\n", 57 | "print(iris.data.shape)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "These features are:\n", 65 | "\n", 66 | "- sepal length in cm\n", 67 | "- sepal width in cm\n", 68 | "- petal length in cm\n", 69 | "- petal width in cm\n", 70 | "\n", 71 | "Numerical features such as these are pretty straightforward: each sample contains a list\n", 72 | "of floating-point numbers corresponding to the features" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "### Categorical Features" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "What if you have categorical features? For example, imagine there is data on the color of each\n", 87 | "iris:\n", 88 | "\n", 89 | " color in [red, blue, purple]\n", 90 | "\n", 91 | "You might be tempted to assign numbers to these features, i.e. *red=1, blue=2, purple=3*\n", 92 | "but in general **this is a bad idea**. Estimators tend to operate under the assumption that\n", 93 | "numerical features lie on some continuous scale, so, for example, 1 and 2 are more alike\n", 94 | "than 1 and 3, and this is often not the case for categorical features.\n", 95 | "\n", 96 | "A better strategy is to give each category its own dimension. \n", 97 | "The enriched iris feature set would hence be in this case:\n", 98 | "\n", 99 | "- sepal length in cm\n", 100 | "- sepal width in cm\n", 101 | "- petal length in cm\n", 102 | "- petal width in cm\n", 103 | "- color=purple (1.0 or 0.0)\n", 104 | "- color=blue (1.0 or 0.0)\n", 105 | "- color=red (1.0 or 0.0)\n", 106 | "\n", 107 | "Note that using many of these categorical features may result in data which is better\n", 108 | "represented as a **sparse matrix**, as we'll see with the text classification example\n", 109 | "below." 
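As a tiny illustration of this one-of-K ("one-hot") expansion, consider the sketch below; the color values are hypothetical and not part of the real iris data. The point is only how one categorical column becomes three 0/1 columns:

```
# One-hot encoding of a hypothetical color feature, done by hand.
import numpy as np

colors = ['red', 'blue', 'purple', 'blue']   # made-up categorical values
categories = ['purple', 'blue', 'red']       # one output column per category

one_hot = np.array([[1.0 if value == category else 0.0 for category in categories]
                    for value in colors])
print(one_hot)
```

The ``DictVectorizer`` introduced next performs exactly this expansion automatically.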
110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "#### Using the DictVectorizer to encode categorical features" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "When the source data is encoded as a list of dicts where the values are either string names for categories or numerical values, you can use the `DictVectorizer` class to compute the boolean expansion of the categorical features while leaving the numerical features unimpacted:" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": { 130 | "collapsed": false 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "measurements = [\n", 135 | " {'city': 'Dubai', 'temperature': 33.},\n", 136 | " {'city': 'London', 'temperature': 12.},\n", 137 | " {'city': 'San Francisco', 'temperature': 18.},\n", 138 | "]" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": { 145 | "collapsed": false 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "from sklearn.feature_extraction import DictVectorizer\n", 150 | "\n", 151 | "vec = DictVectorizer()\n", 152 | "vec" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": { 159 | "collapsed": false 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "vec.fit_transform(measurements).toarray()" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": { 170 | "collapsed": false 171 | }, 172 | "outputs": [], 173 | "source": [ 174 | "vec.get_feature_names()" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "### Derived Features" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "Another common feature type is **derived features**, where some pre-processing step is\n", 189 | "applied to the data to generate features that are somehow more informative. Derived\n", 190 | "features may be based on **dimensionality reduction** (such as PCA or manifold learning),\n", 191 | "may be linear or nonlinear combinations of features (such as in polynomial regression),\n", 192 | "or may be some more sophisticated transform of the features. The latter is often used\n", 193 | "in image processing.\n", 194 | "\n", 195 | "For example, [scikit-image](http://scikit-image.org/) provides a variety of feature\n", 196 | "extractors designed for image data: see the ``skimage.feature`` submodule.\n", 197 | "We will see some *dimensionality*-based feature extraction routines later in the tutorial." 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "### Combining Numerical and Categorical Features" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "As an example of how to work with both categorical and numerical data, we will perform survival prediction for the passengers of the RMS Titanic.\n", 212 | "\n", 213 | "We will use a version of the Titanic (titanic3.xls) dataset from Thomas Cason, as retrieved from Frank Harrell's webpage [here](http://lib.stat.cmu.edu/S/Harrell/data/descriptions/titanic.html).
We converted the .xls to .csv for easier manipulation without involving external libraries, but the data is otherwise unchanged.\n", 214 | "\n", 215 | "We need to read in all the lines from the (titanic3.csv) file, set aside the keys from the first line, and find our labels (who survived or died) and data (attributes of that person). Let's look at the keys and some corresponding example lines." 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": { 222 | "collapsed": false 223 | }, 224 | "outputs": [], 225 | "source": [ 226 | "import os\n", 227 | "f = open(os.path.join('datasets', 'titanic', 'titanic3.csv'))\n", 228 | "print(f.readline())\n", 229 | "lines = []\n", 230 | "for i in range(3):\n", 231 | " lines.append(f.readline())\n", 232 | "print(lines)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "The site [linked here](http://lib.stat.cmu.edu/S/Harrell/data/descriptions/titanic3info.txt) gives a broad description of the keys and what they mean - we show it here for completeness\n", 240 | "\n", 241 | "```\n", 242 | "pclass Passenger Class\n", 243 | " (1 = 1st; 2 = 2nd; 3 = 3rd)\n", 244 | "survival Survival\n", 245 | " (0 = No; 1 = Yes)\n", 246 | "name Name\n", 247 | "sex Sex\n", 248 | "age Age\n", 249 | "sibsp Number of Siblings/Spouses Aboard\n", 250 | "parch Number of Parents/Children Aboard\n", 251 | "ticket Ticket Number\n", 252 | "fare Passenger Fare\n", 253 | "cabin Cabin\n", 254 | "embarked Port of Embarkation\n", 255 | " (C = Cherbourg; Q = Queenstown; S = Southampton)\n", 256 | "boat Lifeboat\n", 257 | "body Body Identification Number\n", 258 | "home.dest Home/Destination\n", 259 | "```\n", 260 | "\n", 261 | "In general, it looks like `name`, `sex`, `cabin`, `embarked`, `boat`, `body`, and `homedest` may be candidates for categorical features, while the rest appear to be numerical features. We can now write a function to extract features from a text line, shown below." 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": {}, 267 | "source": [ 268 | "Let's process an example line using the `process_titanic_line` function from `helpers` to see the expected output." 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": { 275 | "collapsed": false 276 | }, 277 | "outputs": [], 278 | "source": [ 279 | "from helpers import process_titanic_line\n", 280 | "print(process_titanic_line(lines[0]))" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": {}, 286 | "source": [ 287 | "Now that we see the expected format from the line, we can call a dataset helper which uses this processing to read in the whole dataset. See ``helpers.py`` for more details." 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "metadata": { 294 | "collapsed": false 295 | }, 296 | "outputs": [], 297 | "source": [ 298 | "from helpers import load_titanic\n", 299 | "keys, train_data, test_data, train_labels, test_labels = load_titanic(\n", 300 | " test_size=0.2, feature_skip_tuple=(), random_state=1999)\n", 301 | "print(\"Key list: %s\" % keys)" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": {}, 307 | "source": [ 308 | "With all of the hard data loading work out of the way, evaluating a classifier on this data becomes straightforward. Setting up the simplest possible model, we want to see what the simplest score can be with `DummyClassifier`." 
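As a reminder of what this baseline means: the ``'most_frequent'`` strategy always predicts whichever class is most common in the training labels, so the same number can be computed by hand. The sketch below assumes the ``train_labels`` and ``test_labels`` returned by ``load_titanic`` above:

```
# The majority-class baseline, computed without DummyClassifier.
import numpy as np

values, counts = np.unique(train_labels, return_counts=True)
majority_class = values[np.argmax(counts)]
print("Majority-class accuracy: %f" % np.mean(np.asarray(test_labels) == majority_class))
```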
309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": null, 314 | "metadata": { 315 | "collapsed": false 316 | }, 317 | "outputs": [], 318 | "source": [ 319 | "from sklearn.metrics import accuracy_score\n", 320 | "from sklearn.dummy import DummyClassifier\n", 321 | "clf = DummyClassifier('most_frequent')\n", 322 | "clf.fit(train_data, train_labels)\n", 323 | "pred_labels = clf.predict(test_data)\n", 324 | "print(\"Prediction accuracy: %f\" % accuracy_score(pred_labels, test_labels))" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": { 330 | "collapsed": true 331 | }, 332 | "source": [ 333 | "Exercise\n", 334 | "=====\n", 335 | "Try executing the above classification, using RandomForestClassifier instead of DummyClassifier\n", 336 | "\n", 337 | "Can you remove or create new features to improve your score? Try printing feature importance as shown in this [sklearn example](http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html) and removing features by adding arguments to feature_skip_tuple." 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": { 344 | "collapsed": true 345 | }, 346 | "outputs": [], 347 | "source": [] 348 | } 349 | ], 350 | "metadata": { 351 | "kernelspec": { 352 | "display_name": "Python 2", 353 | "language": "python", 354 | "name": "python2" 355 | }, 356 | "language_info": { 357 | "codemirror_mode": { 358 | "name": "ipython", 359 | "version": 2 360 | }, 361 | "file_extension": ".py", 362 | "mimetype": "text/x-python", 363 | "name": "python", 364 | "nbconvert_exporter": "python", 365 | "pygments_lexer": "ipython2", 366 | "version": "2.7.10" 367 | } 368 | }, 369 | "nbformat": 4, 370 | "nbformat_minor": 0 371 | } 372 | -------------------------------------------------------------------------------- /notebooks/05.1 In Depth - Linear Models.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "import numpy as np\n", 13 | "import matplotlib.pyplot as plt" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# In depth with linear models" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "Linear models are useful when little data is available or for very large feature spaces, as in text classification. In addition, they form a good case study for regularization." 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Linear models for regression" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "All linear models for regression learn a coefficient parameter ``coef_`` and an offset ``intercept_`` to make predictions using a linear combination of features:\n", 42 | "```\n", 43 | "y_pred = x_test[0] * coef_[0] + ... + x_test[n_features-1] * coef_[n_features-1] + intercept_\n", 44 | "```" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "The difference between the linear models for regression is what kind of restrictions are put on ``coef_`` and ``intercept_`` (know as regularization), in addition to fitting the training data well." 
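To make the prediction formula above concrete, here is a minimal check on made-up data (not part of the original notebook) that ``predict`` really is just a dot product with ``coef_`` plus ``intercept_``:

```
# Verify that predict(X) == X.dot(coef_) + intercept_ for a fitted linear model.
import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
X_demo = rng.normal(size=(20, 3))
y_demo = np.dot(X_demo, [1., 2., 3.]) + 0.5

lr = LinearRegression().fit(X_demo, y_demo)
manual_prediction = np.dot(X_demo, lr.coef_) + lr.intercept_
print(np.allclose(manual_prediction, lr.predict(X_demo)))  # True
```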
52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "The most standard linear model is the 'ordinary least squares regression', often simply called 'linear regression'. It doesn't put any additional restrictions on ``coef_``, so when the number of features is large, it becomes ill-posed and the model overfits.\n", 59 | "\n", 60 | "Let us generate a simple simulation, to see the behavior of these models." 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "rng = np.random.RandomState(4)\n", 72 | "X = rng.normal(size=(1000, 50))\n", 73 | "beta = rng.normal(size=50)\n", 74 | "y = np.dot(X, beta) + 4 * rng.normal(size=1000)\n", 75 | "\n", 76 | "from sklearn.utils import shuffle\n", 77 | "X, y = shuffle(X, y)\n", 78 | "\n", 79 | "from sklearn import linear_model, cross_validation\n", 80 | "from sklearn.learning_curve import learning_curve\n", 81 | "\n", 82 | "def plot_learning_curve(est, X, y):\n", 83 | " training_set_size, train_scores, test_scores = learning_curve(est, X, y, train_sizes=np.linspace(.1, 1, 30))\n", 84 | " estimator_name = est.__class__.__name__\n", 85 | " line = plt.plot(training_set_size, train_scores.mean(axis=1), '--', label=\"training scores \" + estimator_name)\n", 86 | " plt.plot(training_set_size, test_scores.mean(axis=1), '-', label=\"test scores \" + estimator_name, c=line[0].get_color())\n", 87 | " plt.xlabel('Training set size')\n", 88 | " plt.legend(loc='best')\n", 89 | " #plt.ylim(-1, 1)\n", 90 | " \n", 91 | "plot_learning_curve(linear_model.LinearRegression(), X, y)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "As we can see, the ordinary linear regression is not defined if there are less training samples than features. In the presence of noise, it does poorly as long as the number of samples is not several times the number of features.\n", 99 | "\n", 100 | "The LinearRegression is then overfitting: fitting noise. We need to regularize.\n", 101 | "\n", 102 | "**The Ridge estimator** is a simple regularization (called l2 penalty) of the ordinary LinearRegression. In particular, it has the benefit of being not computationally more expensive than the ordinary least square estimate." 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": { 109 | "collapsed": false 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "plot_learning_curve(linear_model.LinearRegression(), X, y)\n", 114 | "plot_learning_curve(linear_model.Ridge(alpha=20), X, y)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "We can see that in the low-sample limit, the Ridge estimator performs much better than the unregularized model.\n", 122 | "\n", 123 | "The regularization of the Ridge is a shrinkage: the coefficients learned are biased towards zero. Too much bias is not beneficial, but with very few samples, we will need more bias.\n", 124 | "\n", 125 | "The amount of regularization is set via the `alpha` parameter of the Ridge. Tuning it is critical for performance. 
We can set it automatically by cross-validation using the RidgeCV estimator:" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": { 132 | "collapsed": false 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "plot_learning_curve(linear_model.LinearRegression(), X, y)\n", 137 | "plot_learning_curve(linear_model.Ridge(), X, y)\n", 138 | "plot_learning_curve(linear_model.RidgeCV(), X, y)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "**The Lasso estimator** is useful to impose sparsity on the coefficient. In other words, it is to be prefered if we believe that many of the features are not relevant. This is done via the so-called l1 penalty.\n", 146 | "\n", 147 | "Let us create such a situation with a new simulation where only 10 out of the 50 features are relevant:" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": { 154 | "collapsed": false 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "beta[10:] = 0\n", 159 | "y = np.dot(X, beta) + 4*rng.normal(size=1000)\n", 160 | "plot_learning_curve(linear_model.Ridge(), X, y)\n", 161 | "plot_learning_curve(linear_model.Lasso(), X, y)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "We can see that the Lasso estimator performs, in our case, better than the Ridge when there are a small number of training observations. However when there are a lot of observations the Lasso under-performs. Indeed, the variance-reducing effect of the regularization is less critical in these situations, and the bias becomes too detrimental.\n", 169 | "\n", 170 | "As with any estimator, we should tune the regularization parameter to get the best prediction. For this purpose, we can use the LassoCV object. Note that it is a significantly more computationally costly operation than the RidgeCV. To speed it up, we reduce the number of values explored for the alpha parameter." 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": { 177 | "collapsed": false 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "plot_learning_curve(linear_model.RidgeCV(), X, y)\n", 182 | "plot_learning_curve(linear_model.LassoCV(n_alphas=20), X, y)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "**ElasticNet** sits in between Lasso and Ridge. It has a tuning parameter, l1_ratio, that controls this behavior: when set to 0 (only l2 penalty), ElasticNet is a Ridge, when set to 1 (only l1 penalty), it is a Lasso. It is useful when your coefficients are not that sparse. The sparser the coefficients, the higher we should set l1_ratio. Note that l1_ratio can also be set by cross-validation, although we won't do it here to limit computational cost." 
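For reference, letting cross-validation choose ``l1_ratio`` as well only requires passing a list of candidate values; the sketch below shows how that might look, and is left out of the main flow for the computational-cost reason just mentioned:

```
# Cross-validate both alpha and l1_ratio (noticeably slower than fixing l1_ratio).
from sklearn.linear_model import ElasticNetCV

enet = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99], n_alphas=20)
enet.fit(X, y)
print(enet.l1_ratio_, enet.alpha_)
```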
190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": { 196 | "collapsed": false 197 | }, 198 | "outputs": [], 199 | "source": [ 200 | "plt.figure(figsize=(10, 5))\n", 201 | "plot_learning_curve(linear_model.RidgeCV(), X, y)\n", 202 | "plot_learning_curve(linear_model.ElasticNetCV(l1_ratio=.6, n_alphas=20), X, y)\n", 203 | "plot_learning_curve(linear_model.LassoCV(n_alphas=20), X, y)\n" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "# Exercise\n", 211 | "Find the best linear regression prediction on the `diabetes` dataset, which is available in the scikit-learn datasets." 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "## Linear models for classification" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "All linear models for classification learn a coefficient parameter ``coef_`` and an offset ``intercept_`` to make predictions using a linear combination of features:\n", 226 | "```\n", 227 | "y_pred = x_test[0] * coef_[0] + ... + x_test[n_features-1] * coef_[n_features-1] + intercept_ > 0\n", 228 | "```\n", 229 | "As you can see, this is very similar to regression, only that a threshold at zero is applied.\n", 230 | "\n", 231 | "Again, the difference between the linear models for classification is what kind of regularization is put on ``coef_`` and ``intercept_``, but there are also minor differences in how the fit to the training set is measured (the so-called loss function).\n", 232 | "\n", 233 | "The two most common models for linear classification are the linear SVM as implemented in LinearSVC and LogisticRegression.\n" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "**Regularization**: the linear classifiers can suffer from overfitting in the presence of many features and must be regularized. The 'C' parameter controls that regularization: large C values give an unregularized model, while small C values give strongly regularized models.\n", 241 | "\n", 242 | "A good intuition for regularization of linear classifiers is that with high regularization, it is enough if most of the points are classified correctly. But with less regularization, more importance is given to each individual data point.\n", 243 | "This is illustrated using a linear SVM with different values of ``C`` below.\n" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": { 250 | "collapsed": false 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "from figures import plot_linear_svc_regularization\n", 255 | "plot_linear_svc_regularization()" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "\n", 263 | "Similar to the Ridge/Lasso separation, you can set the 'penalty' parameter to 'l1' to enforce sparsity of the coefficients.\n" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "# Exercise\n", 271 | "Use LogisticRegression to classify digits, and grid-search the C parameter."
272 | ] 273 | } 274 | ], 275 | "metadata": { 276 | "kernelspec": { 277 | "display_name": "Python 2", 278 | "language": "python", 279 | "name": "python2" 280 | }, 281 | "language_info": { 282 | "codemirror_mode": { 283 | "name": "ipython", 284 | "version": 2 285 | }, 286 | "file_extension": ".py", 287 | "mimetype": "text/x-python", 288 | "name": "python", 289 | "nbconvert_exporter": "python", 290 | "pygments_lexer": "ipython2", 291 | "version": "2.7.9" 292 | } 293 | }, 294 | "nbformat": 4, 295 | "nbformat_minor": 0 296 | } 297 | -------------------------------------------------------------------------------- /notebooks/05.2 In Depth - Support Vector Machines.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "import numpy as np\n", 13 | "import matplotlib.pyplot as plt" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# In depth with SVMs: Support Vector Machines" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "SVM stands for \"support vector machines\". They are efficient and easy-to-use estimators.\n", 28 | "They come in two kinds: SVCs, Support Vector Classifiers, for classification problems, and SVRs, Support Vector Regressors, for regression problems." 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## Linear SVMs" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "The SVM module contains LinearSVC, which we already discussed briefly in the section on linear models.\n", 43 | "Using ``SVC(kernel=\"linear\")`` will also yield a linear predictor that is only different in minor technical aspects." 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## Kernel SVMs\n", 51 | "The real power of SVMs lies in using kernels, which allow for non-linear decision boundaries. A kernel defines a similarity measure between data points. The most common are:\n", 52 | "\n", 53 | "- **linear** will give linear decision frontiers. It is the most computationally efficient approach and the one that requires the least amount of data.\n", 54 | "\n", 55 | "- **poly** will give decision frontiers that are polynomial. The degree of this polynomial is given by the 'degree' argument.\n", 56 | "\n", 57 | "- **rbf** uses 'radial basis functions' centered at each support vector to assemble a decision frontier. The size of the RBFs ultimately controls the smoothness of the decision frontier. RBFs are the most flexible approach, but also the one that will require the largest amount of data.\n", 58 | "\n", 59 | "Predictions in a kernel-SVM are made using the formula\n", 60 | "\n", 61 | "$$\n", 62 | "\\hat{y} = \\text{sign}(\\alpha_0 + \\sum_{j}\\alpha_j y_j k(\\mathbf{x^{(j)}}, \\mathbf{x}))\n", 63 | "$$\n", 64 | "\n", 65 | "where $\\mathbf{x}^{(j)}$ are training samples, $y_j$ the corresponding labels, $\\mathbf{x}$ is a test sample to predict on, $k$ is the kernel, and $\\alpha$ are learned parameters.\n", 66 | "\n", 67 | "What this says is \"if $\\mathbf{x}$ is similar to $\\mathbf{x}^{(j)}$ then they probably have the same label\", where the importance of each $\\mathbf{x}^{(j)}$ for this decision is learned.
[Or something much less intuitive about an infinite dimensional Hilbert-space]\n", 68 | "\n", 69 | "Often only few samples have non-zero $\\alpha$, these are called the \"support vectors\" from which SVMs get their name.\n", 70 | "These are the most discriminant samples.\n", 71 | "\n", 72 | "The most important parameter of the SVM is the regularization parameter $C$, which bounds the influence of each individual sample:\n", 73 | "\n", 74 | "- Low C values: many support vectors... Decision frontier = mean(class A) - mean(class B)\n", 75 | "- High C values: small number of support vectors: Decision frontier fully driven by most discriminant samples\n", 76 | "\n" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "The other important parameters are those of the kernel. Let's look at the RBF kernel in more detail:\n", 84 | "\n", 85 | "$$k(\\mathbf{x}, \\mathbf{x'}) = \\exp(-\\gamma ||\\mathbf{x} - \\mathbf{x'}||^2)$$" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "collapsed": false 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "from sklearn.metrics.pairwise import rbf_kernel\n", 97 | "line = np.linspace(-3, 3, 100)[:, np.newaxis]\n", 98 | "kernel_value = rbf_kernel(line, [[0]], gamma=1)\n", 99 | "plt.plot(line, kernel_value)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "The rbf kernel has an inverse bandwidth-parameter gamma, where large gamma mean a very localized influence for each data point, and\n", 107 | "small values mean a very global influence.\n", 108 | "Let's see these two parameters in action:" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "from figures import plot_svm_interactive\n", 120 | "plot_svm_interactive()" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "## Exercise: tune an SVM on the digits dataset" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": { 134 | "collapsed": false 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "from sklearn import datasets\n", 139 | "digits = datasets.load_digits()\n", 140 | "X, y = digits.data, digits.target\n", 141 | "# split the dataset, apply grid-search" 142 | ] 143 | } 144 | ], 145 | "metadata": { 146 | "kernelspec": { 147 | "display_name": "Python 2", 148 | "language": "python", 149 | "name": "python2" 150 | }, 151 | "language_info": { 152 | "codemirror_mode": { 153 | "name": "ipython", 154 | "version": 2 155 | }, 156 | "file_extension": ".py", 157 | "mimetype": "text/x-python", 158 | "name": "python", 159 | "nbconvert_exporter": "python", 160 | "pygments_lexer": "ipython2", 161 | "version": "2.7.9" 162 | } 163 | }, 164 | "nbformat": 4, 165 | "nbformat_minor": 0 166 | } 167 | -------------------------------------------------------------------------------- /notebooks/05.3 In Depth - Trees and Forests.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Estimators In Depth: Trees and Forests" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "%matplotlib inline\n", 19 | "import numpy as 
np\n", 20 | "import matplotlib.pyplot as plt" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "Here we'll explore a class of algorithms based on decision trees.\n", 28 | "Decision trees at their root are extremely intuitive. They\n", 29 | "encode a series of \"if\" and \"else\" choices, similar to how a person might make a decision.\n", 30 | "However, which questions to ask, and how to proceed for each answer is entirely learned from the data.\n", 31 | "\n", 32 | "For example, if you wanted to create a guide to identifying an animal found in nature, you\n", 33 | "might ask the following series of questions:\n", 34 | "\n", 35 | "- Is the animal bigger or smaller than a meter long?\n", 36 | " + *bigger*: does the animal have horns?\n", 37 | " - *yes*: are the horns longer than ten centimeters?\n", 38 | " - *no*: is the animal wearing a collar\n", 39 | " + *smaller*: does the animal have two or four legs?\n", 40 | " - *two*: does the animal have wings?\n", 41 | " - *four*: does the animal have a bushy tail?\n", 42 | "\n", 43 | "and so on. This binary splitting of questions is the essence of a decision tree." 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "One of the main benefit of tree-based models is that they require little preprocessing of the data.\n", 51 | "They can work with variables of different types (continuous and discrete) and are invariant to scaling of the features.\n", 52 | "\n", 53 | "Another benefit is that tree-based models are what is called \"non-parametric\", which means they don't have a fix set of parameters to learn. Instead, a tree model can become more and more flexible, if given more data.\n", 54 | "In other words, the number of free parameters grows with the number of samples and is not fixed, as for example in linear models.\n" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "## Decision Tree Regression" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "A decision tree is a simple binary classification tree that is\n", 69 | "similar to nearest neighbor classification. It can be used as follows:" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": { 76 | "collapsed": false 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "from figures import make_dataset\n", 81 | "x, y = make_dataset()\n", 82 | "X = x.reshape(-1, 1)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "collapsed": false 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "from sklearn.tree import DecisionTreeRegressor\n", 94 | "\n", 95 | "reg = DecisionTreeRegressor(max_depth=5)\n", 96 | "reg.fit(X, y)\n", 97 | "\n", 98 | "X_fit = np.linspace(-3, 3, 1000).reshape((-1, 1))\n", 99 | "y_fit_1 = reg.predict(X_fit)\n", 100 | "\n", 101 | "plt.plot(X_fit.ravel(), y_fit_1, color='blue', label=\"prediction\")\n", 102 | "plt.plot(X.ravel(), y, '.k', label=\"training data\")\n", 103 | "plt.legend(loc=\"best\")" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "A single decision tree allows us to estimate the signal in a non-parametric way,\n", 111 | "but clearly has some issues. 
In some regions, the model shows high bias and\n", 112 | "under-fits the data\n", 113 | "(seen in the long flat lines which don't follow the contours of the data),\n", 114 | "while in other regions the model shows high variance and over-fits the data\n", 115 | "(reflected in the narrow spikes which are influenced by noise in single points)." 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "Decision Tree Classification\n", 123 | "==================\n", 124 | "Decision tree classification works very similarly, by assigning all points within a leaf the majority class in that leaf:\n" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": { 131 | "collapsed": false 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "from sklearn.datasets import make_blobs\n", 136 | "from sklearn.cross_validation import train_test_split\n", 137 | "from sklearn.tree import DecisionTreeClassifier\n", 138 | "from figures import plot_2d_separator\n", 139 | "\n", 140 | "\n", 141 | "X, y = make_blobs(centers=[[0, 0], [1, 1]], random_state=61526, n_samples=100)\n", 142 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)\n", 143 | "\n", 144 | "clf = DecisionTreeClassifier(max_depth=5)\n", 145 | "clf.fit(X_train, y_train)\n", 146 | "\n", 147 | "\n", 148 | "plot_2d_separator(clf, X, fill=True)\n", 149 | "plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=60, alpha=.7)\n", 150 | "plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, s=60)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "There are many parameters that control the complexity of a tree, but the one that might be easiest to understand is the maximum depth. This limits how finely the tree can partition the input space, or how many \"if-else\" questions can be asked before deciding which class a sample lies in.\n", 158 | "\n", 159 | "This parameter is important to tune for trees and tree-based models. The interactive plot below shows what underfitting and overfitting look like for this model. Having a ``max_depth`` of one is clearly an underfit model, while a depth of seven or eight clearly overfits. The maximum depth a tree can be grown at for this dataset is 8, at which point each leaf only contains samples from a single class. This is known as all leaves being \"pure\"." 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": { 166 | "collapsed": false 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "from figures import plot_tree_interactive\n", 171 | "plot_tree_interactive()" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "Decision trees are fast to train, easy to understand, and often lead to interpretable models. However, single trees often tend to overfit the training data. Playing with the slider above you might notice that the model starts to overfit even before it has a good separation between the classes.\n", 179 | "\n", 180 | "Therefore, in practice it is more common to combine multiple trees to produce models that generalize better.
The most common methods for combining trees are random forests and gradient boosted trees.\n" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "## Random Forests" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "Random forests are simply many trees, built on different random subsets of the data, and using different random subsets of the features for each split.\n", 195 | "This makes the trees different from each other, and makes them overfit to different aspects. Then, their predictions are averaged, leading to a smoother estimate that overfits less.\n" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | "collapsed": false 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "from figures import plot_forest_interactive\n", 207 | "plot_forest_interactive()" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "## Selecting the Optimal Estimator via Cross-Validation" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": { 221 | "collapsed": false 222 | }, 223 | "outputs": [], 224 | "source": [ 225 | "from sklearn import grid_search\n", 226 | "from sklearn.datasets import load_digits\n", 227 | "from sklearn.ensemble import RandomForestClassifier\n", 228 | "\n", 229 | "digits = load_digits()\n", 230 | "X, y = digits.data, digits.target\n", 231 | "\n", 232 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)\n", 233 | "\n", 234 | "rf = RandomForestClassifier(n_estimators=200)\n", 235 | "parameters = {'max_features':['sqrt', 'log2', 10],\n", 236 | " 'max_depth':[5, 7, 9]}\n", 237 | "\n", 238 | "clf_grid = grid_search.GridSearchCV(rf, parameters, n_jobs=-1)\n", 239 | "clf_grid.fit(X_train, y_train)" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": { 246 | "collapsed": false 247 | }, 248 | "outputs": [], 249 | "source": [ 250 | "clf_grid.score(X_train, y_train)" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": { 257 | "collapsed": false 258 | }, 259 | "outputs": [], 260 | "source": [ 261 | "clf_grid.score(X_test, y_test)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": {}, 267 | "source": [ 268 | "## Another option: Gradient Boosting" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": {}, 274 | "source": [ 275 | "Another Ensemble method that can be useful is *Boosting*: here, rather than\n", 276 | "looking at 200 (say) parallel estimators, We construct a chain of 200 estimators\n", 277 | "which iteratively refine the results of the previous estimator.\n", 278 | "The idea is that by sequentially applying very fast, simple models, we can get a\n", 279 | "total model error which is better than any of the individual pieces." 
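One caveat about the cell that follows: the digits targets are discrete class labels, and ``GradientBoostingRegressor`` treats them as continuous values, so its ``score`` is an R² value rather than an accuracy. A classification treatment would use the classifier variant instead; a possible sketch (an alternative added here, not what the original cell runs) is:

```
# Classification counterpart of the boosting example, on the digits split above.
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(n_estimators=100, max_depth=3, learning_rate=.2)
gbc.fit(X_train, y_train)

print(gbc.score(X_train, y_train))  # accuracy on the training set
print(gbc.score(X_test, y_test))    # accuracy on the test set
```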
280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": { 286 | "collapsed": false 287 | }, 288 | "outputs": [], 289 | "source": [ 290 | "from sklearn.ensemble import GradientBoostingRegressor\n", 291 | "clf = GradientBoostingRegressor(n_estimators=100, max_depth=5, learning_rate=.2)\n", 292 | "clf.fit(X_train, y_train)\n", 293 | "\n", 294 | "print(clf.score(X_train, y_train))\n", 295 | "print(clf.score(X_test, y_test))" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": {}, 301 | "source": [ 302 | "## Exercise: Cross-validating Gradient Boosting" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "Use a grid search to optimize the learning rate and max_depth for a gradient boosted\n", 310 | "decision tree." 311 | ] 312 | } 313 | ], 314 | "metadata": { 315 | "kernelspec": { 316 | "display_name": "Python 2", 317 | "language": "python", 318 | "name": "python2" 319 | }, 320 | "language_info": { 321 | "codemirror_mode": { 322 | "name": "ipython", 323 | "version": 2 324 | }, 325 | "file_extension": ".py", 326 | "mimetype": "text/x-python", 327 | "name": "python", 328 | "nbconvert_exporter": "python", 329 | "pygments_lexer": "ipython2", 330 | "version": "2.7.9" 331 | } 332 | }, 333 | "nbformat": 4, 334 | "nbformat_minor": 0 335 | } 336 | -------------------------------------------------------------------------------- /notebooks/06.1 Pipelining Estimators.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "import numpy as np\n", 13 | "import matplotlib.pyplot as plt" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# Pipelining estimators" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "In this section we study how different estimators may be chained." 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## A simple example: feature extraction and selection before an estimator" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "### Feature extraction: vectorizer" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "For some types of data, for instance text data, a feature extraction step must be applied to convert it to numerical features.\n", 49 | "To illustrate this, we load the SMS spam dataset we used earlier." 
50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "collapsed": false 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "import os\n", 61 | "with open(os.path.join(\"datasets\", \"smsspam\", \"SMSSpamCollection\")) as f:\n", 62 | " lines = [line.strip().split(\"\\t\") for line in f.readlines()]\n", 63 | "text = [x[1] for x in lines]\n", 64 | "y = [x[0] == \"ham\" for x in lines]" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": { 71 | "collapsed": true 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "from sklearn.cross_validation import train_test_split\n", 76 | "text_train, text_test, y_train, y_test = train_test_split(text, y)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "Previously, we applied the feature extraction manually, like so:" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "collapsed": false 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 95 | "from sklearn.linear_model import LogisticRegression\n", 96 | "\n", 97 | "vectorizer = TfidfVectorizer()\n", 98 | "vectorizer.fit(text_train)\n", 99 | "\n", 100 | "X_train = vectorizer.transform(text_train)\n", 101 | "X_test = vectorizer.transform(text_test)\n", 102 | "\n", 103 | "clf = LogisticRegression()\n", 104 | "clf.fit(X_train, y_train)\n", 105 | "\n", 106 | "clf.score(X_test, y_test)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "The situation where we learn a transformation and then apply it to the test data is very common in machine learning.\n", 114 | "Therefore scikit-learn has a shortcut for this, called pipelines:" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": { 121 | "collapsed": false 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "from sklearn.pipeline import make_pipeline\n", 126 | "pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression())\n", 127 | "pipeline.fit(text_train, y_train)\n", 128 | "pipeline.score(text_test, y_test)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "As you can see, this makes the code much shorter and easier to handle. Behind the scenes, exactly the same as above is happening. When calling fit on the pipeline, it will call fit on each step in turn.\n", 136 | "After the first step is fit, it will use the ``transform`` method of the first step to create a new representation.\n", 137 | "This will then be fed to the ``fit`` of the next step, and so on.\n", 138 | "Finally, on the last step, only ``fit`` is called.\n", 139 | "\n", 140 | "If we call ``score``, only ``transform`` will be called on each step - this could be the test set after all! Then, on the last step, ``score`` is called with the new representation. The same goes for ``predict``." 
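Because the fitting happened inside the pipeline, the fitted steps stay accessible afterwards through ``pipeline.steps``, a list of ``(name, estimator)`` tuples. A small illustrative check (assuming the pipeline above has already been fit):

# the first step holds the fitted TfidfVectorizer, the last the fitted LogisticRegression
step_name, fitted_vectorizer = pipeline.steps[0]
print(step_name)                           # 'tfidfvectorizer'
print(len(fitted_vectorizer.vocabulary_))  # size of the vocabulary learned from text_train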
141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "Building pipelines not only simplifies the code, it is also important for model selection.\n", 148 | "Say we want to grid-search C to tune our Logistic Regression above.\n", 149 | "\n", 150 | "Let's say we do it like this:" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": { 157 | "collapsed": false 158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "# this illustrates a common mistake. Don't use this code!\n", 162 | "from sklearn.grid_search import GridSearchCV\n", 163 | "\n", 164 | "vectorizer = TfidfVectorizer()\n", 165 | "vectorizer.fit(text_train)\n", 166 | "\n", 167 | "X_train = vectorizer.transform(text_train)\n", 168 | "X_test = vectorizer.transform(text_test)\n", 169 | "\n", 170 | "clf = LogisticRegression()\n", 171 | "grid = GridSearchCV(clf, param_grid={'C': [.1, 1, 10, 100]}, cv=5)\n", 172 | "grid.fit(X_train, y_train)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "### What did we do wrong?" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "Here, we did grid-search with cross-validation on ``X_train``. However, when applying ``TfidfVectorizer``, it saw all of the ``X_train``,\n", 187 | "not only the training folds! So it could use knowledge of the frequency of the words in the test-folds. This is called \"contamination\" of the test set, and leads to overly optimistic estimates of generalization performance, or badly selected parameters.\n", 188 | "We can fix this with the pipeline, though:" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": { 195 | "collapsed": false 196 | }, 197 | "outputs": [], 198 | "source": [ 199 | "from sklearn.grid_search import GridSearchCV\n", 200 | "\n", 201 | "pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression())\n", 202 | "\n", 203 | "grid = GridSearchCV(pipeline,\n", 204 | " param_grid={'logisticregression__C': [.1, 1, 10, 100]}, cv=5)\n", 205 | "grid.fit(text_train, y_train)\n", 206 | "grid.score(text_test, y_test)" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "Note that we need to tell the pipeline at which step we want to set the parameter ``C``.\n", 214 | "We can do this using the special ``__`` syntax. The name before the ``__`` is simply the name of the pipeline step (with ``make_pipeline``, the lowercased class name), and the part after ``__`` is the parameter we want to set with grid-search." 
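If you are unsure which parameter names are available, the pipeline can list them itself via ``get_params()``; a quick illustrative check:

# every key of the form '<stepname>__<parameter>' can be used in a param_grid,
# e.g. 'logisticregression__C' or 'tfidfvectorizer__ngram_range'
print(sorted(pipeline.get_params().keys()))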
215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "Another benefit of using pipelines is that we can now also search over parameters of the feature extraction with ``GridSearchCV``:" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": { 235 | "collapsed": false 236 | }, 237 | "outputs": [], 238 | "source": [ 239 | "from sklearn.grid_search import GridSearchCV\n", 240 | "\n", 241 | "pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression())\n", 242 | "\n", 243 | "params = {'logisticregression__C': [.1, 1, 10, 100],\n", 244 | " \"tfidfvectorizer__ngram_range\": [(1, 1), (1, 2), (2, 2)]}\n", 245 | "grid = GridSearchCV(pipeline, param_grid=params, cv=5)\n", 246 | "grid.fit(text_train, y_train)\n", 247 | "print(grid.best_params_)\n", 248 | "grid.score(text_test, y_test)" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "## Exercise" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "On the 'labeled faces in the wild' (datasets.fetch_lfw_people) chain a randomized PCA with an SVC for prediction" 263 | ] 264 | } 265 | ], 266 | "metadata": { 267 | "kernelspec": { 268 | "display_name": "Python 2", 269 | "language": "python", 270 | "name": "python2" 271 | }, 272 | "language_info": { 273 | "codemirror_mode": { 274 | "name": "ipython", 275 | "version": 2 276 | }, 277 | "file_extension": ".py", 278 | "mimetype": "text/x-python", 279 | "name": "python", 280 | "nbconvert_exporter": "python", 281 | "pygments_lexer": "ipython2", 282 | "version": "2.7.6" 283 | } 284 | }, 285 | "nbformat": 4, 286 | "nbformat_minor": 0 287 | } 288 | -------------------------------------------------------------------------------- /notebooks/datasets/smsspam/readme: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/scipy_2015_sklearn_tutorial/be0a35724b9f0f0b92ea6c325ad5137176585b1c/notebooks/datasets/smsspam/readme -------------------------------------------------------------------------------- /notebooks/figures/ML_flow_chart.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tutorial Diagrams 3 | ----------------- 4 | 5 | This script plots the flow-charts used in the scikit-learn tutorials. 
6 | """ 7 | 8 | import matplotlib.pyplot as plt 9 | from matplotlib.patches import Circle, Rectangle, Polygon, FancyArrow 10 | 11 | 12 | def create_base(box_bg='#CCCCCC', 13 | arrow1='#88CCFF', 14 | arrow2='#88FF88', 15 | supervised=True): 16 | plt.figure(figsize=(9, 6), facecolor='w') 17 | ax = plt.axes((0, 0, 1, 1), xticks=[], yticks=[], frameon=False) 18 | ax.set_xlim(0, 9) 19 | ax.set_ylim(0, 6) 20 | 21 | patches = [Rectangle((0.3, 3.6), 1.5, 1.8, zorder=1, fc=box_bg), 22 | Rectangle((0.5, 3.8), 1.5, 1.8, zorder=2, fc=box_bg), 23 | Rectangle((0.7, 4.0), 1.5, 1.8, zorder=3, fc=box_bg), 24 | 25 | Rectangle((2.9, 3.6), 0.2, 1.8, fc=box_bg), 26 | Rectangle((3.1, 3.8), 0.2, 1.8, fc=box_bg), 27 | Rectangle((3.3, 4.0), 0.2, 1.8, fc=box_bg), 28 | 29 | Rectangle((0.3, 0.2), 1.5, 1.8, fc=box_bg), 30 | 31 | Rectangle((2.9, 0.2), 0.2, 1.8, fc=box_bg), 32 | 33 | Circle((5.5, 3.5), 1.0, fc=box_bg), 34 | 35 | Polygon([[5.5, 1.7], 36 | [6.1, 1.1], 37 | [5.5, 0.5], 38 | [4.9, 1.1]], fc=box_bg), 39 | 40 | FancyArrow(2.3, 4.6, 0.35, 0, fc=arrow1, 41 | width=0.25, head_width=0.5, head_length=0.2), 42 | 43 | FancyArrow(3.75, 4.2, 0.5, -0.2, fc=arrow1, 44 | width=0.25, head_width=0.5, head_length=0.2), 45 | 46 | FancyArrow(5.5, 2.4, 0, -0.4, fc=arrow1, 47 | width=0.25, head_width=0.5, head_length=0.2), 48 | 49 | FancyArrow(2.0, 1.1, 0.5, 0, fc=arrow2, 50 | width=0.25, head_width=0.5, head_length=0.2), 51 | 52 | FancyArrow(3.3, 1.1, 1.3, 0, fc=arrow2, 53 | width=0.25, head_width=0.5, head_length=0.2), 54 | 55 | FancyArrow(6.2, 1.1, 0.8, 0, fc=arrow2, 56 | width=0.25, head_width=0.5, head_length=0.2)] 57 | 58 | if supervised: 59 | patches += [Rectangle((0.3, 2.4), 1.5, 0.5, zorder=1, fc=box_bg), 60 | Rectangle((0.5, 2.6), 1.5, 0.5, zorder=2, fc=box_bg), 61 | Rectangle((0.7, 2.8), 1.5, 0.5, zorder=3, fc=box_bg), 62 | FancyArrow(2.3, 2.9, 2.0, 0, fc=arrow1, 63 | width=0.25, head_width=0.5, head_length=0.2), 64 | Rectangle((7.3, 0.85), 1.5, 0.5, fc=box_bg)] 65 | else: 66 | patches += [Rectangle((7.3, 0.2), 1.5, 1.8, fc=box_bg)] 67 | 68 | for p in patches: 69 | ax.add_patch(p) 70 | 71 | plt.text(1.45, 4.9, "Training\nText,\nDocuments,\nImages,\netc.", 72 | ha='center', va='center', fontsize=14) 73 | 74 | plt.text(3.6, 4.9, "Feature\nVectors", 75 | ha='left', va='center', fontsize=14) 76 | 77 | plt.text(5.5, 3.5, "Machine\nLearning\nAlgorithm", 78 | ha='center', va='center', fontsize=14) 79 | 80 | plt.text(1.05, 1.1, "New Text,\nDocument,\nImage,\netc.", 81 | ha='center', va='center', fontsize=14) 82 | 83 | plt.text(3.3, 1.7, "Feature\nVector", 84 | ha='left', va='center', fontsize=14) 85 | 86 | plt.text(5.5, 1.1, "Predictive\nModel", 87 | ha='center', va='center', fontsize=12) 88 | 89 | if supervised: 90 | plt.text(1.45, 3.05, "Labels", 91 | ha='center', va='center', fontsize=14) 92 | 93 | plt.text(8.05, 1.1, "Expected\nLabel", 94 | ha='center', va='center', fontsize=14) 95 | plt.text(8.8, 5.8, "Supervised Learning Model", 96 | ha='right', va='top', fontsize=18) 97 | 98 | else: 99 | plt.text(8.05, 1.1, 100 | "Likelihood\nor Cluster ID\nor Better\nRepresentation", 101 | ha='center', va='center', fontsize=12) 102 | plt.text(8.8, 5.8, "Unsupervised Learning Model", 103 | ha='right', va='top', fontsize=18) 104 | 105 | 106 | def plot_supervised_chart(annotate=False): 107 | create_base(supervised=True) 108 | if annotate: 109 | fontdict = dict(color='r', weight='bold', size=14) 110 | plt.text(1.9, 4.55, 'X = vec.fit_transform(input)', 111 | fontdict=fontdict, 112 | rotation=20, ha='left', va='bottom') 113 | 
plt.text(3.7, 3.2, 'clf.fit(X, y)', 114 | fontdict=fontdict, 115 | rotation=20, ha='left', va='bottom') 116 | plt.text(1.7, 1.5, 'X_new = vec.transform(input)', 117 | fontdict=fontdict, 118 | rotation=20, ha='left', va='bottom') 119 | plt.text(6.1, 1.5, 'y_new = clf.predict(X_new)', 120 | fontdict=fontdict, 121 | rotation=20, ha='left', va='bottom') 122 | 123 | 124 | def plot_unsupervised_chart(): 125 | create_base(supervised=False) 126 | 127 | 128 | if __name__ == '__main__': 129 | plot_supervised_chart(False) 130 | plot_supervised_chart(True) 131 | plot_unsupervised_chart() 132 | plt.show() 133 | -------------------------------------------------------------------------------- /notebooks/figures/__init__.py: -------------------------------------------------------------------------------- 1 | from .plot_2d_separator import plot_2d_separator 2 | from .plot_kneighbors_regularization import plot_kneighbors_regularization, \ 3 | plot_regression_datasets, make_dataset 4 | from .plot_linear_svc_regularization import plot_linear_svc_regularization 5 | from .plot_interactive_tree import plot_tree_interactive 6 | from .plot_interactive_forest import plot_forest_interactive 7 | from .plot_rbf_svm_parameters import plot_rbf_svm_parameters 8 | from .plot_rbf_svm_parameters import plot_svm_interactive 9 | 10 | __all__ = ['plot_2d_separator', 'plot_kneighbors_regularization', 11 | 'plot_linear_svc_regularization', 'plot_tree_interactive', 12 | 'plot_regression_datasets', 'make_dataset', 13 | "plot_forest_interactive", "plot_rbf_svm_parameters", 14 | "plot_svm_interactive"] 15 | -------------------------------------------------------------------------------- /notebooks/figures/cluster_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/scipy_2015_sklearn_tutorial/be0a35724b9f0f0b92ea6c325ad5137176585b1c/notebooks/figures/cluster_comparison.png -------------------------------------------------------------------------------- /notebooks/figures/iris_setosa.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/scipy_2015_sklearn_tutorial/be0a35724b9f0f0b92ea6c325ad5137176585b1c/notebooks/figures/iris_setosa.jpg -------------------------------------------------------------------------------- /notebooks/figures/iris_versicolor.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/scipy_2015_sklearn_tutorial/be0a35724b9f0f0b92ea6c325ad5137176585b1c/notebooks/figures/iris_versicolor.jpg -------------------------------------------------------------------------------- /notebooks/figures/iris_virginica.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/scipy_2015_sklearn_tutorial/be0a35724b9f0f0b92ea6c325ad5137176585b1c/notebooks/figures/iris_virginica.jpg -------------------------------------------------------------------------------- /notebooks/figures/petal_sepal.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/scipy_2015_sklearn_tutorial/be0a35724b9f0f0b92ea6c325ad5137176585b1c/notebooks/figures/petal_sepal.jpg -------------------------------------------------------------------------------- /notebooks/figures/plot_2d_separator.py: -------------------------------------------------------------------------------- 1 | import 
numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | def plot_2d_separator(classifier, X, fill=False, ax=None, eps=None): 6 | if eps is None: 7 | eps = X.std() / 2. 8 | x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps 9 | y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps 10 | xx = np.linspace(x_min, x_max, 100) 11 | yy = np.linspace(y_min, y_max, 100) 12 | 13 | X1, X2 = np.meshgrid(xx, yy) 14 | X_grid = np.c_[X1.ravel(), X2.ravel()] 15 | try: 16 | decision_values = classifier.decision_function(X_grid) 17 | levels = [0] 18 | fill_levels = [decision_values.min(), 0, decision_values.max()] 19 | except AttributeError: 20 | # no decision_function 21 | decision_values = classifier.predict_proba(X_grid)[:, 1] 22 | levels = [.5] 23 | fill_levels = [0, .5, 1] 24 | 25 | if ax is None: 26 | ax = plt.gca() 27 | if fill: 28 | ax.contourf(X1, X2, decision_values.reshape(X1.shape), 29 | levels=fill_levels, colors=['blue', 'red']) 30 | else: 31 | ax.contour(X1, X2, decision_values.reshape(X1.shape), levels=levels, 32 | colors="black") 33 | ax.set_xlim(x_min, x_max) 34 | ax.set_ylim(y_min, y_max) 35 | ax.set_xticks(()) 36 | ax.set_yticks(()) 37 | 38 | 39 | if __name__ == '__main__': 40 | from sklearn.datasets import make_blobs 41 | from sklearn.linear_model import LogisticRegression 42 | X, y = make_blobs(centers=2, random_state=42) 43 | clf = LogisticRegression().fit(X, y) 44 | plot_2d_separator(clf, X, fill=True) 45 | plt.scatter(X[:, 0], X[:, 1], c=y) 46 | plt.show() 47 | -------------------------------------------------------------------------------- /notebooks/figures/plot_digits_datasets.py: -------------------------------------------------------------------------------- 1 | # Taken from example in scikit-learn examples 2 | # Authors: Fabian Pedregosa 3 | # Olivier Grisel 4 | # Mathieu Blondel 5 | # Gael Varoquaux 6 | # License: BSD 3 clause (C) INRIA 2011 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | from matplotlib import offsetbox 11 | from sklearn import (manifold, datasets, decomposition, ensemble, lda, 12 | random_projection) 13 | 14 | def digits_plot(): 15 | digits = datasets.load_digits(n_class=6) 16 | n_digits = 500 17 | X = digits.data[:n_digits] 18 | y = digits.target[:n_digits] 19 | n_samples, n_features = X.shape 20 | n_neighbors = 30 21 | 22 | def plot_embedding(X, title=None): 23 | x_min, x_max = np.min(X, 0), np.max(X, 0) 24 | X = (X - x_min) / (x_max - x_min) 25 | 26 | plt.figure() 27 | ax = plt.subplot(111) 28 | for i in range(X.shape[0]): 29 | plt.text(X[i, 0], X[i, 1], str(digits.target[i]), 30 | color=plt.cm.Set1(y[i] / 10.), 31 | fontdict={'weight': 'bold', 'size': 9}) 32 | 33 | if hasattr(offsetbox, 'AnnotationBbox'): 34 | # only print thumbnails with matplotlib > 1.0 35 | shown_images = np.array([[1., 1.]]) # just something big 36 | for i in range(X.shape[0]): 37 | dist = np.sum((X[i] - shown_images) ** 2, 1) 38 | if np.min(dist) < 1e5: 39 | # don't show points that are too close 40 | # set a high threshold to basically turn this off 41 | continue 42 | shown_images = np.r_[shown_images, [X[i]]] 43 | imagebox = offsetbox.AnnotationBbox( 44 | offsetbox.OffsetImage(digits.images[i], cmap=plt.cm.gray_r), 45 | X[i]) 46 | ax.add_artist(imagebox) 47 | plt.xticks([]), plt.yticks([]) 48 | if title is not None: 49 | plt.title(title) 50 | 51 | n_img_per_row = 10 52 | img = np.zeros((10 * n_img_per_row, 10 * n_img_per_row)) 53 | for i in range(n_img_per_row): 54 | ix = 10 * i + 1 55 | for j in range(n_img_per_row): 56 | iy = 10 * j + 1 57 | 
img[ix:ix + 8, iy:iy + 8] = X[i * n_img_per_row + j].reshape((8, 8)) 58 | 59 | plt.imshow(img, cmap=plt.cm.binary) 60 | plt.xticks([]) 61 | plt.yticks([]) 62 | plt.title('A selection from the 64-dimensional digits dataset') 63 | print("Computing PCA projection") 64 | pca = decomposition.PCA(n_components=2).fit(X) 65 | X_pca = pca.transform(X) 66 | plot_embedding(X_pca, "Principal Components projection of the digits") 67 | plt.figure() 68 | plt.matshow(pca.components_[0, :].reshape(8, 8), cmap="gray") 69 | plt.axis('off') 70 | plt.figure() 71 | plt.matshow(pca.components_[1, :].reshape(8, 8), cmap="gray") 72 | plt.axis('off') 73 | plt.show() 74 | -------------------------------------------------------------------------------- /notebooks/figures/plot_interactive_forest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from sklearn.datasets import make_blobs 5 | from sklearn.ensemble import RandomForestClassifier 6 | 7 | 8 | X, y = make_blobs(centers=[[0, 0], [1, 1]], random_state=61526, n_samples=50) 9 | 10 | 11 | def plot_forest(max_depth=1): 12 | plt.figure() 13 | ax = plt.gca() 14 | h = 0.02 15 | 16 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 17 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 18 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) 19 | 20 | if max_depth != 0: 21 | forest = RandomForestClassifier(n_estimators=20, max_depth=max_depth, 22 | random_state=1).fit(X, y) 23 | Z = forest.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] 24 | Z = Z.reshape(xx.shape) 25 | ax.contourf(xx, yy, Z, alpha=.4) 26 | ax.set_title("max_depth = %d" % max_depth) 27 | else: 28 | ax.set_title("data set") 29 | ax.scatter(X[:, 0], X[:, 1], c=np.array(['b', 'r'])[y], s=60) 30 | ax.set_xlim(x_min, x_max) 31 | ax.set_ylim(y_min, y_max) 32 | ax.set_xticks(()) 33 | ax.set_yticks(()) 34 | 35 | 36 | def plot_forest_interactive(): 37 | from IPython.html.widgets import interactive, IntSlider 38 | slider = IntSlider(min=0, max=8, step=1, value=0) 39 | return interactive(plot_forest, max_depth=slider) 40 | -------------------------------------------------------------------------------- /notebooks/figures/plot_interactive_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from sklearn.datasets import make_blobs 5 | from sklearn.tree import DecisionTreeClassifier 6 | 7 | from sklearn.externals.six import StringIO # doctest: +SKIP 8 | from sklearn.tree import export_graphviz 9 | from scipy.misc import imread 10 | from scipy import ndimage 11 | 12 | import re 13 | 14 | X, y = make_blobs(centers=[[0, 0], [1, 1]], random_state=61526, n_samples=50) 15 | 16 | 17 | def tree_image(tree, fout=None): 18 | try: 19 | import pydot 20 | except ImportError: 21 | # make a hacky white plot 22 | x = np.ones((10, 10)) 23 | x[0, 0] = 0 24 | return x 25 | dot_data = StringIO() 26 | export_graphviz(tree, out_file=dot_data) 27 | data = re.sub(r"gini = 0\.[0-9]+\\n", "", dot_data.getvalue()) 28 | data = re.sub(r"samples = [0-9]+\\n", "", data) 29 | data = re.sub(r"\\nsamples = [0-9]+", "", data) 30 | 31 | graph = pydot.graph_from_dot_data(data) 32 | if fout is None: 33 | fout = "tmp.png" 34 | graph.write_png(fout) 35 | return imread(fout) 36 | 37 | 38 | def plot_tree(max_depth=1): 39 | fig, ax = plt.subplots(1, 2, figsize=(15, 7)) 40 | h = 0.02 41 | 42 | x_min, x_max = X[:, 0].min() 
- .5, X[:, 0].max() + .5 43 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 44 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) 45 | 46 | if max_depth != 0: 47 | tree = DecisionTreeClassifier(max_depth=max_depth, random_state=1).fit(X, y) 48 | Z = tree.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] 49 | Z = Z.reshape(xx.shape) 50 | faces = tree.tree_.apply(np.c_[xx.ravel(), yy.ravel()].astype(np.float32)) 51 | faces = faces.reshape(xx.shape) 52 | border = ndimage.laplace(faces) != 0 53 | ax[0].contourf(xx, yy, Z, alpha=.4) 54 | ax[0].scatter(xx[border], yy[border], marker='.', s=1) 55 | ax[0].set_title("max_depth = %d" % max_depth) 56 | ax[1].imshow(tree_image(tree)) 57 | ax[1].axis("off") 58 | else: 59 | ax[0].set_title("data set") 60 | ax[1].set_visible(False) 61 | ax[0].scatter(X[:, 0], X[:, 1], c=np.array(['b', 'r'])[y], s=60) 62 | ax[0].set_xlim(x_min, x_max) 63 | ax[0].set_ylim(y_min, y_max) 64 | ax[0].set_xticks(()) 65 | ax[0].set_yticks(()) 66 | 67 | 68 | def plot_tree_interactive(): 69 | from IPython.html.widgets import interactive, IntSlider 70 | slider = IntSlider(min=0, max=8, step=1, value=0) 71 | return interactive(plot_tree, max_depth=slider) 72 | -------------------------------------------------------------------------------- /notebooks/figures/plot_kneighbors_regularization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from sklearn.neighbors import KNeighborsRegressor 5 | 6 | 7 | def make_dataset(n_samples=100): 8 | rnd = np.random.RandomState(42) 9 | x = np.linspace(-3, 3, n_samples) 10 | y_no_noise = np.sin(4 * x) + x 11 | y = y_no_noise + rnd.normal(size=len(x)) 12 | return x, y 13 | 14 | 15 | def plot_regression_datasets(): 16 | fig, axes = plt.subplots(1, 3, figsize=(15, 5)) 17 | for n_samples, ax in zip([10, 100, 1000], axes): 18 | x, y = make_dataset(n_samples) 19 | ax.plot(x, y, 'o', alpha=.6) 20 | 21 | 22 | def plot_kneighbors_regularization(): 23 | rnd = np.random.RandomState(42) 24 | x = np.linspace(-3, 3, 100) 25 | y_no_noise = np.sin(4 * x) + x 26 | y = y_no_noise + rnd.normal(size=len(x)) 27 | X = x[:, np.newaxis] 28 | fig, axes = plt.subplots(1, 3, figsize=(15, 5)) 29 | 30 | x_test = np.linspace(-3, 3, 1000) 31 | 32 | for n_neighbors, ax in zip([2, 5, 20], axes.ravel()): 33 | kneighbor_regression = KNeighborsRegressor(n_neighbors=n_neighbors) 34 | kneighbor_regression.fit(X, y) 35 | ax.plot(x, y_no_noise, label="true function") 36 | ax.plot(x, y, "o", label="data") 37 | ax.plot(x_test, kneighbor_regression.predict(x_test[:, np.newaxis]), 38 | label="prediction") 39 | ax.legend() 40 | ax.set_title("n_neighbors = %d" % n_neighbors) 41 | 42 | if __name__ == "__main__": 43 | plot_kneighbors_regularization() 44 | plt.show() 45 | -------------------------------------------------------------------------------- /notebooks/figures/plot_linear_svc_regularization.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn.svm import SVC 4 | from sklearn.datasets import make_blobs 5 | from .plot_2d_separator import plot_2d_separator 6 | 7 | 8 | def plot_linear_svc_regularization(): 9 | X, y = make_blobs(centers=2, random_state=4, n_samples=30) 10 | # a carefully hand-designed dataset lol 11 | y[7] = 0 12 | y[27] = 0 13 | 14 | fig, axes = plt.subplots(1, 3, figsize=(12, 4)) 15 | 16 | for ax, C in zip(axes, [1e-2, 1, 1e2]): 17 
| ax.scatter(X[:, 0], X[:, 1], s=150, c=np.array(['red', 'blue'])[y]) 18 | 19 | svm = SVC(kernel='linear', C=C).fit(X, y) 20 | plot_2d_separator(svm, X, ax=ax, eps=.5) 21 | ax.set_title("C = %f" % C) 22 | -------------------------------------------------------------------------------- /notebooks/figures/plot_rbf_svm_parameters.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn.svm import SVC 4 | from sklearn.datasets import make_blobs 5 | from .plot_2d_separator import plot_2d_separator 6 | 7 | 8 | def make_handcrafted_dataset(): 9 | # a carefully hand-designed dataset lol 10 | X, y = make_blobs(centers=2, random_state=4, n_samples=30) 11 | y[np.array([7, 27])] = 0 12 | mask = np.ones(len(X), dtype=np.bool) 13 | mask[np.array([0, 1, 5, 26])] = 0 14 | X, y = X[mask], y[mask] 15 | return X, y 16 | 17 | 18 | def plot_rbf_svm_parameters(): 19 | X, y = make_handcrafted_dataset() 20 | 21 | fig, axes = plt.subplots(1, 3, figsize=(12, 4)) 22 | for ax, C in zip(axes, [1e0, 5, 10, 100]): 23 | ax.scatter(X[:, 0], X[:, 1], s=150, c=np.array(['red', 'blue'])[y]) 24 | 25 | svm = SVC(kernel='rbf', C=C).fit(X, y) 26 | plot_2d_separator(svm, X, ax=ax, eps=.5) 27 | ax.set_title("C = %f" % C) 28 | 29 | fig, axes = plt.subplots(1, 4, figsize=(15, 3)) 30 | for ax, gamma in zip(axes, [0.1, .5, 1, 10]): 31 | ax.scatter(X[:, 0], X[:, 1], s=150, c=np.array(['red', 'blue'])[y]) 32 | svm = SVC(gamma=gamma, kernel='rbf', C=1).fit(X, y) 33 | plot_2d_separator(svm, X, ax=ax, eps=.5) 34 | ax.set_title("gamma = %f" % gamma) 35 | 36 | 37 | def plot_svm(log_C, log_gamma): 38 | X, y = make_handcrafted_dataset() 39 | C = 10. ** log_C 40 | gamma = 10. ** log_gamma 41 | svm = SVC(kernel='rbf', C=C, gamma=gamma).fit(X, y) 42 | ax = plt.gca() 43 | plot_2d_separator(svm, X, ax=ax, eps=.5) 44 | # plot data 45 | ax.scatter(X[:, 0], X[:, 1], s=150, c=np.array(['red', 'blue'])[y]) 46 | # plot support vectors 47 | sv = svm.support_vectors_ 48 | ax.scatter(sv[:, 0], sv[:, 1], s=230, facecolors='none', zorder=10, linewidth=3) 49 | ax.set_title("C = %.4f gamma = %.4f" % (C, gamma)) 50 | 51 | 52 | def plot_svm_interactive(): 53 | from IPython.html.widgets import interactive, FloatSlider 54 | C_slider = FloatSlider(min=-3, max=3, step=.1, value=0, readout=False) 55 | gamma_slider = FloatSlider(min=-2, max=2, step=.1, value=0, readout=False) 56 | return interactive(plot_svm, log_C=C_slider, log_gamma=gamma_slider) 57 | -------------------------------------------------------------------------------- /notebooks/figures/randomized_search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/scipy_2015_sklearn_tutorial/be0a35724b9f0f0b92ea6c325ad5137176585b1c/notebooks/figures/randomized_search.png -------------------------------------------------------------------------------- /notebooks/figures/supervised_scikit_learn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/scipy_2015_sklearn_tutorial/be0a35724b9f0f0b92ea6c325ad5137176585b1c/notebooks/figures/supervised_scikit_learn.png -------------------------------------------------------------------------------- /notebooks/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import defaultdict 3 | import os 4 | from sklearn.cross_validation import 
StratifiedShuffleSplit 5 | from sklearn.feature_extraction import DictVectorizer 6 | 7 | # Can also use pandas! 8 | def process_titanic_line(line): 9 | # Split line on "," to get fields without comma confusion 10 | vals = line.strip().split('",') 11 | # replace spurious " characters 12 | vals = [v.replace('"', '') for v in vals] 13 | pclass = int(vals[0]) 14 | survived = int(vals[1]) 15 | name = str(vals[2]) 16 | sex = str(vals[3]) 17 | try: 18 | age = float(vals[4]) 19 | except ValueError: 20 | # Blank age 21 | age = -1 22 | sibsp = float(vals[5]) 23 | parch = int(vals[6]) 24 | ticket = str(vals[7]) 25 | try: 26 | fare = float(vals[8]) 27 | except ValueError: 28 | # Blank fare 29 | fare = -1 30 | cabin = str(vals[9]) 31 | embarked = str(vals[10]) 32 | boat = str(vals[11]) 33 | homedest = str(vals[12]) 34 | line_dict = {'pclass': pclass, 'survived': survived, 'name': name, 'sex': sex, 'age': age, 'sibsp': sibsp, 35 | 'parch': parch, 'ticket': ticket, 'fare': fare, 'cabin': cabin, 'embarked': embarked, 36 | 'boat': boat, 'homedest': homedest} 37 | return line_dict 38 | 39 | 40 | def load_titanic(test_size=.25, feature_skip_tuple=(), random_state=1999): 41 | f = open(os.path.join('datasets', 'titanic', 'titanic3.csv')) 42 | # Remove . from home.dest, split on quotes because some fields have commas 43 | keys = f.readline().strip().replace('.', '').split('","') 44 | lines = f.readlines() 45 | f.close() 46 | string_keys = ['name', 'sex', 'ticket', 'cabin', 'embarked', 'boat', 47 | 'homedest'] 48 | string_keys = [s for s in string_keys if s not in feature_skip_tuple] 49 | numeric_keys = ['pclass', 'age', 'sibsp', 'parch', 'fare'] 50 | numeric_keys = [n for n in numeric_keys if n not in feature_skip_tuple] 51 | train_vectorizer_list = [] 52 | test_vectorizer_list = [] 53 | 54 | n_samples = len(lines) 55 | numeric_data = np.zeros((n_samples, len(numeric_keys))) 56 | numeric_labels = np.zeros((n_samples,), dtype=int) 57 | 58 | # Doing this twice is horribly inefficient but the file is small... 
59 | for n, l in enumerate(lines): 60 | line_dict = process_titanic_line(l) 61 | strings = {k: line_dict[k] for k in string_keys} 62 | numeric_labels[n] = line_dict["survived"] 63 | 64 | sss = StratifiedShuffleSplit(numeric_labels, n_iter=1, test_size=test_size, 65 | random_state=12) 66 | # This is a weird way to get the indices but it works 67 | train_idx = None 68 | test_idx = None 69 | for train_idx, test_idx in sss: 70 | pass 71 | 72 | for n, l in enumerate(lines): 73 | line_dict = process_titanic_line(l) 74 | strings = {k: line_dict[k] for k in string_keys} 75 | if n in train_idx: 76 | train_vectorizer_list.append(strings) 77 | else: 78 | test_vectorizer_list.append(strings) 79 | numeric_data[n] = np.asarray([line_dict[k] 80 | for k in numeric_keys]) 81 | 82 | train_numeric = numeric_data[train_idx] 83 | test_numeric = numeric_data[test_idx] 84 | train_labels = numeric_labels[train_idx] 85 | test_labels = numeric_labels[test_idx] 86 | 87 | vec = DictVectorizer() 88 | # .toarray() due to returning a scipy sparse array 89 | train_categorical = vec.fit_transform(train_vectorizer_list).toarray() 90 | test_categorical = vec.transform(test_vectorizer_list).toarray() 91 | train_data = np.concatenate([train_numeric, train_categorical], axis=1) 92 | test_data = np.concatenate([test_numeric, test_categorical], axis=1) 93 | keys = numeric_keys + string_keys 94 | return keys, train_data, test_data, train_labels, test_labels 95 | 96 | 97 | FIELDNAMES = ('polarity', 'id', 'date', 'query', 'author', 'text') 98 | 99 | def read_sentiment_csv(csv_file, fieldnames=FIELDNAMES, max_count=None, 100 | n_partitions=1, partition_id=0): 101 | import csv # put the import inside for use in IPython.parallel 102 | def file_opener(csv_file): 103 | try: 104 | open(csv_file, 'r', encoding="latin1").close() 105 | return open(csv_file, 'r', encoding="latin1") 106 | except TypeError: 107 | # Python 2 does not have encoding arg 108 | return open(csv_file, 'rb') 109 | 110 | texts = [] 111 | targets = [] 112 | with file_opener(csv_file) as f: 113 | reader = csv.DictReader(f, fieldnames=fieldnames, 114 | delimiter=',', quotechar='"') 115 | pos_count, neg_count = 0, 0 116 | for i, d in enumerate(reader): 117 | if i % n_partitions != partition_id: 118 | # Skip entry if not in the requested partition 119 | continue 120 | 121 | if d['polarity'] == '4': 122 | if max_count and pos_count >= max_count / 2: 123 | continue 124 | pos_count += 1 125 | texts.append(d['text']) 126 | targets.append(1) 127 | 128 | elif d['polarity'] == '0': 129 | if max_count and neg_count >= max_count / 2: 130 | continue 131 | neg_count += 1 132 | texts.append(d['text']) 133 | targets.append(-1) 134 | 135 | return texts, targets 136 | -------------------------------------------------------------------------------- /notebooks/images/parallel_text_clf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/scipy_2015_sklearn_tutorial/be0a35724b9f0f0b92ea6c325ad5137176585b1c/notebooks/images/parallel_text_clf.png -------------------------------------------------------------------------------- /notebooks/images/parallel_text_clf_average.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/scipy_2015_sklearn_tutorial/be0a35724b9f0f0b92ea6c325ad5137176585b1c/notebooks/images/parallel_text_clf_average.png -------------------------------------------------------------------------------- 
/notebooks/solutions/02A_faces_plot.py: -------------------------------------------------------------------------------- 1 | faces = fetch_olivetti_faces() 2 | 3 | # set up the figure 4 | fig = plt.figure(figsize=(6, 6)) # figure size in inches 5 | fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05) 6 | 7 | # plot the faces: 8 | for i in range(64): 9 | ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[]) 10 | ax.imshow(faces.images[i], cmap=plt.cm.bone, interpolation='nearest') 11 | -------------------------------------------------------------------------------- /notebooks/solutions/04B_houses_regression.py: -------------------------------------------------------------------------------- 1 | from sklearn.ensemble import GradientBoostingRegressor 2 | 3 | clf = GradientBoostingRegressor() 4 | clf.fit(X_train, y_train) 5 | 6 | predicted = clf.predict(X_test) 7 | expected = y_test 8 | 9 | plt.scatter(expected, predicted) 10 | plt.plot([0, 50], [0, 50], '--k') 11 | plt.axis('tight') 12 | plt.xlabel('True price ($1000s)') 13 | plt.ylabel('Predicted price ($1000s)') 14 | print "RMS:", np.sqrt(np.mean((predicted - expected) ** 2)) 15 | -------------------------------------------------------------------------------- /notebooks/solutions/04C_validation_exercise.py: -------------------------------------------------------------------------------- 1 | # suppress warnings from older versions of KNeighbors 2 | import warnings 3 | warnings.filterwarnings('ignore', message='kneighbors*') 4 | 5 | X = digits.data 6 | y = digits.target 7 | X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.25, random_state=0) 8 | 9 | for Model in [LinearSVC, GaussianNB, KNeighborsClassifier]: 10 | clf = Model().fit(X_train, y_train) 11 | y_pred = clf.predict(X_test) 12 | print Model.__name__, metrics.f1_score(y_test, y_pred) 13 | 14 | print '------------------' 15 | 16 | # test SVC loss 17 | for loss in ['l1', 'l2']: 18 | clf = LinearSVC(loss=loss).fit(X_train, y_train) 19 | y_pred = clf.predict(X_test) 20 | print "LinearSVC(loss='{0}')".format(loss), metrics.f1_score(y_test, y_pred) 21 | 22 | print '-------------------' 23 | 24 | # test K-neighbors 25 | for n_neighbors in range(1, 11): 26 | clf = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train) 27 | y_pred = clf.predict(X_test) 28 | print "KNeighbors(n_neighbors={0})".format(n_neighbors), metrics.f1_score(y_test, y_pred) 29 | -------------------------------------------------------------------------------- /notebooks/solutions/05B_strip_headers.py: -------------------------------------------------------------------------------- 1 | def strip_headers(post): 2 | """Find the first blank line and drop the headers to keep the body""" 3 | if '\n\n' in post: 4 | headers, body = post.split('\n\n', 1) 5 | return body.lower() 6 | else: 7 | # Unexpected post inner-structure, be conservative 8 | # and keep everything 9 | return post.lower() 10 | 11 | # Let's try it on the first post. 
Here is the original post content, 12 | # including the headers: 13 | 14 | original_text = all_twenty_train.data[0] 15 | print("Oringinal text:") 16 | print(original_text + "\n") 17 | 18 | text_body = strip_headers(original_text) 19 | print("Stripped text:") 20 | print(text_body + "\n") 21 | 22 | # Let's train a new classifier with the header stripping preprocessor 23 | 24 | strip_vectorizer = TfidfVectorizer(preprocessor=strip_headers, min_df=2) 25 | X_train_small_stripped = strip_vectorizer.fit_transform( 26 | twenty_train_small.data) 27 | 28 | y_train_small_stripped = twenty_train_small.target 29 | 30 | classifier = MultinomialNB(alpha=0.01).fit( 31 | X_train_small_stripped, y_train_small_stripped) 32 | 33 | print("Training score: {0:.1f}%".format( 34 | classifier.score(X_train_small_stripped, y_train_small_stripped) * 100)) 35 | 36 | X_test_small_stripped = strip_vectorizer.transform(twenty_test_small.data) 37 | y_test_small_stripped = twenty_test_small.target 38 | print("Testing score: {0:.1f}%".format( 39 | classifier.score(X_test_small_stripped, y_test_small_stripped) * 100)) -------------------------------------------------------------------------------- /notebooks/solutions/06B_basic_grid_search.py: -------------------------------------------------------------------------------- 1 | for Model in [Lasso, Ridge]: 2 | scores = [cross_val_score(Model(alpha), X, y, cv=3).mean() 3 | for alpha in alphas] 4 | plt.plot(alphas, scores, label=Model.__name__) 5 | plt.legend(loc='lower left') 6 | -------------------------------------------------------------------------------- /notebooks/solutions/06B_learning_curves.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import explained_variance_score, mean_squared_error 2 | from sklearn.cross_validation import train_test_split 3 | 4 | def plot_learning_curve(model, err_func=explained_variance_score, N=300, n_runs=10, n_sizes=50, ylim=None): 5 | sizes = np.linspace(5, N, n_sizes).astype(int) 6 | train_err = np.zeros((n_runs, n_sizes)) 7 | validation_err = np.zeros((n_runs, n_sizes)) 8 | for i in range(n_runs): 9 | for j, size in enumerate(sizes): 10 | xtrain, xtest, ytrain, ytest = train_test_split( 11 | X, y, train_size=size, random_state=i) 12 | # Train on only the first `size` points 13 | model.fit(xtrain, ytrain) 14 | validation_err[i, j] = err_func(ytest, model.predict(xtest)) 15 | train_err[i, j] = err_func(ytrain, model.predict(xtrain)) 16 | 17 | plt.plot(sizes, validation_err.mean(axis=0), lw=2, label='validation') 18 | plt.plot(sizes, train_err.mean(axis=0), lw=2, label='training') 19 | 20 | plt.xlabel('traning set size') 21 | plt.ylabel(err_func.__name__.replace('_', ' ')) 22 | 23 | plt.grid(True) 24 | 25 | plt.legend(loc=0) 26 | 27 | plt.xlim(0, N-1) 28 | 29 | if ylim: 30 | plt.ylim(ylim) 31 | 32 | 33 | plt.figure(figsize=(10, 8)) 34 | for i, model in enumerate([Lasso(0.01), Ridge(0.06)]): 35 | plt.subplot(221 + i) 36 | plot_learning_curve(model, ylim=(0, 1)) 37 | plt.title(model.__class__.__name__) 38 | 39 | plt.subplot(223 + i) 40 | plot_learning_curve(model, err_func=mean_squared_error, ylim=(0, 8000)) 41 | -------------------------------------------------------------------------------- /notebooks/solutions/07B_grid_search.py: -------------------------------------------------------------------------------- 1 | np.random.seed(42) 2 | for model in [DecisionTreeRegressor(), 3 | GradientBoostingRegressor(), 4 | RandomForestRegressor()]: 5 | parameters = {'max_depth':[3, 5, 7, 9, 
11]} 6 | 7 | # Warning: be sure your data is shuffled before using GridSearch! 8 | clf_grid = grid_search.GridSearchCV(model, parameters) 9 | clf_grid.fit(X, y_noisy) 10 | print '------------------------' 11 | print model.__class__.__name__ 12 | print clf_grid.best_params_ 13 | print clf_grid.best_score_ 14 | -------------------------------------------------------------------------------- /notebooks/solutions/08A_digits_projection.py: -------------------------------------------------------------------------------- 1 | from sklearn.decomposition import PCA 2 | from sklearn.manifold import Isomap, LocallyLinearEmbedding 3 | 4 | plt.figure(figsize=(14, 4)) 5 | for i, est in enumerate([PCA(n_components=2, whiten=True), 6 | Isomap(n_components=2, n_neighbors=10), 7 | LocallyLinearEmbedding(n_components=2, n_neighbors=10, method='modified')]): 8 | plt.subplot(131 + i) 9 | projection = est.fit_transform(digits.data) 10 | plt.scatter(projection[:, 0], projection[:, 1], c=digits.target) 11 | plt.title(est.__class__.__name__) 12 | -------------------------------------------------------------------------------- /notebooks/solutions/08B_digits_clustering.py: -------------------------------------------------------------------------------- 1 | from sklearn.cluster import KMeans 2 | kmeans = KMeans(n_clusters=10) 3 | clusters = kmeans.fit_predict(digits.data) 4 | 5 | print(kmeans.cluster_centers_.shape) 6 | 7 | #------------------------------------------------------------ 8 | # visualize the cluster centers 9 | fig = plt.figure(figsize=(8, 3)) 10 | for i in range(10): 11 | ax = fig.add_subplot(2, 5, 1 + i) 12 | ax.imshow(kmeans.cluster_centers_[i].reshape((8, 8)), 13 | cmap=plt.cm.binary) 14 | from sklearn.manifold import Isomap 15 | X_iso = Isomap(n_neighbors=10).fit_transform(digits.data) 16 | 17 | #------------------------------------------------------------ 18 | # visualize the projected data 19 | fig, ax = plt.subplots(1, 2, figsize=(8, 4)) 20 | 21 | ax[0].scatter(X_iso[:, 0], X_iso[:, 1], c=clusters) 22 | ax[1].scatter(X_iso[:, 0], X_iso[:, 1], c=digits.target) 23 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # brew update && brew install gcc (this includes gfortran) 2 | ipython[all]>=3.2.0 3 | pyzmq>=14.7.0 4 | Pillow>=2.8.0 5 | numpy>=1.9.2 6 | scipy>=0.15.1 7 | scikit-learn>=0.16.1 8 | matplotlib>=1.4.3 9 | graphviz>=0.4.4 10 | pyparsing==1.5.7 11 | pydot 12 | --------------------------------------------------------------------------------