├── .gitattributes ├── .gitignore ├── 1.0 Data Loading.ipynb ├── 1.1 Datasets and Benchmark.ipynb ├── 1.2 Data Analysis.ipynb ├── 1.3 Data Wrangling.ipynb ├── 1.4 Data Grouping and Filtering.ipynb ├── 1.5 Introduction to Numpy.ipynb ├── 2.0 Introduction to Machine Learning.ipynb ├── 2.1 Data Representation for Machine Learning.ipynb ├── 2.2 Supervised Learning - Classification.ipynb ├── 2.3 Supervised Learning - Regression.ipynb ├── 2.4 Unsupervised Learning - Clustering.ipynb ├── 2.5 Training and Testing Data.ipynb ├── Extra - R and Python.ipynb ├── LICENSE ├── README.md ├── data ├── adult.tsv.gz ├── blooth_sales_data.csv ├── blooth_sales_data.json ├── blooth_sales_data_2.csv ├── customer-churn-model │ ├── Customer Churn Columns.csv │ ├── Customer Churn Model.csv │ ├── Customer Churn Model.txt │ └── Tab Customer Churn Model.txt ├── time_series.txt └── titanic │ ├── Titanic Description.txt │ ├── titanic3.csv │ ├── titanic3.xls │ ├── titanic3.xlsx │ ├── titanic_custom.csv │ ├── titanic_custom.json │ └── titanic_custom.xls ├── resources ├── boston_rtree.dot ├── imgs │ ├── bag_of_words.svg │ ├── cluster_comparison.png │ ├── cross_validation.svg │ ├── data_representation.svg │ ├── df_inside.png │ ├── df_inside_numpy.png │ ├── df_outside.png │ ├── feature_union.svg │ ├── grid_search_cross_validation.svg │ ├── hashing_vectorizer.svg │ ├── iris_setosa.jpg │ ├── iris_versicolor.jpg │ ├── iris_virginica.jpg │ ├── join-inner.png │ ├── join-left.png │ ├── join-outer.png │ ├── join-right.png │ ├── ml-wordle-436.jpg │ ├── ml_supervised_example.png │ ├── ml_unsupervised_example.png │ ├── overfitting_underfitting_cartoon.svg │ ├── petal_sepal.jpg │ ├── pipeline_cross_validation.svg │ ├── randomized_search.png │ ├── scikit-learn-cheatsheet.png │ ├── supervised_scikit_learn.png │ ├── supervised_workflow.svg │ ├── train_test_split.svg │ ├── train_validation_test2.svg │ └── unsupervised_workflow.svg ├── inner-join.png ├── iris_dtree.dot ├── left-join.png ├── outer-join.png ├── right-join.png └── summary-lm.png ├── solutions ├── 101.py ├── 102.py ├── 103.py ├── 104.py ├── 105.py ├── 106.py ├── 107.py ├── 108.py ├── 121.py └── 241_digits_clustering.py └── utils ├── ML_flow_chart.py ├── __init__.py ├── helpers.py ├── plot_2d_separator.py ├── plot_digits_datasets.py ├── plot_interactive_forest.py ├── plot_interactive_tree.py ├── plot_kneighbors_regularization.py ├── plot_linear_svc_regularization.py └── plot_rbf_svm_parameters.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *.cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # Jupyter Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # SageMath parsed files 79 | *.sage.py 80 | 81 | # Environments 82 | .env 83 | .venv 84 | env/ 85 | venv/ 86 | ENV/ 87 | 88 | # Spyder project settings 89 | .spyderproject 90 | .spyproject 91 | 92 | # Rope project settings 93 | .ropeproject 94 | 95 | # mkdocs documentation 96 | /site 97 | 98 | # mypy 99 | .mypy_cache/ 100 | -------------------------------------------------------------------------------- /2.0 Introduction to Machine Learning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# Introduction to Machine Learning in Python" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "slideshow": { 18 | "slide_type": "subslide" 19 | } 20 | }, 21 | "source": [ 22 | "## What is Machine Learning?" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": { 28 | "slideshow": { 29 | "slide_type": "subslide" 30 | } 31 | }, 32 | "source": [ 33 | "### Machine Learning at a Glance" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": { 39 | "slideshow": { 40 | "slide_type": "-" 41 | } 42 | }, 43 | "source": [ 44 | "" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": { 50 | "slideshow": { 51 | "slide_type": "subslide" 52 | } 53 | }, 54 | "source": [ 55 | "> Machine learning teaches machines how to carry out tasks by themselves. It is that simple.\n", 56 | "The complexity comes with the details.\n", 57 | "\n", 58 | "_W. Richert & L.P. Coelho, 2013\n", 59 | "Building Machine Learning Systems with Python_" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": { 65 | "slideshow": { 66 | "slide_type": "subslide" 67 | } 68 | }, 69 | "source": [ 70 | "Machine learning is the process of automatically **extracting knowledge** from data, usually with the goal of making **predictions** on _new_, _unseen_ data. \n", 71 | "\n", 72 | "A classical example is a _spam filter_, for which the user keeps labeling incoming mails as either spam or not spam. \n", 73 | "\n", 74 | "A machine learning algorithm then \"learns\" what distinguishes spam from normal emails, and can predict for new emails whether they are spam or not." 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": { 80 | "slideshow": { 81 | "slide_type": "subslide" 82 | } 83 | }, 84 | "source": [ 85 | "Central to machine learning is the concept of **making decisions automatically** from data, **without the user specifying explicit rules** for how each decision should be made.\n", 86 | "\n", 87 | "For the case of emails, the user doesn't provide a list of words or characteristics that make an email spam.
Instead, the user provides examples of spam and non-spam emails." 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": { 93 | "slideshow": { 94 | "slide_type": "subslide" 95 | } 96 | }, 97 | "source": [ 98 | "The second central concept is **generalization**. \n", 99 | "\n", 100 | "The goal of a machine learning algorithm is to predict on new, previously unseen data. We are not interested in marking an email that a human has already labeled as spam or not. Instead, we want to make the user's life easier by making an automatic decision for new incoming mail." 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": { 106 | "slideshow": { 107 | "slide_type": "subslide" 108 | } 109 | }, 110 | "source": [ 111 | "There are two kinds of machine learning we will talk about in these notebooks: \n", 112 | "\n", 113 | "* **Supervised learning;** \n", 114 | "* **Unsupervised learning.**" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": { 120 | "slideshow": { 121 | "slide_type": "slide" 122 | } 123 | }, 124 | "source": [ 125 | "### Supervised Learning\n", 126 | "\n", 127 | "In **Supervised Learning**, we have a dataset consisting of both input features and a desired output, such as in the spam / no-spam example.\n", 128 | "\n", 129 | "The task is to construct a model (or program) which is able to predict the desired output of an unseen object\n", 130 | "given the set of features." 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": { 136 | "slideshow": { 137 | "slide_type": "subslide" 138 | } 139 | }, 140 | "source": [ 141 | "" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": { 147 | "slideshow": { 148 | "slide_type": "subslide" 149 | } 150 | }, 151 | "source": [ 152 | "Supervised learning is further broken down into two categories, **classification** and **regression**.\n", 153 | "\n", 154 | "In classification, the label is discrete (a.k.a. _Categorical Data_, i.e. _Integer values_), such as \"spam\" or \"no spam\". \n", 155 | "\n", 156 | "In other words, it provides a clear-cut distinction between categories. \n", 157 | "\n", 158 | "In regression, the label is continuous, i.e. _Float output_." 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": { 164 | "slideshow": { 165 | "slide_type": "subslide" 166 | } 167 | }, 168 | "source": [ 169 | "### Other Examples\n", 170 | "\n", 171 | "Some more complicated examples are:\n", 172 | "\n", 173 | "- given a multicolor image of an object through a telescope, determine\n", 174 | " whether that object is a star, a quasar, or a galaxy.\n", 175 | "- given a photograph of a person, identify the person in the photo.\n", 176 | "- given a list of movies a person has watched and their personal rating\n", 177 | " of the movie, recommend a list of movies they would like.\n", 178 | "- given a person's age, education, and position, infer their salary." 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": { 184 | "slideshow": { 185 | "slide_type": "subslide" 186 | } 187 | }, 188 | "source": [ 189 | "What these tasks have in common is that there are one or more unknown\n", 190 | "quantities associated with the object that need to be determined from other\n", 191 | "observed quantities."
192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": { 197 | "slideshow": { 198 | "slide_type": "subslide" 199 | } 200 | }, 201 | "source": [ 202 | "### For example\n", 203 | "\n", 204 | "* In astronomy, the task of determining whether an object is a star, a galaxy, or a quasar is a **classification problem**: the label is from three distinct categories. \n", 205 | "\n", 206 | "* On the other hand, we might wish to estimate the age of an object based on such observations: this would be a **regression problem**, because the label (age) is a continuous quantity." 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": { 212 | "slideshow": { 213 | "slide_type": "slide" 214 | } 215 | }, 216 | "source": [ 217 | "### Unsupervised Learning\n", 218 | "\n", 219 | "In **Unsupervised Learning** there is no desired output associated with the data.\n", 220 | "\n", 221 | "Instead, we are interested in extracting some form of knowledge or model from the given data.\n", 222 | "\n", 223 | "In a sense, you can think of unsupervised learning as a means of discovering labels from the data itself.\n", 224 | "\n", 225 | "Unsupervised learning comprises tasks such as *dimensionality reduction*, *clustering*, and\n", 226 | "*density estimation*. " 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": { 232 | "slideshow": { 233 | "slide_type": "subslide" 234 | } 235 | }, 236 | "source": [ 237 | "" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": { 243 | "slideshow": { 244 | "slide_type": "fragment" 245 | } 246 | }, 247 | "source": [ 248 | "Unsupervised learning is often harder to understand and to evaluate.\n", 249 | "\n", 250 | "Sometimes the two may even be combined: e.g. Unsupervised learning can be used to find useful\n", 251 | "features in heterogeneous data, and then these features can be used within a supervised\n", 252 | "framework." 
253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": { 258 | "slideshow": { 259 | "slide_type": "subslide" 260 | } 261 | }, 262 | "source": [ 263 | "### Other Examples\n", 264 | "\n", 265 | "Some more involved unsupervised learning problems are:\n", 266 | "\n", 267 | "- given detailed observations of distant galaxies, determine which features or combinations of\n", 268 | " features summarize best the information.\n", 269 | "- given a mixture of two sound sources (for example, a person talking over some music),\n", 270 | " separate the two (this is called the [blind source separation](http://en.wikipedia.org/wiki/Blind_signal_separation) problem).\n", 271 | "- given a large collection of news articles, find recurring topics inside these articles.\n", 272 | "- given a collection of images, cluster similar images together (for example to group them when visualizing a collection)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": { 278 | "slideshow": { 279 | "slide_type": "slide" 280 | } 281 | }, 282 | "source": [ 283 | "# Scikit-learn at a Glance" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "" 291 | ] 292 | } 293 | ], 294 | "metadata": { 295 | "kernelspec": { 296 | "display_name": "Python 3", 297 | "language": "python", 298 | "name": "python3" 299 | }, 300 | "language_info": { 301 | "codemirror_mode": { 302 | "name": "ipython", 303 | "version": 3 304 | }, 305 | "file_extension": ".py", 306 | "mimetype": "text/x-python", 307 | "name": "python", 308 | "nbconvert_exporter": "python", 309 | "pygments_lexer": "ipython3", 310 | "version": "3.6.5" 311 | } 312 | }, 313 | "nbformat": 4, 314 | "nbformat_minor": 2 315 | } 316 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 leriomaggio 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
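The supervised workflow described in `2.0 Introduction to Machine Learning.ipynb` boils down to scikit-learn's fit/predict pattern. A minimal sketch of that pattern (illustrative only, not a file in this repository; it uses scikit-learn's bundled iris data in place of the spam example):

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Labeled data: X holds the input features, y the desired output (the labels).
X, y = load_iris(return_X_y=True)

# Hold out unseen data so we measure generalization, not memorization.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Learn from the labeled examples, then predict on data the model never saw.
clf = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
print(clf.score(X_test, y_test))  # accuracy on the held-out test set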
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Analysis and Machine Learning using Python 2 | 3 | ## Introduction 4 | 5 | _(adapted from [Step by step approach to perform data analysis in Python](https://bigdata-madesimple.com/step-by-step-approach-to-perform-data-analysis-using-python/))_ 6 | 7 | >So you have decided to learn Python, but you don’t have prior programming experience. 8 | >So you are confused about where to start, and how much Python to learn. 9 | > 10 | > These are some of the common questions a beginner has while getting started with Python (for data-centric applications): 11 | 12 | > * “How long does it take to learn Python” 13 | > * “How much Python should I learn for performing data analysis” 14 | > * “What are the best books/courses to learn Python” 15 | > * “Should I be an expert Python programmer, in order to work with data sets” 16 | > 17 | >It is good to be confused while beginning to learn a new skill; that’s what the author of “Learn Anything in 20 Hours” says. 18 | > 19 | > However, the **key word** here is: **Don’t Panic**! This tutorial has been designed to show you exactly that. 20 | 21 | ### What you need to get started 22 | 23 | Most people have the misconception that performing data analysis in Python requires proficiency in Python 24 | programming. 25 | 26 | Coding is fun, but you don't *really* need to be a coding ninja in Python to do data analysis. 27 | 28 | All you need to get started are some basics of (Python) programming and some very elementary software engineering 29 | concepts, just to avoid disasters when you go to production - whatever _production_ means to you (e.g. deploying a system online, or 30 | sharing the code of your prototype or experiments in a public repo for **reproducibility**). 31 | 32 | ### What you won't find in this tutorial 33 | 34 | In this tutorial, you won't learn how to program in Python. 35 | If you are looking for a quick tutorial on Python programming, maybe this is the tutorial for you: 36 | [Python Programming Tutorial](https://github.com/leriomaggio/python-tutorial) 37 | 38 | ### What you will find in this tutorial 39 | 40 | For a glimpse of what to expect from this tutorial, I would suggest this `5 min` read: 41 | [5 amazingly powerful Python libraries for Data Science](http://bigdataexaminer.com/2015/05/18/5-amazingly-powerful-python-libraries-for-data-science/) 42 | 43 | 44 | ## Setup and Installation 45 | 46 | To run the code included in this repository, we will be using **Python 3** (which is **not** Python 2, _by the way_). 47 | Although using your own Python version (and environment) will be more than fine, for an easier and quicker setup of all the 48 | necessary Python packages, I would **strongly suggest** downloading and using the 49 | [**Anaconda Python**](https://www.anaconda.com/distribution/) distribution. 50 | 51 | ## Jupyter Notebook Format 52 | 53 | (Most of) the materials in this tutorial will be provided as **Jupyter Notebooks**. 54 | 55 | If you don't know what a Jupyter notebook is, or how to use it, please take a look at this quick 56 | introductory tour: 57 | [IPython Notebook Beginner Guide](http://jupyter-notebook-beginner-guide.readthedocs.io/en/latest/index.html).
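Before diving into the notebooks, a quick sanity check that the core packages are importable (a minimal sketch; the exact versions will vary with your installation):

```python
# Run in a notebook cell or in the Python interpreter of your environment.
import numpy, pandas, sklearn, matplotlib
print(numpy.__version__, pandas.__version__, sklearn.__version__, matplotlib.__version__)
```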
58 | 59 | For additional details and materials on Jupyter and IPython, here are some other suggested readings: 60 | 61 | - [Jupyter Notebook the Definitive Guide](https://www.datacamp.com/community/tutorials/tutorial-jupyter-notebook): 62 | - [What is a Jupyter Notebook](https://www.datacamp.com/community/tutorials/tutorial-jupyter-notebook#WhatIs) 63 | - [Practical Introduction](https://www.datacamp.com/community/tutorials/tutorial-jupyter-notebook#UseJupyter) 64 | - [Notebook Examples](https://www.datacamp.com/community/tutorials/tutorial-jupyter-notebook#NotebookExamples) 65 | 66 | 67 | ## Further Readings 68 | 69 | If you want an introductory overview of Python for Data Science, I strongly recommend 70 | [Scipy Lecture Notes](http://www.scipy-lectures.org): a community-driven project where you can find 71 | tutorials (for non-experts) on the scientific Python ecosystem. 72 | 73 | Additional books for further reading: 74 | 75 | - [SciPy and NumPy](http://shop.oreilly.com/product/0636920020219.do) 76 | - [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) 77 | - [Elegant SciPy](http://shop.oreilly.com/product/0636920038481.do) 78 | - [Python for Data Analysis](http://shop.oreilly.com/product/0636920023784.do) 79 | - [Introduction to Machine Learning with Python](http://shop.oreilly.com/product/0636920030515.do) 80 | - [Building Machine Learning Systems with Python](https://www.packtpub.com/big-data-and-business-intelligence/building-machine-learning-systems-python) 81 | 82 | ## Credits and Acknowledgements 83 | 84 | Some of the material included in this repository has been created by adapting the materials in the **Python-ML-Course** repository by **luisPinedo**. 85 | Original versions available here: [https://github.com/luisPinedo/python-ml-course](https://github.com/luisPinedo/python-ml-course) 86 | -------------------------------------------------------------------------------- /data/adult.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-ml-course/3dfca6773f102a415d952c36b2f5e67b05f06363/data/adult.tsv.gz -------------------------------------------------------------------------------- /data/customer-churn-model/Customer Churn Columns.csv: -------------------------------------------------------------------------------- 1 | Column_Names 2 | A 3 | B 4 | C 5 | D 6 | E 7 | F 8 | G 9 | H 10 | I 11 | J 12 | K 13 | L 14 | M 15 | N 16 | O 17 | P 18 | Q 19 | R 20 | S 21 | T 22 | U 23 | -------------------------------------------------------------------------------- /data/titanic/Titanic Description.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-ml-course/3dfca6773f102a415d952c36b2f5e67b05f06363/data/titanic/Titanic Description.txt -------------------------------------------------------------------------------- /data/titanic/titanic3.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-ml-course/3dfca6773f102a415d952c36b2f5e67b05f06363/data/titanic/titanic3.xls -------------------------------------------------------------------------------- /data/titanic/titanic3.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-ml-course/3dfca6773f102a415d952c36b2f5e67b05f06363/data/titanic/titanic3.xlsx
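The titanic files above are the ones used throughout the `1.x` data notebooks. A minimal sketch of loading them with pandas (assuming the interpreter runs from the repository root; the `.xls`/`.xlsx` variants additionally need an Excel reader such as `xlrd` installed):

import pandas as pd

titanic = pd.read_csv('./data/titanic/titanic3.csv')  # same records as the Excel variants
titanic.head()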
-------------------------------------------------------------------------------- /data/titanic/titanic_custom.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-ml-course/3dfca6773f102a415d952c36b2f5e67b05f06363/data/titanic/titanic_custom.xls -------------------------------------------------------------------------------- /resources/boston_rtree.dot: -------------------------------------------------------------------------------- 1 | digraph Tree { 2 | node [shape=box] ; 3 | 0 [label="rm <= 6.941\nmse = 84.42\nsamples = 506\nvalue = 22.533"] ; 4 | 1 [label="lstat <= 14.4\nmse = 40.273\nsamples = 430\nvalue = 19.934"] ; 5 | 0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ; 6 | 2 [label="lstat <= 4.91\nmse = 26.009\nsamples = 255\nvalue = 23.35"] ; 7 | 1 -> 2 ; 8 | 3 [label="mse = 47.187\nsamples = 20\nvalue = 31.565"] ; 9 | 2 -> 3 ; 10 | 4 [label="lstat <= 9.715\nmse = 17.974\nsamples = 235\nvalue = 22.651"] ; 11 | 2 -> 4 ; 12 | 5 [label="age <= 87.6\nmse = 22.287\nsamples = 122\nvalue = 24.393"] ; 13 | 4 -> 5 ; 14 | 6 [label="mse = 11.111\nsamples = 112\nvalue = 23.787"] ; 15 | 5 -> 6 ; 16 | 7 [label="mse = 97.42\nsamples = 10\nvalue = 31.17"] ; 17 | 5 -> 7 ; 18 | 8 [label="ptratio <= 17.85\nmse = 6.503\nsamples = 113\nvalue = 20.77"] ; 19 | 4 -> 8 ; 20 | 9 [label="mse = 8.556\nsamples = 33\nvalue = 21.864"] ; 21 | 8 -> 9 ; 22 | 10 [label="mse = 4.96\nsamples = 80\nvalue = 20.319"] ; 23 | 8 -> 10 ; 24 | 11 [label="crim <= 6.992\nmse = 19.276\nsamples = 175\nvalue = 14.956"] ; 25 | 1 -> 11 ; 26 | 12 [label="nox <= 0.531\nmse = 11.391\nsamples = 101\nvalue = 17.138"] ; 27 | 11 -> 12 ; 28 | 13 [label="mse = 9.016\nsamples = 24\nvalue = 20.021"] ; 29 | 12 -> 13 ; 30 | 14 [label="lstat <= 18.885\nmse = 8.733\nsamples = 77\nvalue = 16.239"] ; 31 | 12 -> 14 ; 32 | 15 [label="mse = 5.952\nsamples = 53\nvalue = 17.234"] ; 33 | 14 -> 15 ; 34 | 16 [label="mse = 7.862\nsamples = 24\nvalue = 14.042"] ; 35 | 14 -> 16 ; 36 | 17 [label="nox <= 0.605\nmse = 14.674\nsamples = 74\nvalue = 11.978"] ; 37 | 11 -> 17 ; 38 | 18 [label="mse = 18.606\nsamples = 12\nvalue = 16.633"] ; 39 | 17 -> 18 ; 40 | 19 [label="lstat <= 19.645\nmse = 8.908\nsamples = 62\nvalue = 11.077"] ; 41 | 17 -> 19 ; 42 | 20 [label="mse = 4.18\nsamples = 18\nvalue = 13.922"] ; 43 | 19 -> 20 ; 44 | 21 [label="mse = 6.177\nsamples = 44\nvalue = 9.914"] ; 45 | 19 -> 21 ; 46 | 22 [label="rm <= 7.437\nmse = 79.729\nsamples = 76\nvalue = 37.238"] ; 47 | 0 -> 22 [labeldistance=2.5, labelangle=-45, headlabel="False"] ; 48 | 23 [label="lstat <= 5.495\nmse = 41.296\nsamples = 46\nvalue = 32.113"] ; 49 | 22 -> 23 ; 50 | 24 [label="mse = 17.249\nsamples = 23\nvalue = 35.248"] ; 51 | 23 -> 24 ; 52 | 25 [label="mse = 45.69\nsamples = 23\nvalue = 28.978"] ; 53 | 23 -> 25 ; 54 | 26 [label="ptratio <= 15.4\nmse = 36.628\nsamples = 30\nvalue = 45.097"] ; 55 | 22 -> 26 ; 56 | 27 [label="mse = 7.774\nsamples = 16\nvalue = 47.975"] ; 57 | 26 -> 27 ; 58 | 28 [label="mse = 49.315\nsamples = 14\nvalue = 41.807"] ; 59 | 26 -> 28 ; 60 | } -------------------------------------------------------------------------------- /resources/imgs/cluster_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-ml-course/3dfca6773f102a415d952c36b2f5e67b05f06363/resources/imgs/cluster_comparison.png -------------------------------------------------------------------------------- 
/resources/imgs/df_inside.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-ml-course/3dfca6773f102a415d952c36b2f5e67b05f06363/resources/imgs/df_inside.png -------------------------------------------------------------------------------- /resources/imgs/df_inside_numpy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-ml-course/3dfca6773f102a415d952c36b2f5e67b05f06363/resources/imgs/df_inside_numpy.png -------------------------------------------------------------------------------- /resources/imgs/df_outside.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-ml-course/3dfca6773f102a415d952c36b2f5e67b05f06363/resources/imgs/df_outside.png -------------------------------------------------------------------------------- /resources/imgs/iris_setosa.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-ml-course/3dfca6773f102a415d952c36b2f5e67b05f06363/resources/imgs/iris_setosa.jpg -------------------------------------------------------------------------------- /resources/imgs/iris_versicolor.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-ml-course/3dfca6773f102a415d952c36b2f5e67b05f06363/resources/imgs/iris_versicolor.jpg -------------------------------------------------------------------------------- /resources/imgs/iris_virginica.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-ml-course/3dfca6773f102a415d952c36b2f5e67b05f06363/resources/imgs/iris_virginica.jpg -------------------------------------------------------------------------------- /resources/imgs/join-inner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-ml-course/3dfca6773f102a415d952c36b2f5e67b05f06363/resources/imgs/join-inner.png -------------------------------------------------------------------------------- /resources/imgs/join-left.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-ml-course/3dfca6773f102a415d952c36b2f5e67b05f06363/resources/imgs/join-left.png -------------------------------------------------------------------------------- /resources/imgs/join-outer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-ml-course/3dfca6773f102a415d952c36b2f5e67b05f06363/resources/imgs/join-outer.png -------------------------------------------------------------------------------- /resources/imgs/join-right.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-ml-course/3dfca6773f102a415d952c36b2f5e67b05f06363/resources/imgs/join-right.png -------------------------------------------------------------------------------- /resources/imgs/ml-wordle-436.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/leriomaggio/python-ml-course/3dfca6773f102a415d952c36b2f5e67b05f06363/resources/imgs/ml-wordle-436.jpg -------------------------------------------------------------------------------- /resources/imgs/ml_supervised_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-ml-course/3dfca6773f102a415d952c36b2f5e67b05f06363/resources/imgs/ml_supervised_example.png -------------------------------------------------------------------------------- /resources/imgs/ml_unsupervised_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-ml-course/3dfca6773f102a415d952c36b2f5e67b05f06363/resources/imgs/ml_unsupervised_example.png -------------------------------------------------------------------------------- /resources/imgs/overfitting_underfitting_cartoon.svg: -------------------------------------------------------------------------------- (SVG diagram; visible text labels: Model complexity, Accuracy, Training, Generalization, Underfitting, Overfitting, Sweet spot) -------------------------------------------------------------------------------- /resources/imgs/petal_sepal.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-ml-course/3dfca6773f102a415d952c36b2f5e67b05f06363/resources/imgs/petal_sepal.jpg -------------------------------------------------------------------------------- /resources/imgs/randomized_search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-ml-course/3dfca6773f102a415d952c36b2f5e67b05f06363/resources/imgs/randomized_search.png -------------------------------------------------------------------------------- /resources/imgs/scikit-learn-cheatsheet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-ml-course/3dfca6773f102a415d952c36b2f5e67b05f06363/resources/imgs/scikit-learn-cheatsheet.png -------------------------------------------------------------------------------- /resources/imgs/supervised_scikit_learn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-ml-course/3dfca6773f102a415d952c36b2f5e67b05f06363/resources/imgs/supervised_scikit_learn.png -------------------------------------------------------------------------------- /resources/imgs/train_test_split.svg: -------------------------------------------------------------------------------- (SVG diagram; visible text labels: All Data, Training data, Test data) -------------------------------------------------------------------------------- /resources/imgs/train_validation_test2.svg: -------------------------------------------------------------------------------- (SVG diagram; visible text labels: All Data, Training, Test, Validation) -------------------------------------------------------------------------------- /resources/imgs/unsupervised_workflow.svg: -------------------------------------------------------------------------------- (SVG diagram; visible text labels: Training Data, Test Data, Model, New View) -------------------------------------------------------------------------------- /resources/inner-join.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-ml-course/3dfca6773f102a415d952c36b2f5e67b05f06363/resources/inner-join.png -------------------------------------------------------------------------------- /resources/iris_dtree.dot: -------------------------------------------------------------------------------- 1 | digraph Tree { 2 | node [shape=box] ; 3 | 0 [label="Petal.Length <= 2.6\nentropy = 1.582\nsamples = 111\nvalue = [38, 34, 39]"] ; 4 | 1 [label="entropy = 0.0\nsamples = 38\nvalue = [38, 0, 0]"] ; 5 | 0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ; 6 | 2 [label="Petal.Width <= 1.75\nentropy = 0.997\nsamples = 73\nvalue = [0, 34, 39]"] ; 7 | 0 -> 2 [labeldistance=2.5, labelangle=-45, headlabel="False"] ; 8 | 3 [label="Petal.Length <= 4.95\nentropy = 0.406\nsamples = 37\nvalue = [0, 34, 3]"] ; 9 | 2 -> 3 ; 10 | 4 [label="entropy = 0.0\nsamples = 32\nvalue = [0, 32, 0]"] ; 11 | 3 -> 4 ; 12 | 5 [label="entropy = 0.971\nsamples = 5\nvalue = [0, 2, 3]"] ; 13 | 3 -> 5 ; 14 | 6 [label="entropy = 0.0\nsamples = 36\nvalue = [0, 0, 36]"] ; 15 | 2 -> 6 ; 16 | } -------------------------------------------------------------------------------- /resources/left-join.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-ml-course/3dfca6773f102a415d952c36b2f5e67b05f06363/resources/left-join.png -------------------------------------------------------------------------------- /resources/outer-join.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-ml-course/3dfca6773f102a415d952c36b2f5e67b05f06363/resources/outer-join.png -------------------------------------------------------------------------------- /resources/right-join.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-ml-course/3dfca6773f102a415d952c36b2f5e67b05f06363/resources/right-join.png -------------------------------------------------------------------------------- /resources/summary-lm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leriomaggio/python-ml-course/3dfca6773f102a415d952c36b2f5e67b05f06363/resources/summary-lm.png -------------------------------------------------------------------------------- /solutions/101.py: -------------------------------------------------------------------------------- 1 | sales_data.head(7) 2 | -------------------------------------------------------------------------------- /solutions/102.py: -------------------------------------------------------------------------------- 1 | sales_data.tail(27) -------------------------------------------------------------------------------- /solutions/103.py: -------------------------------------------------------------------------------- 1 | sales_data.info() 2 | -------------------------------------------------------------------------------- /solutions/104.py: -------------------------------------------------------------------------------- 1 | data2 = pd.read_csv('./data/blooth_sales_data_2.csv') 2 | data2.head() -------------------------------------------------------------------------------- /solutions/105.py: -------------------------------------------------------------------------------- 1 | data2 = 
pd.read_csv('./data/blooth_sales_data_2.csv', sep=';', decimal=',', parse_dates=['birthday', 'orderdate']) 2 | data2.head() -------------------------------------------------------------------------------- /solutions/106.py: -------------------------------------------------------------------------------- 1 | dataj = pd.read_csv('./data/blooth_sales_data_3.csv', sep=';', decimal=',') 2 | dataj.head() -------------------------------------------------------------------------------- /solutions/107.py: -------------------------------------------------------------------------------- 1 | directive = "%m/%Y/%dT%I:%M:%S%p+0900" 2 | dataj['orderdate_parsed'] = pd.to_datetime(dataj['orderdate'], 3 | format=directive) 4 | 5 | dataj.info() -------------------------------------------------------------------------------- /solutions/108.py: -------------------------------------------------------------------------------- 1 | sales_data['orderdate_parsed'] = pd.to_datetime(sales_data['orderdate'], format="%m/%Y/%dT%I:%M:%S%p+0900") 2 | sales_data -------------------------------------------------------------------------------- /solutions/121.py: -------------------------------------------------------------------------------- 1 | pclass_values = data['pclass'].values 2 | unique_pclasses = set(pclass_values) -------------------------------------------------------------------------------- /solutions/241_digits_clustering.py: -------------------------------------------------------------------------------- 1 | from sklearn.cluster import KMeans 2 | kmeans = KMeans(n_clusters=10) 3 | clusters = kmeans.fit_predict(digits.data) 4 | 5 | print(kmeans.cluster_centers_.shape) 6 | 7 | #------------------------------------------------------------ 8 | # visualize the cluster centers 9 | fig = plt.figure(figsize=(8, 3)) 10 | for i in range(10): 11 | ax = fig.add_subplot(2, 5, 1 + i) 12 | ax.imshow(kmeans.cluster_centers_[i].reshape((8, 8)), 13 | cmap=plt.cm.binary) 14 | from sklearn.manifold import Isomap 15 | X_iso = Isomap(n_neighbors=10).fit_transform(digits.data) 16 | 17 | #------------------------------------------------------------ 18 | # visualize the projected data 19 | fig, ax = plt.subplots(1, 2, figsize=(8, 4)) 20 | 21 | ax[0].scatter(X_iso[:, 0], X_iso[:, 1], c=clusters) 22 | ax[1].scatter(X_iso[:, 0], X_iso[:, 1], c=digits.target) 23 | -------------------------------------------------------------------------------- /utils/ML_flow_chart.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tutorial Diagrams 3 | ----------------- 4 | 5 | This script plots the flow-charts used in the scikit-learn tutorials.
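Call plot_supervised_chart() or plot_unsupervised_chart() (defined below) to draw them.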
6 | """ 7 | 8 | import matplotlib.pyplot as plt 9 | from matplotlib.patches import Circle, Rectangle, Polygon, FancyArrow 10 | 11 | 12 | def create_base(box_bg='#CCCCCC', 13 | arrow1='#88CCFF', 14 | arrow2='#88FF88', 15 | supervised=True): 16 | plt.figure(figsize=(9, 6), facecolor='w') 17 | ax = plt.axes((0, 0, 1, 1), xticks=[], yticks=[], frameon=False) 18 | ax.set_xlim(0, 9) 19 | ax.set_ylim(0, 6) 20 | 21 | patches = [Rectangle((0.3, 3.6), 1.5, 1.8, zorder=1, fc=box_bg), 22 | Rectangle((0.5, 3.8), 1.5, 1.8, zorder=2, fc=box_bg), 23 | Rectangle((0.7, 4.0), 1.5, 1.8, zorder=3, fc=box_bg), 24 | 25 | Rectangle((2.9, 3.6), 0.2, 1.8, fc=box_bg), 26 | Rectangle((3.1, 3.8), 0.2, 1.8, fc=box_bg), 27 | Rectangle((3.3, 4.0), 0.2, 1.8, fc=box_bg), 28 | 29 | Rectangle((0.3, 0.2), 1.5, 1.8, fc=box_bg), 30 | 31 | Rectangle((2.9, 0.2), 0.2, 1.8, fc=box_bg), 32 | 33 | Circle((5.5, 3.5), 1.0, fc=box_bg), 34 | 35 | Polygon([[5.5, 1.7], 36 | [6.1, 1.1], 37 | [5.5, 0.5], 38 | [4.9, 1.1]], fc=box_bg), 39 | 40 | FancyArrow(2.3, 4.6, 0.35, 0, fc=arrow1, 41 | width=0.25, head_width=0.5, head_length=0.2), 42 | 43 | FancyArrow(3.75, 4.2, 0.5, -0.2, fc=arrow1, 44 | width=0.25, head_width=0.5, head_length=0.2), 45 | 46 | FancyArrow(5.5, 2.4, 0, -0.4, fc=arrow1, 47 | width=0.25, head_width=0.5, head_length=0.2), 48 | 49 | FancyArrow(2.0, 1.1, 0.5, 0, fc=arrow2, 50 | width=0.25, head_width=0.5, head_length=0.2), 51 | 52 | FancyArrow(3.3, 1.1, 1.3, 0, fc=arrow2, 53 | width=0.25, head_width=0.5, head_length=0.2), 54 | 55 | FancyArrow(6.2, 1.1, 0.8, 0, fc=arrow2, 56 | width=0.25, head_width=0.5, head_length=0.2)] 57 | 58 | if supervised: 59 | patches += [Rectangle((0.3, 2.4), 1.5, 0.5, zorder=1, fc=box_bg), 60 | Rectangle((0.5, 2.6), 1.5, 0.5, zorder=2, fc=box_bg), 61 | Rectangle((0.7, 2.8), 1.5, 0.5, zorder=3, fc=box_bg), 62 | FancyArrow(2.3, 2.9, 2.0, 0, fc=arrow1, 63 | width=0.25, head_width=0.5, head_length=0.2), 64 | Rectangle((7.3, 0.85), 1.5, 0.5, fc=box_bg)] 65 | else: 66 | patches += [Rectangle((7.3, 0.2), 1.5, 1.8, fc=box_bg)] 67 | 68 | for p in patches: 69 | ax.add_patch(p) 70 | 71 | plt.text(1.45, 4.9, "Training\nText,\nDocuments,\nImages,\netc.", 72 | ha='center', va='center', fontsize=14) 73 | 74 | plt.text(3.6, 4.9, "Feature\nVectors", 75 | ha='left', va='center', fontsize=14) 76 | 77 | plt.text(5.5, 3.5, "Machine\nLearning\nAlgorithm", 78 | ha='center', va='center', fontsize=14) 79 | 80 | plt.text(1.05, 1.1, "New Text,\nDocument,\nImage,\netc.", 81 | ha='center', va='center', fontsize=14) 82 | 83 | plt.text(3.3, 1.7, "Feature\nVector", 84 | ha='left', va='center', fontsize=14) 85 | 86 | plt.text(5.5, 1.1, "Predictive\nModel", 87 | ha='center', va='center', fontsize=12) 88 | 89 | if supervised: 90 | plt.text(1.45, 3.05, "Labels", 91 | ha='center', va='center', fontsize=14) 92 | 93 | plt.text(8.05, 1.1, "Expected\nLabel", 94 | ha='center', va='center', fontsize=14) 95 | plt.text(8.8, 5.8, "Supervised Learning Model", 96 | ha='right', va='top', fontsize=18) 97 | 98 | else: 99 | plt.text(8.05, 1.1, 100 | "Likelihood\nor Cluster ID\nor Better\nRepresentation", 101 | ha='center', va='center', fontsize=12) 102 | plt.text(8.8, 5.8, "Unsupervised Learning Model", 103 | ha='right', va='top', fontsize=18) 104 | 105 | 106 | def plot_supervised_chart(annotate=False): 107 | create_base(supervised=True) 108 | if annotate: 109 | fontdict = dict(color='r', weight='bold', size=14) 110 | plt.text(1.9, 4.55, 'X = vec.fit_transform(input)', 111 | fontdict=fontdict, 112 | rotation=20, ha='left', va='bottom') 113 | 
plt.text(3.7, 3.2, 'clf.fit(X, y)', 114 | fontdict=fontdict, 115 | rotation=20, ha='left', va='bottom') 116 | plt.text(1.7, 1.5, 'X_new = vec.transform(input)', 117 | fontdict=fontdict, 118 | rotation=20, ha='left', va='bottom') 119 | plt.text(6.1, 1.5, 'y_new = clf.predict(X_new)', 120 | fontdict=fontdict, 121 | rotation=20, ha='left', va='bottom') 122 | 123 | 124 | def plot_unsupervised_chart(): 125 | create_base(supervised=False) 126 | 127 | 128 | if __name__ == '__main__': 129 | plot_supervised_chart(False) 130 | plot_supervised_chart(True) 131 | plot_unsupervised_chart() 132 | plt.show() 133 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .plot_2d_separator import plot_2d_separator 2 | from .plot_kneighbors_regularization import plot_kneighbors_regularization, \ 3 | plot_regression_datasets, make_dataset 4 | from .plot_linear_svc_regularization import plot_linear_svc_regularization 5 | from .plot_interactive_tree import plot_tree_interactive 6 | from .plot_interactive_forest import plot_forest_interactive 7 | from .plot_rbf_svm_parameters import plot_rbf_svm_parameters 8 | from .plot_rbf_svm_parameters import plot_svm_interactive 9 | 10 | __all__ = ['plot_2d_separator', 'plot_kneighbors_regularization', 11 | 'plot_linear_svc_regularization', 'plot_tree_interactive', 12 | 'plot_regression_datasets', 'make_dataset', 13 | "plot_forest_interactive", "plot_rbf_svm_parameters", 14 | "plot_svm_interactive"] 15 | -------------------------------------------------------------------------------- /utils/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import defaultdict 3 | import os 4 | from sklearn.cross_validation import StratifiedShuffleSplit 5 | from sklearn.feature_extraction import DictVectorizer 6 | 7 | # Can also use pandas! 8 | def process_titanic_line(line): 9 | # Split line on "," to get fields without comma confusion 10 | vals = line.strip().split('",') 11 | # replace spurious " characters 12 | vals = [v.replace('"', '') for v in vals] 13 | pclass = int(vals[0]) 14 | survived = int(vals[1]) 15 | name = str(vals[2]) 16 | sex = str(vals[3]) 17 | try: 18 | age = float(vals[4]) 19 | except ValueError: 20 | # Blank age 21 | age = -1 22 | sibsp = float(vals[5]) 23 | parch = int(vals[6]) 24 | ticket = str(vals[7]) 25 | try: 26 | fare = float(vals[8]) 27 | except ValueError: 28 | # Blank fare 29 | fare = -1 30 | cabin = str(vals[9]) 31 | embarked = str(vals[10]) 32 | boat = str(vals[11]) 33 | homedest = str(vals[12]) 34 | line_dict = {'pclass': pclass, 'survived': survived, 'name': name, 'sex': sex, 'age': age, 'sibsp': sibsp, 35 | 'parch': parch, 'ticket': ticket, 'fare': fare, 'cabin': cabin, 'embarked': embarked, 36 | 'boat': boat, 'homedest': homedest} 37 | return line_dict 38 | 39 | 40 | def load_titanic(test_size=.25, feature_skip_tuple=(), random_state=1999): 41 | f = open(os.path.join('datasets', 'titanic', 'titanic3.csv')) 42 | # Remove . 
from home.dest, split on quotes because some fields have commas 43 | keys = f.readline().strip().replace('.', '').split('","') 44 | lines = f.readlines() 45 | f.close() 46 | string_keys = ['name', 'sex', 'ticket', 'cabin', 'embarked', 'boat', 47 | 'homedest'] 48 | string_keys = [s for s in string_keys if s not in feature_skip_tuple] 49 | numeric_keys = ['pclass', 'age', 'sibsp', 'parch', 'fare'] 50 | numeric_keys = [n for n in numeric_keys if n not in feature_skip_tuple] 51 | train_vectorizer_list = [] 52 | test_vectorizer_list = [] 53 | 54 | n_samples = len(lines) 55 | numeric_data = np.zeros((n_samples, len(numeric_keys))) 56 | numeric_labels = np.zeros((n_samples,), dtype=int) 57 | 58 | # Doing this twice is horribly inefficient but the file is small... 59 | for n, l in enumerate(lines): 60 | line_dict = process_titanic_line(l) 61 | strings = {k: line_dict[k] for k in string_keys} 62 | numeric_labels[n] = line_dict["survived"] 63 | 64 | sss = StratifiedShuffleSplit(numeric_labels, n_iter=1, test_size=test_size, 65 | random_state=12) 66 | # This is a weird way to get the indices but it works 67 | train_idx = None 68 | test_idx = None 69 | for train_idx, test_idx in sss: 70 | pass 71 | 72 | for n, l in enumerate(lines): 73 | line_dict = process_titanic_line(l) 74 | strings = {k: line_dict[k] for k in string_keys} 75 | if n in train_idx: 76 | train_vectorizer_list.append(strings) 77 | else: 78 | test_vectorizer_list.append(strings) 79 | numeric_data[n] = np.asarray([line_dict[k] 80 | for k in numeric_keys]) 81 | 82 | train_numeric = numeric_data[train_idx] 83 | test_numeric = numeric_data[test_idx] 84 | train_labels = numeric_labels[train_idx] 85 | test_labels = numeric_labels[test_idx] 86 | 87 | vec = DictVectorizer() 88 | # .toarray() due to returning a scipy sparse array 89 | train_categorical = vec.fit_transform(train_vectorizer_list).toarray() 90 | test_categorical = vec.transform(test_vectorizer_list).toarray() 91 | train_data = np.concatenate([train_numeric, train_categorical], axis=1) 92 | test_data = np.concatenate([test_numeric, test_categorical], axis=1) 93 | keys = numeric_keys + string_keys 94 | return keys, train_data, test_data, train_labels, test_labels 95 | 96 | 97 | FIELDNAMES = ('polarity', 'id', 'date', 'query', 'author', 'text') 98 | 99 | def read_sentiment_csv(csv_file, fieldnames=FIELDNAMES, max_count=None, 100 | n_partitions=1, partition_id=0): 101 | import csv # put the import inside for use in IPython.parallel 102 | def file_opener(csv_file): 103 | try: 104 | open(csv_file, 'r', encoding="latin1").close() 105 | return open(csv_file, 'r', encoding="latin1") 106 | except TypeError: 107 | # Python 2 does not have encoding arg 108 | return open(csv_file, 'rb') 109 | 110 | texts = [] 111 | targets = [] 112 | with file_opener(csv_file) as f: 113 | reader = csv.DictReader(f, fieldnames=fieldnames, 114 | delimiter=',', quotechar='"') 115 | pos_count, neg_count = 0, 0 116 | for i, d in enumerate(reader): 117 | if i % n_partitions != partition_id: 118 | # Skip entry if not in the requested partition 119 | continue 120 | 121 | if d['polarity'] == '4': 122 | if max_count and pos_count >= max_count / 2: 123 | continue 124 | pos_count += 1 125 | texts.append(d['text']) 126 | targets.append(1) 127 | 128 | elif d['polarity'] == '0': 129 | if max_count and neg_count >= max_count / 2: 130 | continue 131 | neg_count += 1 132 | texts.append(d['text']) 133 | targets.append(-1) 134 | 135 | return texts, targets 136 | 
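# Usage sketch (hypothetical, kept as a comment: it assumes the data files
# exist at the hard-coded paths above, and that 'sentiment.csv' is any
# Sentiment140-style CSV with the FIELDNAMES columns):
#
#   keys, X_train, X_test, y_train, y_test = load_titanic(test_size=0.25)
#   texts, targets = read_sentiment_csv('sentiment.csv', max_count=1000)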
-------------------------------------------------------------------------------- /utils/plot_2d_separator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | def plot_2d_separator(classifier, X, fill=False, ax=None, eps=None): 6 | if eps is None: 7 | eps = X.std() / 2. 8 | x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps 9 | y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps 10 | xx = np.linspace(x_min, x_max, 100) 11 | yy = np.linspace(y_min, y_max, 100) 12 | 13 | X1, X2 = np.meshgrid(xx, yy) 14 | X_grid = np.c_[X1.ravel(), X2.ravel()] 15 | try: 16 | decision_values = classifier.decision_function(X_grid) 17 | levels = [0] 18 | fill_levels = [decision_values.min(), 0, decision_values.max()] 19 | except AttributeError: 20 | # no decision_function 21 | decision_values = classifier.predict_proba(X_grid)[:, 1] 22 | levels = [.5] 23 | fill_levels = [0, .5, 1] 24 | 25 | if ax is None: 26 | ax = plt.gca() 27 | if fill: 28 | ax.contourf(X1, X2, decision_values.reshape(X1.shape), 29 | levels=fill_levels, colors=['blue', 'red']) 30 | else: 31 | ax.contour(X1, X2, decision_values.reshape(X1.shape), levels=levels, 32 | colors="black") 33 | ax.set_xlim(x_min, x_max) 34 | ax.set_ylim(y_min, y_max) 35 | ax.set_xticks(()) 36 | ax.set_yticks(()) 37 | 38 | 39 | if __name__ == '__main__': 40 | from sklearn.datasets import make_blobs 41 | from sklearn.linear_model import LogisticRegression 42 | X, y = make_blobs(centers=2, random_state=42) 43 | clf = LogisticRegression().fit(X, y) 44 | plot_2d_separator(clf, X, fill=True) 45 | plt.scatter(X[:, 0], X[:, 1], c=y) 46 | plt.show() 47 | -------------------------------------------------------------------------------- /utils/plot_digits_datasets.py: -------------------------------------------------------------------------------- 1 | # Taken from example in scikit-learn examples 2 | # Authors: Fabian Pedregosa 3 | # Olivier Grisel 4 | # Mathieu Blondel 5 | # Gael Varoquaux 6 | # License: BSD 3 clause (C) INRIA 2011 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | from matplotlib import offsetbox 11 | from sklearn import (manifold, datasets, decomposition, ensemble, 12 | random_projection) 13 | try: 14 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as lda 15 | except ImportError: 16 | from sklearn import lda 17 | 18 | def digits_plot(): 19 | digits = datasets.load_digits(n_class=6) 20 | n_digits = 500 21 | X = digits.data[:n_digits] 22 | y = digits.target[:n_digits] 23 | n_samples, n_features = X.shape 24 | n_neighbors = 30 25 | 26 | def plot_embedding(X, title=None): 27 | x_min, x_max = np.min(X, 0), np.max(X, 0) 28 | X = (X - x_min) / (x_max - x_min) 29 | 30 | plt.figure() 31 | ax = plt.subplot(111) 32 | for i in range(X.shape[0]): 33 | plt.text(X[i, 0], X[i, 1], str(digits.target[i]), 34 | color=plt.cm.Set1(y[i] / 10.), 35 | fontdict={'weight': 'bold', 'size': 9}) 36 | 37 | if hasattr(offsetbox, 'AnnotationBbox'): 38 | # only print thumbnails with matplotlib > 1.0 39 | shown_images = np.array([[1., 1.]]) # just something big 40 | for i in range(X.shape[0]): 41 | dist = np.sum((X[i] - shown_images) ** 2, 1) 42 | if np.min(dist) < 1e5: 43 | # don't show points that are too close 44 | # set a high threshold to basically turn this off 45 | continue 46 | shown_images = np.r_[shown_images, [X[i]]] 47 | imagebox = offsetbox.AnnotationBbox( 48 | offsetbox.OffsetImage(digits.images[i], cmap=plt.cm.gray_r), 49 | X[i]) 
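                # place the digit thumbnail at its 2-D embedding location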
50 | ax.add_artist(imagebox) 51 | plt.xticks([]), plt.yticks([]) 52 | if title is not None: 53 | plt.title(title) 54 | 55 | n_img_per_row = 10 56 | img = np.zeros((10 * n_img_per_row, 10 * n_img_per_row)) 57 | for i in range(n_img_per_row): 58 | ix = 10 * i + 1 59 | for j in range(n_img_per_row): 60 | iy = 10 * j + 1 61 | img[ix:ix + 8, iy:iy + 8] = X[i * n_img_per_row + j].reshape((8, 8)) 62 | 63 | plt.imshow(img, cmap=plt.cm.binary) 64 | plt.xticks([]) 65 | plt.yticks([]) 66 | plt.title('A selection from the 64-dimensional digits dataset') 67 | print("Computing PCA projection") 68 | pca = decomposition.PCA(n_components=2).fit(X) 69 | X_pca = pca.transform(X) 70 | plot_embedding(X_pca, "Principal Components projection of the digits") 71 | plt.figure() 72 | plt.matshow(pca.components_[0, :].reshape(8, 8), cmap="gray") 73 | plt.axis('off') 74 | plt.figure() 75 | plt.matshow(pca.components_[1, :].reshape(8, 8), cmap="gray") 76 | plt.axis('off') 77 | plt.show() 78 | -------------------------------------------------------------------------------- /utils/plot_interactive_forest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from sklearn.datasets import make_blobs 5 | from sklearn.ensemble import RandomForestClassifier 6 | 7 | 8 | X, y = make_blobs(centers=[[0, 0], [1, 1]], random_state=61526, n_samples=50) 9 | 10 | 11 | def plot_forest(max_depth=1): 12 | plt.figure() 13 | ax = plt.gca() 14 | h = 0.02 15 | 16 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 17 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 18 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) 19 | 20 | if max_depth != 0: 21 | forest = RandomForestClassifier(n_estimators=20, max_depth=max_depth, 22 | random_state=1).fit(X, y) 23 | Z = forest.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] 24 | Z = Z.reshape(xx.shape) 25 | ax.contourf(xx, yy, Z, alpha=.4) 26 | ax.set_title("max_depth = %d" % max_depth) 27 | else: 28 | ax.set_title("data set") 29 | ax.scatter(X[:, 0], X[:, 1], c=np.array(['b', 'r'])[y], s=60) 30 | ax.set_xlim(x_min, x_max) 31 | ax.set_ylim(y_min, y_max) 32 | ax.set_xticks(()) 33 | ax.set_yticks(()) 34 | 35 | 36 | def plot_forest_interactive(): 37 | from IPython.html.widgets import interactive, IntSlider 38 | slider = IntSlider(min=0, max=8, step=1, value=0) 39 | return interactive(plot_forest, max_depth=slider) 40 | -------------------------------------------------------------------------------- /utils/plot_interactive_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from sklearn.datasets import make_blobs 5 | from sklearn.tree import DecisionTreeClassifier 6 | 7 | from sklearn.externals.six import StringIO # doctest: +SKIP 8 | from sklearn.tree import export_graphviz 9 | from scipy import ndimage 10 | try: 11 | from scipy.misc import imread 12 | except ImportError: 13 | from scipy.ndimage import imread 14 | 15 | import re 16 | 17 | X, y = make_blobs(centers=[[0, 0], [1, 1]], random_state=61526, n_samples=50) 18 | 19 | 20 | def tree_image(tree, fout=None): 21 | try: 22 | import pydot 23 | except ImportError: 24 | # make a hacky white plot 25 | x = np.ones((10, 10)) 26 | x[0, 0] = 0 27 | return x 28 | dot_data = StringIO() 29 | export_graphviz(tree, out_file=dot_data) 30 | data = re.sub(r"gini = 0\.[0-9]+\\n", "", dot_data.getvalue()) 31 | data = 
re.sub(r"samples = [0-9]+\\n", "", data) 32 | data = re.sub(r"\\nsamples = [0-9]+", "", data) 33 | 34 | graph = pydot.graph_from_dot_data(data) 35 | if fout is None: 36 | fout = "tmp.png" 37 | graph.write_png(fout) 38 | return imread(fout) 39 | 40 | 41 | def plot_tree(max_depth=1): 42 | fig, ax = plt.subplots(1, 2, figsize=(15, 7)) 43 | h = 0.02 44 | 45 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 46 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 47 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) 48 | 49 | if max_depth != 0: 50 | tree = DecisionTreeClassifier(max_depth=max_depth, random_state=1).fit(X, y) 51 | Z = tree.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] 52 | Z = Z.reshape(xx.shape) 53 | faces = tree.tree_.apply(np.c_[xx.ravel(), yy.ravel()].astype(np.float32)) 54 | faces = faces.reshape(xx.shape) 55 | border = ndimage.laplace(faces) != 0 56 | ax[0].contourf(xx, yy, Z, alpha=.4) 57 | ax[0].scatter(xx[border], yy[border], marker='.', s=1) 58 | ax[0].set_title("max_depth = %d" % max_depth) 59 | ax[1].imshow(tree_image(tree)) 60 | ax[1].axis("off") 61 | else: 62 | ax[0].set_title("data set") 63 | ax[1].set_visible(False) 64 | ax[0].scatter(X[:, 0], X[:, 1], c=np.array(['b', 'r'])[y], s=60) 65 | ax[0].set_xlim(x_min, x_max) 66 | ax[0].set_ylim(y_min, y_max) 67 | ax[0].set_xticks(()) 68 | ax[0].set_yticks(()) 69 | 70 | 71 | def plot_tree_interactive(): 72 | from IPython.html.widgets import interactive, IntSlider 73 | slider = IntSlider(min=0, max=8, step=1, value=0) 74 | return interactive(plot_tree, max_depth=slider) 75 | -------------------------------------------------------------------------------- /utils/plot_kneighbors_regularization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from sklearn.neighbors import KNeighborsRegressor 5 | 6 | 7 | def make_dataset(n_samples=100): 8 | rnd = np.random.RandomState(42) 9 | x = np.linspace(-3, 3, n_samples) 10 | y_no_noise = np.sin(4 * x) + x 11 | y = y_no_noise + rnd.normal(size=len(x)) 12 | return x, y 13 | 14 | 15 | def plot_regression_datasets(): 16 | fig, axes = plt.subplots(1, 3, figsize=(15, 5)) 17 | for n_samples, ax in zip([10, 100, 1000], axes): 18 | x, y = make_dataset(n_samples) 19 | ax.plot(x, y, 'o', alpha=.6) 20 | 21 | 22 | def plot_kneighbors_regularization(): 23 | rnd = np.random.RandomState(42) 24 | x = np.linspace(-3, 3, 100) 25 | y_no_noise = np.sin(4 * x) + x 26 | y = y_no_noise + rnd.normal(size=len(x)) 27 | X = x[:, np.newaxis] 28 | fig, axes = plt.subplots(1, 3, figsize=(15, 5)) 29 | 30 | x_test = np.linspace(-3, 3, 1000) 31 | 32 | for n_neighbors, ax in zip([2, 5, 20], axes.ravel()): 33 | kneighbor_regression = KNeighborsRegressor(n_neighbors=n_neighbors) 34 | kneighbor_regression.fit(X, y) 35 | ax.plot(x, y_no_noise, label="true function") 36 | ax.plot(x, y, "o", label="data") 37 | ax.plot(x_test, kneighbor_regression.predict(x_test[:, np.newaxis]), 38 | label="prediction") 39 | ax.legend() 40 | ax.set_title("n_neighbors = %d" % n_neighbors) 41 | 42 | if __name__ == "__main__": 43 | plot_kneighbors_regularization() 44 | plt.show() 45 | -------------------------------------------------------------------------------- /utils/plot_linear_svc_regularization.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn.svm import SVC 4 | from sklearn.datasets import 
make_blobs 5 | from .plot_2d_separator import plot_2d_separator 6 | 7 | 8 | def plot_linear_svc_regularization(): 9 | X, y = make_blobs(centers=2, random_state=4, n_samples=30) 10 | # a carefully hand-designed dataset lol 11 | y[7] = 0 12 | y[27] = 0 13 | 14 | fig, axes = plt.subplots(1, 3, figsize=(12, 4)) 15 | 16 | for ax, C in zip(axes, [1e-2, 1, 1e2]): 17 | ax.scatter(X[:, 0], X[:, 1], s=150, c=np.array(['red', 'blue'])[y]) 18 | 19 | svm = SVC(kernel='linear', C=C).fit(X, y) 20 | plot_2d_separator(svm, X, ax=ax, eps=.5) 21 | ax.set_title("C = %f" % C) 22 | -------------------------------------------------------------------------------- /utils/plot_rbf_svm_parameters.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn.svm import SVC 4 | from sklearn.datasets import make_blobs 5 | from .plot_2d_separator import plot_2d_separator 6 | 7 | 8 | def make_handcrafted_dataset(): 9 | # a carefully hand-designed dataset lol 10 | X, y = make_blobs(centers=2, random_state=4, n_samples=30) 11 | y[np.array([7, 27])] = 0 12 | mask = np.ones(len(X), dtype=bool) 13 | mask[np.array([0, 1, 5, 26])] = 0 14 | X, y = X[mask], y[mask] 15 | return X, y 16 | 17 | 18 | def plot_rbf_svm_parameters(): 19 | X, y = make_handcrafted_dataset() 20 | 21 | fig, axes = plt.subplots(1, 4, figsize=(15, 3)) 22 | for ax, C in zip(axes, [1e0, 5, 10, 100]): 23 | ax.scatter(X[:, 0], X[:, 1], s=150, c=np.array(['red', 'blue'])[y]) 24 | 25 | svm = SVC(kernel='rbf', C=C).fit(X, y) 26 | plot_2d_separator(svm, X, ax=ax, eps=.5) 27 | ax.set_title("C = %f" % C) 28 | 29 | fig, axes = plt.subplots(1, 4, figsize=(15, 3)) 30 | for ax, gamma in zip(axes, [0.1, .5, 1, 10]): 31 | ax.scatter(X[:, 0], X[:, 1], s=150, c=np.array(['red', 'blue'])[y]) 32 | svm = SVC(gamma=gamma, kernel='rbf', C=1).fit(X, y) 33 | plot_2d_separator(svm, X, ax=ax, eps=.5) 34 | ax.set_title("gamma = %f" % gamma) 35 | 36 | 37 | def plot_svm(log_C, log_gamma): 38 | X, y = make_handcrafted_dataset() 39 | C = 10. ** log_C 40 | gamma = 10. ** log_gamma 41 | svm = SVC(kernel='rbf', C=C, gamma=gamma).fit(X, y) 42 | ax = plt.gca() 43 | plot_2d_separator(svm, X, ax=ax, eps=.5) 44 | # plot data 45 | ax.scatter(X[:, 0], X[:, 1], s=150, c=np.array(['red', 'blue'])[y]) 46 | # plot support vectors 47 | sv = svm.support_vectors_ 48 | ax.scatter(sv[:, 0], sv[:, 1], s=230, facecolors='none', zorder=10, linewidth=3) 49 | ax.set_title("C = %.4f gamma = %.4f" % (C, gamma)) 50 | 51 | 52 | def plot_svm_interactive(): 53 | from IPython.html.widgets import interactive, FloatSlider 54 | C_slider = FloatSlider(min=-3, max=3, step=.1, value=0, readout=False) 55 | gamma_slider = FloatSlider(min=-2, max=2, step=.1, value=0, readout=False) 56 | return interactive(plot_svm, log_C=C_slider, log_gamma=gamma_slider) 57 | --------------------------------------------------------------------------------
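The helpers above are imported from the notebooks via the `utils` package. A minimal usage sketch (illustrative; the `*_interactive` variants additionally need IPython widget support inside a running notebook):

import matplotlib.pyplot as plt
from utils import plot_kneighbors_regularization, plot_linear_svc_regularization

plot_kneighbors_regularization()   # k-NN regression fits for n_neighbors in {2, 5, 20}
plot_linear_svc_regularization()   # linear SVC decision boundaries for three values of C
plt.show()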