├── .gitignore ├── Clustering.ipynb ├── Features.ipynb ├── LICENSE.md ├── Outline.ipynb ├── Pipelines.ipynb ├── README.md ├── TrainTest.ipynb ├── WhatIsML.ipynb ├── data ├── FakeLearning.csv ├── HumansLearning_X.csv ├── HumansLearning_y.csv ├── Learning about Humans learning ML.csv ├── cancer-pipeline.pkl ├── linear_failure.csv └── pets.csv ├── environment-nographviz.yml ├── environment.yml ├── img ├── Anscombe_quartet.png ├── DBSCAN.png ├── DBSCAN.webp ├── DataDino-600x455.gif ├── KFold.png ├── L1_and_L2_balls.png ├── ML-Wikipedia.png ├── README ├── basic-perceptron.png ├── ca-clusters.png ├── inception-v3.png ├── open-notebook.png ├── pipeline-diagram.png ├── ridge_regression_geomteric.png ├── sklearn-topics.png ├── supervised_workflow.png └── train_test_split_matrix.png ├── requirements.txt └── src ├── .ipynb_checkpoints ├── classifier_comparison-checkpoint.py ├── decisiontree_regressor-checkpoint.py ├── linear_failure-checkpoint.py ├── plot_cluster_comparison-checkpoint.py └── time_regressors-checkpoint.py ├── .ropeproject ├── config.py ├── globalnames ├── history └── objectdb ├── __init__.py ├── classifier_comparison.py ├── decisiontree_regressor.py ├── fakify_humans.py ├── linear_failure.py ├── mglearn ├── __init__.py ├── datasets.py ├── make_blobs.py ├── plot_2d_separator.py ├── plot_agglomerative.py ├── plot_animal_tree.py ├── plot_cross_validation.py ├── plot_decomposition.py ├── plot_grid_search.py ├── plot_helpers.py ├── plot_improper_preprocessing.py ├── plot_interactive_tree.py ├── plot_kmeans.py ├── plot_kneighbors_regularization.py ├── plot_knn_classification.py ├── plot_knn_regression.py ├── plot_linear_regression.py ├── plot_linear_svc_regularization.py ├── plot_metrics.py ├── plot_nmf.py ├── plot_nn_graphs.py ├── plot_pca.py ├── plot_rbf_svm_parameters.py ├── plot_scaling.py ├── plot_tree_nonmonotonous.py ├── plots.py └── tools.py ├── over_under_fit.py ├── plot_cluster_comparison.py └── time_regressors.py /.gitignore: -------------------------------------------------------------------------------- 1 | ML.html 2 | ML_files/* 3 | external/* 4 | .ipynb_checkpoints/* 5 | **/.ropeproject/** 6 | **/.ipynb_checkpoints/** 7 | **/__pycache__/** 8 | .DS_STORE 9 | **/*.swp 10 | **/*.pyc 11 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | ## creative commons 2 | 3 | # Attribution-NonCommercial-ShareAlike 4.0 International 4 | 5 | Creative Commons Corporation (“Creative Commons”) is not a law firm and does not provide legal services or legal advice. Distribution of Creative Commons public licenses does not create a lawyer-client or other relationship. Creative Commons makes its licenses and related information available on an “as-is” basis. Creative Commons gives no warranties regarding its licenses, any material licensed under their terms and conditions, or any related information. Creative Commons disclaims all liability for damages resulting from their use to the fullest extent possible. 6 | 7 | ### Using Creative Commons Public Licenses 8 | 9 | Creative Commons public licenses provide a standard set of terms and conditions that creators and other rights holders may use to share original works of authorship and other material subject to copyright and certain other rights specified in the public license below. The following considerations are for informational purposes only, are not exhaustive, and do not form part of our licenses. 
10 | 11 | * __Considerations for licensors:__ Our public licenses are intended for use by those authorized to give the public permission to use material in ways otherwise restricted by copyright and certain other rights. Our licenses are irrevocable. Licensors should read and understand the terms and conditions of the license they choose before applying it. Licensors should also secure all rights necessary before applying our licenses so that the public can reuse the material as expected. Licensors should clearly mark any material not subject to the license. This includes other CC-licensed material, or material used under an exception or limitation to copyright. [More considerations for licensors](http://wiki.creativecommons.org/Considerations_for_licensors_and_licensees#Considerations_for_licensors). 12 | 13 | * __Considerations for the public:__ By using one of our public licenses, a licensor grants the public permission to use the licensed material under specified terms and conditions. If the licensor’s permission is not necessary for any reason–for example, because of any applicable exception or limitation to copyright–then that use is not regulated by the license. Our licenses grant only permissions under copyright and certain other rights that a licensor has authority to grant. Use of the licensed material may still be restricted for other reasons, including because others have copyright or other rights in the material. A licensor may make special requests, such as asking that all changes be marked or described. Although not required by our licenses, you are encouraged to respect those requests where reasonable. [More considerations for the public](http://wiki.creativecommons.org/Considerations_for_licensors_and_licensees#Considerations_for_licensees). 14 | 15 | ## Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License 16 | 17 | By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions. 18 | 19 | ### Section 1 – Definitions. 20 | 21 | a. __Adapted Material__ means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image. 22 | 23 | b. __Adapter's License__ means the license You apply to Your Copyright and Similar Rights in Your contributions to Adapted Material in accordance with the terms and conditions of this Public License. 24 | 25 | c. __BY-NC-SA Compatible License__ means a license listed at [creativecommons.org/compatiblelicenses](http://creativecommons.org/compatiblelicenses), approved by Creative Commons as essentially the equivalent of this Public License. 26 | 27 | d. 
__Copyright and Similar Rights__ means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights. 28 | 29 | e. __Effective Technological Measures__ means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements. 30 | 31 | f. __Exceptions and Limitations__ means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material. 32 | 33 | g. __License Elements__ means the license attributes listed in the name of a Creative Commons Public License. The License Elements of this Public License are Attribution, NonCommercial, and ShareAlike. 34 | 35 | h. __Licensed Material__ means the artistic or literary work, database, or other material to which the Licensor applied this Public License. 36 | 37 | i. __Licensed Rights__ means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license. 38 | 39 | h. __Licensor__ means the individual(s) or entity(ies) granting rights under this Public License. 40 | 41 | i. __NonCommercial__ means not primarily intended for or directed towards commercial advantage or monetary compensation. For purposes of this Public License, the exchange of the Licensed Material for other material subject to Copyright and Similar Rights by digital file-sharing or similar means is NonCommercial provided there is no payment of monetary compensation in connection with the exchange. 42 | 43 | j. __Share__ means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them. 44 | 45 | k. __Sui Generis Database Rights__ means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world. 46 | 47 | l. __You__ means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning. 48 | 49 | ### Section 2 – Scope. 50 | 51 | a. ___License grant.___ 52 | 53 | 1. Subject to the terms and conditions of this Public License, the Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non-exclusive, irrevocable license to exercise the Licensed Rights in the Licensed Material to: 54 | 55 | A. reproduce and Share the Licensed Material, in whole or in part, for NonCommercial purposes only; and 56 | 57 | B. produce, reproduce, and Share Adapted Material for NonCommercial purposes only. 58 | 59 | 2. 
__Exceptions and Limitations.__ For the avoidance of doubt, where Exceptions and Limitations apply to Your use, this Public License does not apply, and You do not need to comply with its terms and conditions. 60 | 61 | 3. __Term.__ The term of this Public License is specified in Section 6(a). 62 | 63 | 4. __Media and formats; technical modifications allowed.__ The Licensor authorizes You to exercise the Licensed Rights in all media and formats whether now known or hereafter created, and to make technical modifications necessary to do so. The Licensor waives and/or agrees not to assert any right or authority to forbid You from making technical modifications necessary to exercise the Licensed Rights, including technical modifications necessary to circumvent Effective Technological Measures. For purposes of this Public License, simply making modifications authorized by this Section 2(a)(4) never produces Adapted Material. 64 | 65 | 5. __Downstream recipients.__ 66 | 67 | A. __Offer from the Licensor – Licensed Material.__ Every recipient of the Licensed Material automatically receives an offer from the Licensor to exercise the Licensed Rights under the terms and conditions of this Public License. 68 | 69 | B. __Additional offer from the Licensor – Adapted Material.__ Every recipient of Adapted Material from You automatically receives an offer from the Licensor to exercise the Licensed Rights in the Adapted Material under the conditions of the Adapter’s License You apply. 70 | 71 | C. __No downstream restrictions.__ You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, the Licensed Material if doing so restricts exercise of the Licensed Rights by any recipient of the Licensed Material. 72 | 73 | 6. __No endorsement.__ Nothing in this Public License constitutes or may be construed as permission to assert or imply that You are, or that Your use of the Licensed Material is, connected with, or sponsored, endorsed, or granted official status by, the Licensor or others designated to receive attribution as provided in Section 3(a)(1)(A)(i). 74 | 75 | b. ___Other rights.___ 76 | 77 | 1. Moral rights, such as the right of integrity, are not licensed under this Public License, nor are publicity, privacy, and/or other similar personality rights; however, to the extent possible, the Licensor waives and/or agrees not to assert any such rights held by the Licensor to the limited extent necessary to allow You to exercise the Licensed Rights, but not otherwise. 78 | 79 | 2. Patent and trademark rights are not licensed under this Public License. 80 | 81 | 3. To the extent possible, the Licensor waives any right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary or waivable statutory or compulsory licensing scheme. In all other cases the Licensor expressly reserves any right to collect such royalties, including when the Licensed Material is used other than for NonCommercial purposes. 82 | 83 | ### Section 3 – License Conditions. 84 | 85 | Your exercise of the Licensed Rights is expressly made subject to the following conditions. 86 | 87 | a. ___Attribution.___ 88 | 89 | 1. If You Share the Licensed Material (including in modified form), You must: 90 | 91 | A. retain the following if it is supplied by the Licensor with the Licensed Material: 92 | 93 | i. 
identification of the creator(s) of the Licensed Material and any others designated to receive attribution, in any reasonable manner requested by the Licensor (including by pseudonym if designated); 94 | 95 | ii. a copyright notice; 96 | 97 | iii. a notice that refers to this Public License; 98 | 99 | iv. a notice that refers to the disclaimer of warranties; 100 | 101 | v. a URI or hyperlink to the Licensed Material to the extent reasonably practicable; 102 | 103 | B. indicate if You modified the Licensed Material and retain an indication of any previous modifications; and 104 | 105 | C. indicate the Licensed Material is licensed under this Public License, and include the text of, or the URI or hyperlink to, this Public License. 106 | 107 | 2. You may satisfy the conditions in Section 3(a)(1) in any reasonable manner based on the medium, means, and context in which You Share the Licensed Material. For example, it may be reasonable to satisfy the conditions by providing a URI or hyperlink to a resource that includes the required information. 108 | 109 | 3. If requested by the Licensor, You must remove any of the information required by Section 3(a)(1)(A) to the extent reasonably practicable. 110 | 111 | b. ___ShareAlike.___ 112 | 113 | In addition to the conditions in Section 3(a), if You Share Adapted Material You produce, the following conditions also apply. 114 | 115 | 1. The Adapter’s License You apply must be a Creative Commons license with the same License Elements, this version or later, or a BY-NC-SA Compatible License. 116 | 117 | 2. You must include the text of, or the URI or hyperlink to, the Adapter's License You apply. You may satisfy this condition in any reasonable manner based on the medium, means, and context in which You Share Adapted Material. 118 | 119 | 3. You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, Adapted Material that restrict exercise of the rights granted under the Adapter's License You apply. 120 | 121 | ### Section 4 – Sui Generis Database Rights. 122 | 123 | Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material: 124 | 125 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database for NonCommercial purposes only; 126 | 127 | b. if You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material, including for purposes of Section 3(b); and 128 | 129 | c. You must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database. 130 | 131 | For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights. 132 | 133 | ### Section 5 – Disclaimer of Warranties and Limitation of Liability. 134 | 135 | a. __Unless otherwise separately undertaken by the Licensor, to the extent possible, the Licensor offers the Licensed Material as-is and as-available, and makes no representations or warranties of any kind concerning the Licensed Material, whether express, implied, statutory, or other. 
This includes, without limitation, warranties of title, merchantability, fitness for a particular purpose, non-infringement, absence of latent or other defects, accuracy, or the presence or absence of errors, whether or not known or discoverable. Where disclaimers of warranties are not allowed in full or in part, this disclaimer may not apply to You.__ 136 | 137 | b. __To the extent possible, in no event will the Licensor be liable to You on any legal theory (including, without limitation, negligence) or otherwise for any direct, special, indirect, incidental, consequential, punitive, exemplary, or other losses, costs, expenses, or damages arising out of this Public License or use of the Licensed Material, even if the Licensor has been advised of the possibility of such losses, costs, expenses, or damages. Where a limitation of liability is not allowed in full or in part, this limitation may not apply to You.__ 138 | 139 | c. The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability. 140 | 141 | ### Section 6 – Term and Termination. 142 | 143 | a. This Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically. 144 | 145 | b. Where Your right to use the Licensed Material has terminated under Section 6(a), it reinstates: 146 | 147 | 1. automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or 148 | 149 | 2. upon express reinstatement by the Licensor. 150 | 151 | For the avoidance of doubt, this Section 6(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Public License. 152 | 153 | c. For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License. 154 | 155 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public License. 156 | 157 | ### Section 7 – Other Terms and Conditions. 158 | 159 | a. The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed. 160 | 161 | b. Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License. 162 | 163 | ### Section 8 – Interpretation. 164 | 165 | a. For the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License. 166 | 167 | b. To the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions. 168 | 169 | c. No term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor. 170 | 171 | d. 
Nothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority. 172 | 173 | > Creative Commons is not a party to its public licenses. Notwithstanding, Creative Commons may elect to apply one of its public licenses to material it publishes and in those instances will be considered the “Licensor.” Except for the limited purpose of indicating that material is shared under a Creative Commons public license or as otherwise permitted by the Creative Commons policies published at [creativecommons.org/policies](http://creativecommons.org/policies), Creative Commons does not authorize the use of the trademark “Creative Commons” or any other trademark or logo of Creative Commons without its prior written consent including, without limitation, in connection with any unauthorized modifications to any of its public licenses or any other arrangements, understandings, or agreements concerning use of licensed material. For the avoidance of doubt, this paragraph does not form part of the public licenses. 174 | > 175 | > Creative Commons may be contacted at creativecommons.org 176 | -------------------------------------------------------------------------------- /Outline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "duration": "20 minutes" 7 | }, 8 | "source": [ 9 | "# Intermediate Machine Learning with scikit-learn" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": { 15 | "duration": "20 minutes" 16 | }, 17 | "source": [ 18 | "## Resources\n", 19 | "\n", 20 | "This training material is available under a CC BY-NC-SA 4.0 license. You can find it at:\n", 21 | "\n", 22 | "> https://github.com/DavidMertz/ML-Live-Intermediate\n", 23 | "\n", 24 | "Before attending this course, please configure the environments you will need. Within the repository, find the file `requirements.txt` to install software using `pip`, or the file `environment.yml` to install software using `conda`.\n", 25 | "\n", 26 | "Please contact me and my training company, [KDM Training](http://kdm.training) for hands-on, instructor-led, onsite or remote, training. Our email is info@kdm.training." 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "## What Is Machine Learning?\n", 34 | "\n", 35 | "\n", 36 | "The session *Beginner Machine Learning with `scikit-learn`* addresses the topics outlines here in more detail. For this course, we will only cover a quick overview of these topics. That course also covers the main topics in *supervised* machine learning: classification, regression, and hyperparameters.\n", 37 | "\n", 38 | "* Overview of techniques used in Machine Learning\n", 39 | "* Classification vs. Regression vs. Clustering\n", 40 | "* Dimensionality Reduction\n", 41 | "* Feature Engineering\n", 42 | "* Feature Selection\n", 43 | "* Categorical vs. Ordinal vs. Continuous variables\n", 44 | "* One-hot encoding\n", 45 | "* Hyperparameters\n", 46 | "* Grid Search\n", 47 | "* Metrics\n", 48 | "\n", 49 | "
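Most of these topics get fuller treatment in the session itself, but one-hot encoding is easy to preview in a couple of lines. The sketch below is illustrative only: the tiny DataFrame is invented for this example rather than taken from the course data, and `sklearn.preprocessing.OneHotEncoder` performs the same transformation when you need it inside a pipeline.

```python
import pandas as pd

# A toy frame invented for illustration; any categorical column works the same way
df = pd.DataFrame({"animal": ["dog", "cat", "fish", "dog"],
                   "weight_kg": [12.0, 4.5, 0.2, 30.0]})

# Expand the categorical column into one indicator column per category value
encoded = pd.get_dummies(df, columns=["animal"])
print(encoded.columns.tolist())  # ['weight_kg', 'animal_cat', 'animal_dog', 'animal_fish']
print(encoded)
```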
" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "## Clustering\n", 57 | "\n", 58 | "* Overview of (some) clustering algorithms\n", 59 | "* Kmeans clustering\n", 60 | "* Agglomerative clustering\n", 61 | "* Density based clustering: DBSan and HDBScan\n", 62 | "* n_clusters, labels, and predictions\n", 63 | "* Visualizing results\n", 64 | "\n", 65 | "
" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "## Feature Engineering and Feature Selection\n", 73 | "* Principal Component Analysis (PCA)\n", 74 | "* Non-Negative Matrix Factorization (NMF)\n", 75 | "* Latent Dirichlet Allocation (LDA)\n", 76 | "* Independent component analysis (ICA)\n", 77 | "* SelectKBest\n", 78 | "* Dimensionality expansion\n", 79 | "* Polynomial Features\n", 80 | "* One-Hot Encoding\n", 81 | "* Scaling with StandardScaler, RobustScaler, MinMaxScaler, Normalizer, and others\n", 82 | "* Binning values with quantiles or binarize\n", 83 | "\n", 84 | "
" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "## Pipelines\n", 92 | "\n", 93 | "* Feature Selection and Engineering\n", 94 | "* Grid search\n", 95 | "* Model\n", 96 | "\n", 97 | "
" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "## Robust Train/Test Splits \n", 105 | "\n", 106 | "* cross_val_score\n", 107 | "* ShuffleSplit\n", 108 | "* KFold, RepeatedKFold, LeaveOneOut, LeavePOut, StratifiedKFold\n", 109 | "\n", 110 | "
" 111 | ] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 3", 117 | "language": "python", 118 | "name": "python3" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.7.1" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 2 135 | } 136 | -------------------------------------------------------------------------------- /Pipelines.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Intermediate Machine Learning with scikit-learn" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "When you have explored a series of steps that is useful for your task—or for a related family of tasks—you would like to package those in a less ad hoc and more reusable way. Certainly wrapping a set of steps in a simple factory function is not difficult. But for most flexibility it is best to take advantage of scikit-learn **pipelines**." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# Some libraries tend to be in flux for their dependency versions\n", 24 | "import warnings\n", 25 | "warnings.simplefilter(\"ignore\")" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "## Imperative Sequential Processing" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "Let us repeat a similar construction of building and training a model as we have seen previously. Here we carry out the steps imperatively, in the sequence we worked out in previous lessons." 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": { 46 | "hide_input": false 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "%matplotlib inline\n", 51 | "import numpy as np\n", 52 | "import matplotlib.pyplot as plt\n", 53 | "import pandas as pd\n", 54 | "\n", 55 | "# Some libraries tend to be in flux for their dependency versions\n", 56 | "import warnings\n", 57 | "warnings.simplefilter(\"ignore\")" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "### Load the data\n", 65 | "\n", 66 | "First load the data; this step will not become part of the pipeline since loading the initial data may occur in various ways that are use dependent. We will use a cross validation score rather than do a train/test split up front. In the next lesson we will see how this can be more robust than a simple train/test split." 
67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "from sklearn.datasets import load_breast_cancer\n", 76 | "cancer = load_breast_cancer()\n", 77 | "\n", 78 | "# From here on, we refer to features and target by the\n", 79 | "# generic X, y rather than tie it to the dataset\n", 80 | "X, y = cancer.data, cancer.target\n", 81 | "X.shape" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "### Generate Synthetic Features\n", 89 | "\n", 90 | "We think the model may perform better with polynomial features that get at the interactions of multiple features." 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "from sklearn.preprocessing import PolynomialFeatures\n", 100 | "poly = PolynomialFeatures(2)\n", 101 | "X_poly = poly.fit_transform(X)\n", 102 | "X_poly.shape" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "### Scale the Data\n", 110 | "\n", 111 | "Scale the data for better performance in subsequent models." 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "from sklearn.preprocessing import MinMaxScaler\n", 121 | "\n", 122 | "# compute minimum and maximum on the training data\n", 123 | "scaler = MinMaxScaler()\n", 124 | "scaler.fit(X_poly)\n", 125 | "# rescale training data\n", 126 | "X_poly_scaled = scaler.transform(X_poly)\n", 127 | "X_poly_scaled.shape" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "### Select Most Important Engineered Features\n", 135 | "\n", 136 | "Since we have increased the number of features to an unweildy number, let us select only the top most important few." 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "from sklearn.feature_selection import SelectPercentile\n", 146 | "\n", 147 | "select = SelectPercentile(percentile=20)\n", 148 | "select.fit(X_poly_scaled, y)\n", 149 | "X_selected = select.transform(X_poly_scaled)\n", 150 | "X_selected.shape" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "### Test Feature Engineered Data Against Model\n", 158 | "\n", 159 | "Having gone through those steps, we would like to see how our engineered dataset performs on a model that showed some success in earlier lessons." 
160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "from sklearn.ensemble import RandomForestClassifier\n", 169 | "rfc = RandomForestClassifier(max_depth=7, random_state=1)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "from sklearn.metrics import f1_score, make_scorer\n", 179 | "scorer = make_scorer(f1_score)\n", 180 | "\n", 181 | "from sklearn.model_selection import KFold\n", 182 | "kf = KFold(5, random_state=0)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "from sklearn.model_selection import cross_val_score\n", 192 | "cv_scores = cross_val_score(rfc, X_selected, y, scoring=scorer, cv=kf)\n", 193 | "print(\" CV scores:\", cv_scores)\n", 194 | "print(\"Mean score:\", np.mean(cv_scores))" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "As a sanity check, let us see how we would have performed on the raw data. This gets a moderate but significant improvement over the raw data in F1 score." 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "cv_scores = cross_val_score(rfc, X, y, scoring=scorer, cv=kf)\n", 211 | "print(\"Raw data CV scores:\", cv_scores)\n", 212 | "print(\" Raw mean score:\", np.mean(cv_scores))" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "## Using Pipelines\n", 220 | "\n", 221 | "A pipeline is simply an abstraction in scikit-learn to bundle together steps like those used above into a single model interface, following the same APIs as a model itself. A particular pipeline is likely to be somewhat domain specific in that you may learn that those particular steps are useful for e.g. cancer data, but not as useful for data with very different characteristics." 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "\"Pipeline\n", 229 | "\n", 230 | "Image credit (CC-BY-NA): [Karl Rosaen](http://karlrosaen.com/ml/learning-log/2016-06-20/)" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "### Putting it together\n", 238 | "\n", 239 | "We can easily construct a pipeline consisting of just those steps (instances that follow the scorer interface) we want. When we instantiate the various classes, we are free to pass in parameters we know we will want; these likely reflect our previous exploration of the particular domain and its datasets." 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "from sklearn.pipeline import Pipeline\n", 249 | "\n", 250 | "pipe = Pipeline([\n", 251 | " (\"Polynomial features\", PolynomialFeatures(2)),\n", 252 | " (\"MinMax scaling\", MinMaxScaler()),\n", 253 | " (\"Top 20% features\", SelectPercentile(percentile=20)),\n", 254 | " (\"Random Forest\", RandomForestClassifier(max_depth=7)),\n", 255 | "])" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "Using the pipeline object is just like using a plain model, but all the preparation steps are bundled into a single object." 
263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "cv_scores = cross_val_score(pipe, \n", 272 | " X, y, \n", 273 | " scoring=make_scorer(f1_score), \n", 274 | " cv=KFold(5))\n", 275 | "\n", 276 | "print(\" Pipeline CV scores:\", cv_scores)\n", 277 | "print(\"Pipeline mean score:\", np.mean(cv_scores))" 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "metadata": {}, 283 | "source": [ 284 | "We can recover (and even modify in-place) the steps of a pipeline" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "pipe.steps" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "# We can serialize the pipeline for later use\n", 303 | "from pickle import dump, load\n", 304 | "dump(pipe, open('data/cancer-pipeline.pkl','wb'))" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "newpipe = load(open('data/cancer-pipeline.pkl','rb'))\n", 314 | "cv_scores = cross_val_score(newpipe, \n", 315 | " X, y, \n", 316 | " scoring=make_scorer(f1_score), \n", 317 | " cv=KFold(5))\n", 318 | "\n", 319 | "print(\" Pipeline CV scores:\", cv_scores)\n", 320 | "print(\"Pipeline mean score:\", np.mean(cv_scores))" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "pipe.fit(X, y)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "pipe.predict(X)" 339 | ] 340 | }, 341 | { 342 | "cell_type": "markdown", 343 | "metadata": {}, 344 | "source": [ 345 | "#### A pipeline factory\n", 346 | "\n", 347 | "There is a convenience function to create pipelines. The only difference of interest with the class constructor is that names of steps are generated for you rather than being explicitly spelled when you create a pipeline. This is slightly more convenient but also takes away your option of giving more descriptive names for steps." 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "from sklearn.pipeline import make_pipeline\n", 357 | "pipe = make_pipeline(\n", 358 | " PolynomialFeatures(2),\n", 359 | " MinMaxScaler(),\n", 360 | " SelectPercentile(percentile=20),\n", 361 | " RandomForestClassifier(max_depth=7))\n", 362 | "pipe.steps" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": {}, 368 | "source": [ 369 | "## Pipelines with Grid Search\n", 370 | "\n", 371 | "A very nice feature of using a pipeline is that it \"plays well\" with a grid search. In fact, you are not restrained to searching over the hyperparameters of the model step, but can also search over arguments to other steps in the pipeline. For this, spelling is a little easier if we use the generated step names from `make_pipeline()`." 
372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [ 380 | "%%time\n", 381 | "# Takes about a minute for this grid search\n", 382 | "from sklearn.model_selection import GridSearchCV\n", 383 | "\n", 384 | "params = {'polynomialfeatures__degree': [1, 2, 3],\n", 385 | " 'selectpercentile__percentile': [10, 15, 20, 50],\n", 386 | " 'randomforestclassifier__max_depth': [5, 7, 9],\n", 387 | " 'randomforestclassifier__criterion': ['entropy', 'gini']}\n", 388 | "\n", 389 | "grid = GridSearchCV(pipe, param_grid=params, cv=5)\n", 390 | "grid.fit(X, y)\n", 391 | "\n", 392 | "print(\"best cross-validation accuracy:\", grid.best_score_)\n", 393 | "print(\"best dataset score: \", grid.score(X, y)) # Overfitting against entire dataset\n", 394 | "print(\"best parameters: \", grid.best_params_)" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "model = grid.best_estimator_\n", 404 | "cv_scores = cross_val_score(model, \n", 405 | " X, y, \n", 406 | " scoring=make_scorer(f1_score), \n", 407 | " cv=KFold(5))\n", 408 | "\n", 409 | "print(\" Grid CV scores:\", cv_scores)\n", 410 | "print(\"Grid mean score:\", np.mean(cv_scores))" 411 | ] 412 | }, 413 | { 414 | "cell_type": "markdown", 415 | "metadata": {}, 416 | "source": [ 417 | "The model we select as `.best_estimator_` is itself a pipeline; it simply has been re-parameterized from the original pipeline, using the grid search." 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "model.steps" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": {}, 432 | "source": [ 433 | "We can examine the relative success of all the parameter combinations as well. As we saw in a prior lesson, `.cv_results_` contains a rich collection of information beyond this also. Although the highest degree polynomial features and highest percentage feature selection was the best estimator, in the ranking of classifiers, there is quite a bit of variation in all the parameters searched. In particular, entirely different combinations perform only slightly worse in the example (they are close enough that it might turn out differently among the top few with different random seeds)." 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": null, 439 | "metadata": {}, 440 | "outputs": [], 441 | "source": [ 442 | "df_grid = pd.DataFrame(grid.cv_results_).set_index('rank_test_score').sort_index()\n", 443 | "df_params = df_grid.loc[:,df_grid.columns.str.contains('param_')]\n", 444 | "cols = [c.split('_')[-1] for c in df_params.columns]\n", 445 | "df_params.columns = cols\n", 446 | "df_params.head(10)" 447 | ] 448 | }, 449 | { 450 | "cell_type": "markdown", 451 | "metadata": {}, 452 | "source": [ 453 | "## Next lesson\n", 454 | "\n", 455 | "**Robust train/test splits**: In this lesson we looked at the very useful pipeline interface provided by scikit-learn. 
Using pipelines greatly aids in making models and processing steps reproducible and easy to distribute to colleagues (or to yourself with later projects).\n", 456 | "\n", 457 | "The next and final lesson of this course, on robust train/test splits, will look at a variety of capabilities in scikit-learn for performing divisions between training and validation data that go beyond the basic function we used in most of these lessons.\n", 458 | "\n", 459 | "" 460 | ] 461 | } 462 | ], 463 | "metadata": { 464 | "kernelspec": { 465 | "display_name": "Python 3", 466 | "language": "python", 467 | "name": "python3" 468 | }, 469 | "language_info": { 470 | "codemirror_mode": { 471 | "name": "ipython", 472 | "version": 3 473 | }, 474 | "file_extension": ".py", 475 | "mimetype": "text/x-python", 476 | "name": "python", 477 | "nbconvert_exporter": "python", 478 | "pygments_lexer": "ipython3", 479 | "version": "3.7.1" 480 | } 481 | }, 482 | "nbformat": 4, 483 | "nbformat_minor": 2 484 | } 485 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## About the course 2 | 3 | This repository **WAS** for use with the Pearson Publishing live webinar "Intermediate Machine Learning with `scikit-learn`." Versions of this material are used by other training provided by David Mertz and [KDM Training](http://kdm.training). 4 | 5 | All this content has migrated to [ML-Webinar](https://github.com/DavidMertz/ML-Webinar) repo that unifies the several connected sessions of the same training program. 6 | 7 | The content here will not be deleted for a good while, but any improvements, additions, etc. will only live at the unified repository. 8 | -------------------------------------------------------------------------------- /TrainTest.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Intermediate Machine Learning with scikit-learn" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Avoiding Overfitting\n", 15 | "\n", 16 | "The reason for train/test splits of data is always, at heart, a desire to avoid overfitting. It is straightforward in supervised learning problems to fit a model against all the available data. Since we, by definition, do not yet have the data we do not have, we want a proxy for \"the novel data we might see in the future.\"\n", 17 | "\n", 18 | "Obviously, the best proxy we can come up with is simply a portion of the original data that did not participate in the fitting of the model. We rely on an assumption that our sample data is similar to observations we will obtain in the future. However, there is really nothing better we might choose as such a proxy.\n", 19 | "\n", 20 | "Using `train_test_split()` to divide the data between a training and testing set if a very reasonable approach. By default, this utility function shuffles the data before splitting it; in general this will minimize effects related to order of collection or collation of the dataset. However, especially on moderate sized datasets of hundreds or thousands of samples (but not really of tens of thousands, or millions), the particular accident of a randomized split can still lead to artifacts." 
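To make that last point concrete, here is a small illustration that is not part of the original notebook; it uses the full Iris data (not the truncated copy constructed below) and an arbitrary classifier. The reported accuracy shifts from seed to seed even though nothing about the model or the data changes.

```python
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)

# The same model and data; only the random seed of the split differs
for seed in range(5):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)
    clf = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
    print("random_state=%d  test accuracy=%.3f" % (seed, clf.score(X_test, y_test)))
```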
21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import pandas as pd\n", 30 | "import numpy as np" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "### Understanding splitting\n", 38 | "\n", 39 | "`train_test_split()` performs just one split of a data array, while all the other splitting classes in `sklearn.model_selection` produce an iterator over multiple distinct splits. \n", 40 | "\n", 41 | "We will use the Iris dataset to illustrate these differences. This dataset contains 150 observations of 3 different species of Iris, each sample containing 4 features. It is a widely used example, and responds well to many classifiers." 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "from sklearn.model_selection import KFold, train_test_split\n", 51 | "from sklearn import datasets" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "In order to show some different behavior of splitting techniques, we will modify the Iris data to drop some of it. In particular, we truncate the last 25 observations. We do this because the samples in the dataset are grouped by their class, first all the Iris setosa, then all the Iris versicolor, and finally all the Iris virginica samples. The truncation will create an imbalance among the classes of observations. Most datasets you will encounter will have varying numbers of samples in different classes." 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "iris = datasets.load_iris()\n", 68 | "iris.data = iris.data[:-25]\n", 69 | "iris.target = iris.target[:-25]\n", 70 | "print(iris.target_names)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "The basic utility function—as we have seen in prior lessons—simply divides the data into two arrays. We keep the same number of columns as in the original, but put some of the rows in the first array and the rest in the second." 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "[arr.shape for arr in train_test_split(iris.data)]" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "If more than one array is passed to the function, it will split each one in turn; the split will be consistent in choosing the corresponding rows from each array (which must, therefore, all have a consistent number of rows).\n", 94 | "\n", 95 | "99% of the time, you use this behavior to simultaneously split features and target arrays. In principle, the API of `train_test_split()` does nothing to enforce that usage. In fact, in optionally taking more than two arguments, it is not constrained to this specific use."
96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "import numpy as np\n", 105 | "for arr in train_test_split(iris.data, iris.target, np.ones((125,3))):\n", 106 | " print(arr.shape)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "### Multiple splitting\n", 114 | "\n", 115 | "Classes for splitting create iterators over multiple splits using the same general algorithm for splitting. " 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "#### KFold\n", 123 | "\n", 124 | "One of the simplest such techniques is `KFold`. This simply divides the data into multiple \"folds.\" By default, `KFold` does not shuffle the data first; therefore, if the dataset is meaningfully ordered in some manner already, the folds may have importantly different characteristics. \n", 125 | "\n", 126 | "The potential differences among the folds can be good or bad, depending on your purpose. Either way, be aware of it. If you hope your model will generalize to sample collections with a characteristic not in the training, there is an advantage to not shuffling. However, it equivalently means that a particular loop through fitting may not have the opportuntity to fit to data with that characteristic.\n", 127 | "\n", 128 | "" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "Below we loop through a three-way split of the (truncated) Iris data. The lengths of the the training versus testing data are slightly different between iterations simply because 125 is not divisible by 3." 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "for n, (train, test) in enumerate(KFold(n_splits=3).split(iris.data)):\n", 145 | " print(\"Iteration: %d; Train shape: %s; Test shape: %s\" % (\n", 146 | " n, train.shape, test.shape))" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "One thing that might be surprising at first is that the shape of training arrays are not, e.g. `(83, 4)`. What we iterate over is a collections of index positions into the underlying NumPy arrays. So, for example, in the first iteration, the test data is the first 1/3rd of the rows in the data." 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "train, test = next(KFold(n_splits=3).split(iris.data))\n", 163 | "print(test)\n", 164 | "pd.DataFrame(iris.data[test], columns=iris.feature_names).tail()" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "### StratifiedKFold\n", 172 | "\n", 173 | "This cross-validation object is a variation of `KFold` that returns stratified folds. The folds are made by preserving the percentage of samples for each class. Because this split is sensitive to the classes of the target, it must take a `y` argument to the `.split()` method." 
174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "from sklearn.model_selection import StratifiedKFold\n", 183 | "\n", 184 | "skf = StratifiedKFold(n_splits=3).split(iris.data, iris.target)\n", 185 | "for n, (train, test) in enumerate(skf):\n", 186 | " print(\"Iteration: %d; Train shape: %s; Test shape: %s\" % (\n", 187 | " n, train.shape, test.shape))" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "Notice that the index positions generated for the first split are not successive from the head. Rather there are 17 each from the first and second 50 samples, then 9 more from the last 25 samples. Other folds are similar, with rounding producing slightly different counts." 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "skf = StratifiedKFold(n_splits=3).split(iris.data, iris.target)\n", 204 | "train, test = next(skf)\n", 205 | "print(test)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "### LeaveOneOut\n", 213 | "\n", 214 | "This splitting technique utilizes the maximum possible size for each training set which still creating a nominal testing set. This can be useful to train models as completely as possible while still allowing validation. Of course, this iterates over a number of splits equal to the number of samples, so is the most computationally spendy split possible also." 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "from sklearn.model_selection import LeaveOneOut\n", 224 | "loo = LeaveOneOut().split(iris.data)" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "all_folds = []\n", 234 | "for n, (train, test) in enumerate(loo):\n", 235 | " all_folds.append((n, train, test))\n", 236 | "\n", 237 | "n, train, test = all_folds[0]\n", 238 | "print(\"Iteration: %d; Train shape: %s; Test shape: %s; Test index: %s\" % (\n", 239 | " n, train.shape, test.shape, test))\n", 240 | "print(\"...\")\n", 241 | "n, train, test = all_folds[-1]\n", 242 | "print(\"Iteration: %d; Train shape: %s; Test shape: %s; Test index: %s\" % (\n", 243 | " n, train.shape, test.shape, test))" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "### GroupKFold\n", 251 | "\n", 252 | "A `KFold` variant with non-overlapping groups. The same group will not appear in two different folds (the number of distinct groups has to be at least equal to the number of folds).\n", 253 | "\n", 254 | "The folds are approximately balanced in the sense that the number of distinct groups is approximately the same in each fold." 
255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "from sklearn.model_selection import GroupKFold\n", 264 | "\n", 265 | "gkf = GroupKFold(n_splits=3).split(iris.data, groups=iris.target)\n", 266 | "for n, (train, test) in enumerate(gkf):\n", 267 | " print(\"Iteration: %d; Train shape: %s; Test shape: %s\" % (\n", 268 | " n, train.shape, test.shape))" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "# Verify that the final test group really is a homogeneous class\n", 278 | "print(\"Index positions:\", test)\n", 279 | "print(\"Species:\", [iris.target_names[n] for n in iris.target[test]])" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "## Cross validation\n", 287 | "\n", 288 | "The splitters discussed in this lesson are only a few of those in scikit-learn. A variety of others build on the general idea contained in those discussed. Consult the [documentation](http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection) for details on each.\n", 289 | "\n", 290 | "The point of all these splitters is almost universally to be used in conjunction with cross validation. The function `cross_val_score()` performs repeated training and scoring relative to multiple train/test splits. " 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "# As we mentioned, the Iris dataset is quite easy to fit\n", 300 | "from sklearn.model_selection import cross_val_score\n", 301 | "from sklearn.svm import SVC\n", 302 | "clf = SVC(kernel='linear', C=1)\n", 303 | "scores = cross_val_score(clf, iris.data, iris.target, cv=5)\n", 304 | "scores " 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "metadata": {}, 310 | "source": [ 311 | "If an integer argument is given for `cv`, as above, it performs a Stratified KFold with that number of folds. But we can also give one of the splitters discussed, or any iterable that yields train/test index pairs. \n", 312 | "\n", 313 | "The \"score\" given for each iteration is that produced by the `.score()` method of the estimator being used. You can manually specify a different `scoring=my_scorer` parameter to `cross_val_score` if you want to use a different metric." 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "This is an artificially bad splitting strategy. We train exclusively on two species on each iteration, then try to predict the excluded species." 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "cross_val_score(clf, iris.data, iris.target, \n", 330 | " cv=GroupKFold(n_splits=3), groups=iris.target)" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": {}, 336 | "source": [ 337 | "A less bad split for this particular dataset and classifier would be a basic KFold. The data has an unequal number of samples from each class (by construction) and is ordered by class. So this gets enough overlap to do well on some splits, but does poorly on others.\n", 338 | "\n", 339 | "In general, Stratified KFold is pretty robust, and hence is the default strategy used by `cross_val_score`."
340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "cross_val_score(clf, iris.data, iris.target, cv=KFold(n_splits=3))" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "metadata": {}, 355 | "outputs": [], 356 | "source": [ 357 | "loo_cv = cross_val_score(clf, iris.data, iris.target, cv=LeaveOneOut())\n", 358 | "print(\"Mean leave-one-out cross validation:\", np.mean(loo_cv))\n", 359 | "print(\"All scores:\\n\", loo_cv)" 360 | ] 361 | } 362 | ], 363 | "metadata": { 364 | "kernelspec": { 365 | "display_name": "Python 3", 366 | "language": "python", 367 | "name": "python3" 368 | }, 369 | "language_info": { 370 | "codemirror_mode": { 371 | "name": "ipython", 372 | "version": 3 373 | }, 374 | "file_extension": ".py", 375 | "mimetype": "text/x-python", 376 | "name": "python", 377 | "nbconvert_exporter": "python", 378 | "pygments_lexer": "ipython3", 379 | "version": "3.6.6" 380 | } 381 | }, 382 | "nbformat": 4, 383 | "nbformat_minor": 2 384 | } 385 | -------------------------------------------------------------------------------- /WhatIsML.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "duration": "1.5 hours" 7 | }, 8 | "source": [ 9 | "# Intermediate Machine Learning with scikit-learn" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Getting Started with This Course\n", 17 | "\n", 18 | "Let us take a look at how we will install the software and learning materials needed for this course...\n", 19 | "\n", 20 | "> https://github.com/DavidMertz/ML-Live-Intermediate\n", 21 | "\n", 22 | "
" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "## What Is Machine Learning?\n", 30 | "\n", 31 | "> **\"If you torture the data enough, nature will always confess.\"** –Ronald Coase\n", 32 | "\n", 33 | "As a one line version—not entirely original—I like to think of machine learning as \"statistics on steroids.\" That characterization may be more cute than is necessary, but it is a good start. Others have used phrases like \"extracting knowledge from raw data by computational means.\"" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "## What Is scikit-learn?\n", 41 | "\n", 42 | "Scikit-learn provides a large range of algorithms in machine learning that are unified under a common and intuitive API. Most of the dozens of classes provided for various kinds of models share the large majority of the same calling interface. Very often—as we will see in examples below—you can easily substitute one algorithm for another with nearly no change in your underlying code. This allows you to explore the problem space quickly, and often arrive at an optimal, or at least satisficing$^1$ approach to your problem domain or datasets.\n", 43 | "\n", 44 | "* Simple and efficient tools for data mining and data analysis\n", 45 | "* Accessible to everybody, and reusable in various contexts\n", 46 | "* Built on NumPy, SciPy, and matplotlib\n", 47 | "* Open source, commercially usable - BSD license\n", 48 | "\n", 49 | "
\n", 50 | "\n", 51 | "$^1$Satisficing is a decision-making strategy of searching through the alternatives until an acceptability threshold is met. It is a portmanteau of satisfy and suffice, and was introduced by Herbert A. Simon in 1956. He maintained that many natural problems are characterized by computational intractability or a lack of information, both of which preclude the use of mathematical optimization procedures." 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "## Overview of Techniques Used in Machine Learning\n", 59 | "\n", 60 | "The diagram below is from the scikit-learn documentation, but the same general schematic of different techniques and algorithms that it outlines applies equally to any other library. The classes represented in bubbles mostly will have equivalent versions in other libraries.\n", 61 | "\n", 62 | "![Scikit-learn topic areas](img/sklearn-topics.png)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "## Classification versus Regression versus Clustering\n", 70 | "\n", 71 | "### Classification\n", 72 | "\n", 73 | "Classification is a type of supervised learning in which the targets for a prediction are a set of categorical values.\n", 74 | "\n", 75 | "### Regression\n", 76 | "\n", 77 | "Regression is a type of supervised learning in which the targets for a prediction are quantitative or continuous values.\n", 78 | "\n", 79 | "### Clustering\n", 80 | "\n", 81 | "Clustering is a type of unsupervised learning where you want to identify similarities among collections of items without an *a prior* classification scheme. You may or may not have an *a priori* about the number of categories." 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "## Overfitting and Underfitting\n", 89 | "\n", 90 | "In machine learning models, we have to worry about twin concerns. On the one hand, we might **overfit** our model to the dataset we have available. If we train a model extremely accurately against the data itself, metrics we use for the quality of the model will probably show high values. However, in this scenario, the model is unlikely to extend well to novel data, which is usually the entire point of developing a model and making predictions. By training in a fine tuned way against one dataset, we might have done nothing more than memorize that collection of values; or at least memorize a spurious pattern that exists in that particular sample data collection.\n", 91 | "\n", 92 | "To some extent (but not completely), overfitting is mitigated by larger dataset sizes.\n", 93 | "\n", 94 | "In contrast, if we choose a model that simply does not have the degree of detail necessary to represent the underlying real-world phenomenon, we get an **underfit** model. In this scenario, we *smooth too much* in our simplification of the data into a model.\n", 95 | "\n", 96 | "Some illustrations are useful." 
97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "from src.over_under_fit import doc, show" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "doc()" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "show()" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "The above example is for a regression, but the same concept applies to categorization or clustering problems. For example:" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "!touch src/__init__.py" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "from src.over_under_fit import cluster" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "First let's look at a collection of points about which we have no *a priori* notion of their clustering." 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "# \"Cluster\" everything into just one category\n", 165 | "cluster(1)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "To the human eye, it would seem reasonable to guess that this represents three categories of observations. Therefore, we can reasonably say that this data is **underfit** by our clustering model. Indeed, that would also be true if we guessed there were two clusters." 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "# Guess there might be two categories\n", 182 | "cluster(2)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "This model is not terrible, and it indeed seems to identify an important difference in the data. But looking at the base-line known values for the categories, we can see it really is three types:" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "# Show the \"known true\" categories\n", 199 | "cluster(1, known=True)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "If we cluster into three categories algorithmically, we almost (but not quite) recover the underlying truth. The algorithm puts the categories in arbitrary order, so the colors are rotated; but you can see that most (but not all) of the points are in the same clusters." 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "cluster(3)" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "Moving farther along, if we guessed *more* clusters we would start to **overfit** the data, and impute category distinctions that do not exist in the underlying dataset. In this case we know the true number because we have specifically generated it as such. 
In real-world data we usually do not know this in advance, so we can only tell by performing various validations on the strength of the fit." 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "# Guess there might be 5 categories\n", 232 | "cluster(5)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "# Guess there might be 15 categories\n", 242 | "cluster(15)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "## Dimensionality Reduction\n", 250 | "\n", 251 | "Dimensionality reduction is most often a technique used to assist with other techniques. By reducing a large number of features to relatively few, other techniques very often become more successful when applied to these transformed synthetic features. Sometimes the dimensionality reduction itself is sufficient to identify the \"main gist\" of your data." 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "## Feature Engineering\n", 259 | "\n", 260 | "Very often, the \"features\" we are given in our original data are not those that will prove most useful in our final analysis. It is often necessary to identify \"the data inside the data.\" Sometimes feature engineering can be as simple as normalizing the distribution of values. Other times it can involve creating synthetic features out of two or more raw features." 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "## Feature Selection\n", 268 | "\n", 269 | "Often, your raw data contains some features with little to no predictive or analytic value. Identifying and excluding irrelevant features often improves the quality of a model." 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "## Categorical versus Ordinal versus Continuous Variables\n", 277 | "\n", 278 | "Features come in one of three basic types.\n", 279 | "\n", 280 | "### Categorical variables \n", 281 | "\n", 282 | "Some are **categorical** (also called nominal): A discrete set of values that a feature may assume, often named by words or codes (but sometimes confusingly as integers where an order may be misleadingly implied).\n", 283 | "\n", 284 | "### Ordinal variables\n", 285 | "\n", 286 | "Some are **ordinal**: There is a scale from low to high in the data values, but the spacing in the data may have little to no relationship to the underlying phenomenon. For example, while an airline or credit card \"reward program\" might have levels of Gold/Silver/Platinum/Diamond, there is probably no real sense in which Diamond is \"4 times as much\" as Gold, even though they are encoded as 1-4.\n", 287 | "\n", 288 | "### Continuous variables\n", 289 | "\n", 290 | "Some are **continuous** or quantitative: Some quantity is actually measured such that a number represents the amount of it. The distribution of these measurements is likely not to be uniform and linear (in which case scaling might be relevant), but there is a real thing being measured. Measurements might be quantized for continuous variables, but that does not necessarily make them ordinal instead. 
For example, we might measure annual rainfall in each town only to the nearest inch, and hence have integers for that feature.\n", 291 | "\n", 292 | "This notion of types of variables applies to statistics broadly. Some other concepts are genuinely specific to machine learning. " 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": {}, 298 | "source": [ 299 | "## One-hot Encoding\n", 300 | "\n", 301 | "For many machine learning algorithms, including neural networks, it is more useful to have a categorical feature with N possible values encoded as N features, each taking a binary value. Several tools, including a couple functions in scikit-learn will transform raw datasets into this format. Obviously, by encoding this way, dimensionality is increased.\n", 302 | "\n", 303 | "Let us illustrate using a toy test dataset. The following whimsical data is suggested in a blog post by [Håkon Hapnes Strand](https://www.quora.com/What-is-one-hot-encoding-and-when-is-it-used-in-data-science). Imagine we collected some data on individual organisms—namely taxonomic class, height, and lifespan. Depending on our purpose, we might use this data for either supervised or unsupervised learning techniques (if we had a lot more observations, and a number more features)." 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "data= [\n", 313 | " ['human', 1.7, 85],\n", 314 | " ['alien', 1.8, 92],\n", 315 | " ['penguin', 1.2, 37],\n", 316 | " ['octopus', 2.3, 25],\n", 317 | " ['alien', 1.7, 85],\n", 318 | " ['human', 1.2, 37],\n", 319 | " ['octopus', 0.4, 8],\n", 320 | " ['human', 2.0, 97]\n", 321 | "]" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "# The data with its original feature, just as a DataFrame\n", 331 | "import pandas as pd\n", 332 | "naive = pd.DataFrame(data, columns=['species', 'height (M)', 'lifespan (years)'])\n", 333 | "naive" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "# The data one-hot encoded\n", 343 | "encoded = pd.get_dummies(naive)\n", 344 | "encoded.columns = [c.replace('species_','') for c in encoded.columns]\n", 345 | "encoded" 346 | ] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "metadata": {}, 351 | "source": [ 352 | "## Hyperparameters\n", 353 | "\n", 354 | "The notion of parameters was introduced to define the way in which a model was trained. For neural networks, parameters are the weights of all the connections between the neurons. But in other models a similar parameterization exists. For example, in a basic linear regression, the coefficients in each dimension are parameters of the trained/fitted model.\n", 355 | "\n", 356 | "However, many algorithms used in machine learning take \"hyperparameters\" that tune how the training itself occurs. These may be cut-off values where a \"good enough\" estimate is obtained, for example. Or there may be hidden terms in an underlying equation that can be set. Or an algorithm may actually be a family of closely related algorithms, and a hyperparameter chooses among them. Models in scikit-learn typically have a number of hyperparameters to set before they are trained (with \"sensible\" defaults when you do not specify)." 
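Since the hyperparameter discussion above is prose only at this point, a minimal, hedged sketch may help make it concrete (the particular values below are illustrative, not tuned recommendations): hyperparameters are supplied when an estimator is constructed, before any training happens, and `get_params()` reveals the full set, including the "sensible" defaults you did not specify.

```python
# Hedged sketch: hyperparameters are set at construction time, before fitting.
from sklearn.svm import SVC

clf = SVC(kernel="rbf", C=10, gamma=0.1)   # illustrative, untuned values
print(clf.get_params())                    # includes defaults we left alone
```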
357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": {}, 362 | "source": [ 363 | "## Grid Search\n", 364 | "\n", 365 | "While scikit-learn usually provides \"sensible\" defaults for hyperparameters, which hyperparameters are most effective often depends a great deal on the specific domain and dataset. An API is provided to search across the combinatorial space of hyperparameter values and evaluate each collection." 366 | ] 367 | }, 368 | { 369 | "cell_type": "markdown", 370 | "metadata": {}, 371 | "source": [ 372 | "## Metrics\n", 373 | "\n", 374 | "After you have trained a model, the big question is \"how good\" the model is. There is a lot of nuance to answering that question, and correspondingly a large number of measures and techniques.\n", 375 | "\n", 376 | "One common technique to look at a combination of successes and failures in a machine learning model is a *confusion matrix*. Let us look at an example, picking up the whimsical data used above. Suppose we wanted to guess the taxonomic class of an observed organism and our model had these results:\n", 377 | "\n", 378 | "| Predict/Actual | Human | Octopus | Penguin |\n", 379 | "|----------------|----------|----------|----------|\n", 380 | "| Human | **5** | 0 | 2 |\n", 381 | "| Octopus | 3 | **3** | 3 |\n", 382 | "| Penguin | 0 | 1 | **11** |\n", 383 | "\n", 384 | "It is not immediately obvious how to give a single number describing *how good* this model is. The model is very good at predicting penguins, but it does rather badly when it predicts octopi. In fact, if the model predicts something is an octopus, it probably isn't (only a third of such predictions are accurate)." 385 | ] 386 | }, 387 | { 388 | "cell_type": "markdown", 389 | "metadata": {}, 390 | "source": [ 391 | "### Accuracy versus Precision versus Recall\n", 392 | "\n", 393 | "Naïvely, we might simply ask about the \"accuracy\" of a model (at least for classification tasks). This is simply the number of *right* answers divided by the number of data points. In our example, we have 28 observations of organisms, and 19 were classified accurately, so that's a **68%** accuracy. Again though, the accuracy varies quite a lot if we restrict it to just one class of the predictions. For our multi-class labels, this may not be a bad measure. \n", 394 | "\n", 395 | "Consider a binary problem though:\n", 396 | "\n", 397 | "| Predict/Actual | Positive | Negative |\n", 398 | "|----------------|----------|----------|\n", 399 | "| Positive | 1 | 0 |\n", 400 | "| Negative | 2 | 997 | \n", 401 | "\n", 402 | "Calculating *accuracy*, we find that this model is **99.8%** accurate! That seems pretty good until you think of this test as a medical screening for a fatal disease. *Two thirds of the people who actually have the disease will be judged free of it by this model* (and hence perhaps not be treated for the condition); that isn't such a happy real-world result.\n", 403 | "\n", 404 | "
\n", 405 | "\n", 406 | "In contrast with accuracy, the \"precision\" of a model is defined as:\n", 407 | "\n", 408 | "$$\\text{Precision} = \\frac{true\\: positive}{true\\: positive + false\\: positive}$$\n", 409 | "\n", 410 | "Generalizing that to the multi-class case, the formula is as follows (for i being the index of the class):\n", 411 | "\n", 412 | "$$\\text{Precision}_{i} = \\cfrac{M_{ii}}{\\sum_i M_{ij}}$$\n", 413 | "\n", 414 | "Applying that to our hypothetical medical screening, we get a a precision of **1.0**. We cannot do better than that. The problem is with \"recall\" which is defined as:\n", 415 | "\n", 416 | "$$\\text{Recall} = \\frac{true\\: positive}{true\\: positive + false\\: negative}$$\n", 417 | "\n", 418 | "Generalizing that to the multi-class case:\n", 419 | "\n", 420 | "$$\\text{Recall}_{i} = \\cfrac{M_{ii}}{\\sum_j M_{ij}}$$\n", 421 | "\n", 422 | "Here we do much worse by having a recall of **33.3%** in our medical diagnosis case! This is obviously a terrible result if we care about recall." 423 | ] 424 | }, 425 | { 426 | "cell_type": "markdown", 427 | "metadata": {}, 428 | "source": [ 429 | "### F1 Score\n", 430 | "\n", 431 | "There are several different algorithms that attempt to *blend* precision and recall to product a single \"score.\" Scikit-learn provides a number of other scalar scores that are useful for differing purposes (and other libraries are similar), but F1 score is one that is used very frequently. It is simply:\n", 432 | "\n", 433 | "$$\\text{F1} = 2 \\times \\cfrac{precision \\times recall}{precision + recall}$$\n", 434 | "\n", 435 | "Applying that to our medical diagnostic model, we get an F1 score of 50%. Still not good, but we account for the high precision to some extent. For intermediate cases, the F1 score provides good balance.\n", 436 | "\n", 437 | "F1 score can be generalized to multi-class models by averaging the F1 score across each class, counting only correct/incorrect per class." 438 | ] 439 | }, 440 | { 441 | "cell_type": "markdown", 442 | "metadata": {}, 443 | "source": [ 444 | "### Code Examples" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": null, 450 | "metadata": {}, 451 | "outputs": [], 452 | "source": [ 453 | "from sklearn.metrics import confusion_matrix\n", 454 | "import numpy as np\n", 455 | "\n", 456 | "y_true = [\"human\", \"octopus\", \"human\", \"human\", \"octopus\", \"penguin\", \"penguin\"]\n", 457 | "y_pred = [\"octopus\", \"octopus\", \"human\", \"human\", \"octopus\", \"human\", \"penguin\"]\n", 458 | "labels = ['octopus', 'penguin', 'human']" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": {}, 465 | "outputs": [], 466 | "source": [ 467 | "cm = confusion_matrix(y_true, y_pred, labels=labels)\n", 468 | "print(\"Confusion Matrix (predict/actual):\\n\", \n", 469 | " pd.DataFrame(cm, index=labels, columns=labels), sep=\"\")\n", 470 | "\n", 471 | "recall = np.diag(cm) / np.sum(cm, axis=1)\n", 472 | "print(\"\\nRecall:\\n\", pd.Series(recall, index=labels), sep=\"\")\n", 473 | "\n", 474 | "precision = np.diag(cm) / np.sum(cm, axis=0)\n", 475 | "print(\"\\nPrecision:\\n\", pd.Series(precision, index=labels), sep=\"\")\n", 476 | "\n", 477 | "print(\"\\nAccuracy:\\n\", np.sum(np.diag(cm)) / np.sum(cm))" 478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": {}, 483 | "source": [ 484 | "In this particular case, F1 score is very close to accuracy. 
In fact, using the \"micro\" averaging method reduces the result to accuracy. Using the \"macro\" averaging makes it equivalent to a NumPy reduction from the formula given." 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "metadata": {}, 491 | "outputs": [], 492 | "source": [ 493 | "from sklearn.metrics import f1_score\n", 494 | "weighted_f1 = f1_score(y_true, y_pred, average=\"weighted\")\n", 495 | "print(\"\\nF1 score:\\n\", weighted_f1, sep=\"\")" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": null, 501 | "metadata": {}, 502 | "outputs": [], 503 | "source": [ 504 | "print(\"Naive averaging F1 score:\", np.mean(2*(recall*precision)/(recall+precision)))\n", 505 | "print(\" sklearn macro averaging:\", f1_score(y_true, y_pred, average=\"macro\"))" 506 | ] 507 | }, 508 | { 509 | "cell_type": "markdown", 510 | "metadata": {}, 511 | "source": [ 512 | "## Next Lesson\n", 513 | "\n", 514 | "**Clustering**: In the first content notebook of this course, we will look at clustering algorithms available in scikit-learn and explore some general concepts around the goals and validation of clustering.\n", 515 | "\n", 516 | "" 517 | ] 518 | } 519 | ], 520 | "metadata": { 521 | "kernelspec": { 522 | "display_name": "Python 3", 523 | "language": "python", 524 | "name": "python3" 525 | }, 526 | "language_info": { 527 | "codemirror_mode": { 528 | "name": "ipython", 529 | "version": 3 530 | }, 531 | "file_extension": ".py", 532 | "mimetype": "text/x-python", 533 | "name": "python", 534 | "nbconvert_exporter": "python", 535 | "pygments_lexer": "ipython3", 536 | "version": "3.7.1" 537 | } 538 | }, 539 | "nbformat": 4, 540 | "nbformat_minor": 2 541 | } 542 | -------------------------------------------------------------------------------- /data/HumansLearning_X.csv: -------------------------------------------------------------------------------- 1 | ,Experience,Age,Education,Fav_lang_C++,Fav_lang_JavaScript,Fav_lang_MATLAB,Fav_lang_Python,Fav_lang_R,Fav_lang_Scala,Fav_lang_Whitespace,Fav_movie_And Now for Something Completely Different,Fav_movie_Monty Python Live at the Hollywood Bowl,Fav_movie_Monty Python and the Holy Grail,Fav_movie_Monty Python's Life of Brian,Fav_movie_Monty Python's The Meaning of Life,Fav_movie_Time Bandits,Sklearn_Nope.,Sklearn_Yep!,Humans_Machines_Skynet is a WINNER!,Humans_Machines_Team Humans!,Fav_Game_Chess,Fav_Game_Go,Fav_Game_Longing for the sweet release of death,"Fav_Game_Tic-tac-toe (Br. Eng. 
""noughts and crosses"")" 2 | 0,20.0,53,12,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,1 3 | 1,4.0,33,5,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0 4 | 2,1.0,31,10,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0 5 | 3,12.0,60,10,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1 6 | 4,7.0,48,6,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0 7 | 5,3.0,32,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1,0 8 | 6,3.0,34,10,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0 9 | 7,5.0,24,6,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0,0,0 10 | 8,4.0,34,10,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0 11 | 9,0.0,32,5,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0 12 | 10,2.0,46,4,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0 13 | 11,1.0,27,6,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0 14 | 12,0.2,43,4,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0 15 | 13,5.0,32,7,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0 16 | 14,2.0,41,8,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,1,0,1,0,0,0 17 | 15,7.0,28,6,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,0 18 | 16,4.0,47,4,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,0 19 | 17,2.0,27,4,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0 20 | 18,4.0,37,4,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0,0,0 21 | 19,2.0,60,4,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,1 22 | 20,2.0,41,6,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0 23 | 21,3.0,27,5,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0 24 | 22,2.0,40,5,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0 25 | 23,4.0,29,10,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,1 26 | 24,3.0,28,2,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0 27 | 25,2.0,36,5,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0 28 | 26,1.0,28,4,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,1 29 | 27,2.0,46,8,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0 30 | 28,3.0,31,4,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0 31 | 29,27.0,63,8,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0 32 | 30,10.0,99,7,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1 33 | 31,3.0,48,6,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0 34 | 32,3.0,24,4,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0 35 | 33,4.0,28,6,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0 36 | 34,4.0,25,7,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0 37 | 35,3.0,35,4,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0 38 | 36,2.0,46,4,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0 39 | 37,4.0,22,3,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0 40 | 38,1.0,22,4,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0 41 | 39,2.0,48,12,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,1 42 | 40,8.0,33,10,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0 43 | 41,2.0,41,9,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,1,0 44 | 42,1.0,31,10,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,1 45 | 43,1.0,39,7,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0 46 | 44,7.0,24,10,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0 47 | 45,1.0,31,10,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0 48 | 46,3.0,28,6,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,1,0,1,0,0,0 49 | 47,2.0,40,4,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,0 50 | 48,1.0,36,10,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0 51 | 49,1.0,42,5,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1,0,1,0,0,0 52 | 50,25.0,25,6,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0,0 53 | 51,0.0,37,4,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,1,0,0,0 54 | 52,1.0,36,4,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0 55 | 53,10.0,3,10,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0,0,0 56 | 54,2.0,46,4,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,1,0,0,0 57 | 55,12.0,30,6,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,0 58 | 56,2.0,46,8,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0 59 | 57,6.0,49,10,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0 60 | 58,2.0,33,2,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0 61 | 
59,2.0,60,4,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,1 62 | 60,12.0,46,10,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0 63 | 61,1.0,36,4,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,0 64 | 62,8.0,38,7,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0 65 | 63,6.0,51,6,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1 66 | 64,1.0,28,4,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,1 67 | 65,4.0,32,5,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0 68 | 66,2.0,25,5,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0 69 | 67,5.0,31,4,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0 70 | 68,1.0,24,4,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0 71 | 69,5.0,53,6,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0 72 | 70,5.0,27,4,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0 73 | 71,0.0,34,7,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,1,0,0,0 74 | 72,1.0,36,6,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,1,0,0,0 75 | 73,3.0,20,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,0 76 | 74,1.0,27,10,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0 77 | 75,3.0,30,6,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,0 78 | 76,5.0,39,10,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0 79 | 77,0.0,37,12,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1,1,0,0,0 80 | 78,5.0,30,4,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0 81 | 79,1.0,32,3,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0 82 | 80,8.0,52,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0,0,0 83 | 81,7.0,12,-10,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,1,0 84 | 82,3.0,29,6,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0 85 | 83,7.0,37,10,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,0 86 | 84,5.0,34,3,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,1 87 | 85,3.0,3,10,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0 88 | 86,2.0,32,9,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1,0,1,0,0 89 | 87,0.0,31,10,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0 90 | 88,2.0,27,4,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1 91 | 89,12.0,46,10,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0 92 | 90,4.0,51,8,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0 93 | 91,0.0,41,8,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,0 94 | 92,1.0,40,8,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0 95 | 93,4.0,40,5,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0 96 | 94,2.0,39,8,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1 97 | 95,1.0,57,10,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0 98 | 96,0.5,23,4,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,1,0,1,0,0 99 | 97,2.0,23,4,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,1,0,0,0 100 | 98,8.0,41,5,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0 101 | 99,1.0,27,4,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0 102 | 100,1.0,47,6,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1 103 | 101,5.0,49,4,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0 104 | 102,2.0,32,4,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0 105 | 103,1.0,26,6,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1,0 106 | 104,1.0,26,4,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,1,0,0,0 107 | 105,1.0,29,6,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0 108 | 106,3.0,30,6,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0,0,0 109 | 107,0.5,52,6,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0 110 | 108,7.0,26,4,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0 111 | 109,0.5,27,4,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0 112 | 110,3.0,30,2,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1,1,0,0,0 113 | 111,1.0,34,6,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0 114 | 112,4.0,35,6,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0 115 | 113,3.0,44,6,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,0 116 | 114,25.0,76,23,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1,1,0,0,0 117 | 115,25.0,75,12,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,1,0 118 | -------------------------------------------------------------------------------- /data/HumansLearning_y.csv: 
-------------------------------------------------------------------------------- 1 | True 2 | True 3 | True 4 | False 5 | True 6 | True 7 | False 8 | False 9 | True 10 | False 11 | False 12 | False 13 | False 14 | False 15 | False 16 | True 17 | True 18 | True 19 | True 20 | False 21 | True 22 | True 23 | False 24 | True 25 | True 26 | True 27 | True 28 | True 29 | True 30 | True 31 | False 32 | True 33 | False 34 | True 35 | True 36 | False 37 | False 38 | True 39 | True 40 | True 41 | True 42 | True 43 | False 44 | False 45 | False 46 | False 47 | False 48 | False 49 | True 50 | False 51 | False 52 | True 53 | False 54 | True 55 | True 56 | True 57 | True 58 | True 59 | False 60 | False 61 | True 62 | True 63 | True 64 | True 65 | False 66 | True 67 | True 68 | False 69 | True 70 | True 71 | False 72 | False 73 | False 74 | False 75 | False 76 | False 77 | False 78 | False 79 | True 80 | True 81 | True 82 | False 83 | True 84 | False 85 | True 86 | False 87 | False 88 | False 89 | True 90 | False 91 | True 92 | True 93 | True 94 | True 95 | False 96 | False 97 | True 98 | False 99 | False 100 | True 101 | False 102 | False 103 | False 104 | True 105 | True 106 | True 107 | False 108 | False 109 | False 110 | True 111 | False 112 | False 113 | False 114 | True 115 | False 116 | True 117 | -------------------------------------------------------------------------------- /data/Learning about Humans learning ML.csv: -------------------------------------------------------------------------------- 1 | Timestamp,Favorite programming language,Favorite Monty Python movie,Years of Python experience,Have used Scikit-learn,Age,"In the Terminator franchise, did you root for the humans or the machines?",Which is the better game?,Years of post-secondary education (e.g. BA=4; Ph.D.=10),How successful has this tutorial been so far? 2 | 4/8/2018 8:34:08,Python,Monty Python's Life of Brian,20,Yep!,53,Skynet is a WINNER!,"Tic-tac-toe (Br. Eng. ""noughts and crosses"")",12,8 3 | 4/8/2018 9:57:15,Python,Monty Python and the Holy Grail,4,Yep!,33,Team Humans!,Chess,5,9 4 | 4/8/2018 9:57:35,Python,Monty Python and the Holy Grail,1,Yep!,31,Team Humans!,Chess,10,10 5 | 4/8/2018 9:57:58,Python,Monty Python and the Holy Grail,12,Yep!,60,Team Humans!,"Tic-tac-toe (Br. Eng. 
""noughts and crosses"")",10,6 6 | 4/8/2018 9:58:05,Python,Monty Python's The Meaning of Life,7,Yep!,48,Team Humans!,Go,6,10 7 | 4/8/2018 9:58:07,Python,Monty Python's Life of Brian,3,Yep!,32,Team Humans!,Longing for the sweet release of death,0,10 8 | 4/8/2018 9:59:58,Python,Monty Python and the Holy Grail,3,Yep!,34,Team Humans!,Chess,10,4 9 | 4/8/2018 10:01:11,Python,Monty Python's The Meaning of Life,5,Yep!,24,Team Humans!,Chess,6,3 10 | 4/8/2018 10:02:51,Python,Monty Python and the Holy Grail,4,Yep!,34,Team Humans!,Chess,10,10 11 | 4/8/2018 10:04:37,R,Monty Python and the Holy Grail,0,Yep!,32,Team Humans!,Chess,5,4 12 | 4/8/2018 10:10:25,Python,Monty Python and the Holy Grail,2,Yep!,46,Team Humans!,Chess,4,3 13 | 4/8/2018 10:11:14,Python,And Now for Something Completely Different,1,Yep!,27,Team Humans!,Longing for the sweet release of death,6,7 14 | 4/8/2018 10:13:49,Python,Monty Python and the Holy Grail,0.2,Yep!,43,Skynet is a WINNER!,Chess,4,3 15 | 4/8/2018 13:05:00,Python,Monty Python and the Holy Grail,5,Nope.,32,Team Humans!,Chess,7,7 16 | 4/8/2018 10:20:32,Python,And Now for Something Completely Different,2,Yep!,41,Skynet is a WINNER!,Chess,8,4 17 | 4/8/2018 10:21:07,Python,Monty Python's Life of Brian,7,Yep!,28,Team Humans!,Chess,6,8 18 | 4/8/2018 10:21:35,Python,Monty Python's Life of Brian,4,Yep!,47,Team Humans!,Chess,4,10 19 | 4/8/2018 10:21:48,Python,Monty Python's Life of Brian,2,Yep!,27,Skynet is a WINNER!,Go,4,8 20 | 4/8/2018 10:21:54,Python,Monty Python's The Meaning of Life,4,Yep!,37,Team Humans!,Chess,4,10 21 | 4/8/2018 10:21:57,Python,Time Bandits,2,Yep!,60,Skynet is a WINNER!,"Tic-tac-toe (Br. Eng. ""noughts and crosses"")",4,5 22 | 4/8/2018 10:23:06,Python,Monty Python and the Holy Grail,2,Yep!,41,Team Humans!,Chess,6,8 23 | 4/8/2018 10:23:06,Python,Monty Python and the Holy Grail,3,Yep!,27,Team Humans!,Longing for the sweet release of death,BA=5,9 24 | 4/8/2018 10:23:24,Python,Monty Python and the Holy Grail,2,Yep!,40,Team Humans!,Chess,5,4 25 | 4/8/2018 10:27:05,Python,Monty Python and the Holy Grail,4,Yep!,29,Skynet is a WINNER!,"Tic-tac-toe (Br. Eng. ""noughts and crosses"")",10,10 26 | 4/8/2018 10:27:23,Python,Monty Python and the Holy Grail,3,Yep!,28,Team Humans!,Chess,2,10 27 | 4/8/2018 10:35:52,R,Monty Python and the Holy Grail,2,Yep!,36,Skynet is a WINNER!,Chess,5,8 28 | 4/8/2018 11:06:09,Python,Monty Python and the Holy Grail,1,Yep!,28,Skynet is a WINNER!,"Tic-tac-toe (Br. Eng. ""noughts and crosses"")",BA = 4,9 29 | 4/8/2018 11:11:09,R,Time Bandits,2,Nope.,46,Team Humans!,Chess,8,8 30 | 4/8/2018 11:39:30,Python,Monty Python and the Holy Grail,3,Nope.,31,Team Humans!,Chess,4,8 31 | 4/8/2018 12:05:16,Python,Monty Python and the Holy Grail,27,Yep!,63,Team Humans!,Chess,8,9 32 | 4/8/2018 12:12:14,Python,Monty Python's The Meaning of Life,10,Yep!,99,Team Humans!,"Tic-tac-toe (Br. Eng. 
""noughts and crosses"")",7,5 33 | 4/8/2018 12:13:46,Python,Monty Python's Life of Brian,3,Nope.,48,Skynet is a WINNER!,Longing for the sweet release of death,6,10 34 | 4/8/2018 12:47:20,R,Monty Python and the Holy Grail,3,Yep!,24,Team Humans!,Longing for the sweet release of death,4,6 35 | 4/8/2018 12:47:51,Python,Monty Python and the Holy Grail,4,Yep!,28,Team Humans!,Longing for the sweet release of death,6,8 36 | 4/8/2018 12:52:07,Python,Monty Python and the Holy Grail,4,Yep!,25,Team Humans!,Go,7,8 37 | 4/8/2018 12:56:57,R,Monty Python and the Holy Grail,3,Yep!,35,Team Humans!,Chess,4,5 38 | 4/8/2018 13:04:58,Python,Monty Python and the Holy Grail,2,Yep!,46,Team Humans!,Chess,4,3 39 | 4/8/2018 13:05:10,Python,Monty Python and the Holy Grail,4,Yep!,22,Team Humans!,Longing for the sweet release of death,3,10 40 | 4/8/2018 13:05:10,Python,Monty Python and the Holy Grail,1,Nope.,22,Team Humans!,Longing for the sweet release of death,4,8 41 | 4/8/2018 13:05:13,Python,Monty Python's The Meaning of Life,2,Yep!,48,Skynet is a WINNER!,"Tic-tac-toe (Br. Eng. ""noughts and crosses"")",12,9 42 | 4/8/2018 13:05:14,Python,Monty Python and the Holy Grail,8,Nope.,33,Team Humans!,Chess,10,10 43 | 4/8/2018 13:05:18,Python,Monty Python's The Meaning of Life,2,Yep!,41,Skynet is a WINNER!,Longing for the sweet release of death,9,8 44 | 4/8/2018 13:05:27,MATLAB,Time Bandits,1,Nope.,31,Team Humans!,"Tic-tac-toe (Br. Eng. ""noughts and crosses"")",10,7 45 | 4/8/2018 13:05:32,Python,Monty Python and the Holy Grail,1,Nope.,39,Team Humans!,Longing for the sweet release of death,7,7 46 | 4/8/2018 13:05:36,Python,Monty Python and the Holy Grail,7,Yep!,24,Team Humans!,Chess,10,5 47 | 4/8/2018 13:05:43,Python,Monty Python and the Holy Grail,1,Yep!,31,Skynet is a WINNER!,Chess,10,3 48 | 4/8/2018 13:05:47,R,And Now for Something Completely Different,3,Yep!,28,Skynet is a WINNER!,Chess,6,7 49 | 4/8/2018 13:05:48,Python,Monty Python's Life of Brian,2,Yep!,40,Team Humans!,Chess,4,3 50 | 4/8/2018 13:07:24,MATLAB,Monty Python and the Holy Grail,1,Nope.,36,Team Humans!,Chess,10,8 51 | 4/8/2018 13:05:57,Scala,Monty Python's The Meaning of Life,1,Nope.,42,Skynet is a WINNER!,Chess,5,1 52 | 4/8/2018 13:05:59,Python,And Now for Something Completely Different,25,Nope.,25,Skynet is a WINNER!,Chess,6,7 53 | 4/8/2018 13:06:02,R,Monty Python's The Meaning of Life,0,Yep!,37,Team Humans!,Chess,4,8 54 | 4/8/2018 13:06:02,Python,Monty Python and the Holy Grail,1,Yep!,36,Team Humans!,Chess,4,6 55 | 4/8/2018 13:06:02,Python,Monty Python's The Meaning of Life,10,Yep!,3,Team Humans!,Chess,10,9 56 | 4/8/2018 13:06:05,Python,Time Bandits,2,Nope.,46,Skynet is a WINNER!,Chess,4,9 57 | 4/8/2018 13:06:05,Python,Monty Python's Life of Brian,12,Yep!,30,Team Humans!,Chess,6,9 58 | 4/8/2018 13:08:58,R,Time Bandits,2,Nope.,46,Team Humans!,Chess,8,10 59 | 4/8/2018 13:06:08,Python,Monty Python's The Meaning of Life,6,Yep!,49,Team Humans!,Go,10,8 60 | 4/8/2018 13:06:08,Python,Monty Python and the Holy Grail,2,Yep!,33,Team Humans!,Chess,2,7 61 | 4/8/2018 13:06:09,Python,Time Bandits,2,Yep!,60,Skynet is a WINNER!,"Tic-tac-toe (Br. Eng. ""noughts and crosses"")",4,3 62 | 4/8/2018 13:06:13,Python,Monty Python and the Holy Grail,12,Nope.,46,Team Humans!,Go,10,8 63 | 4/8/2018 13:06:13,Python,Monty Python's Life of Brian,1,Yep!,36,Team Humans!,Chess,4,8 64 | 4/8/2018 13:06:14,C++,Monty Python and the Holy Grail,8,Nope.,38,Team Humans!,Chess,7,8 65 | 4/8/2018 13:06:20,Python,Monty Python's The Meaning of Life,6,Yep!,51,Team Humans!,"Tic-tac-toe (Br. Eng. 
""noughts and crosses"")",6,8 66 | 4/8/2018 13:06:20,Python,Monty Python's The Meaning of Life,1,Nope.,28,Skynet is a WINNER!,"Tic-tac-toe (Br. Eng. ""noughts and crosses"")",4,7 67 | 4/8/2018 13:06:20,Python,Monty Python and the Holy Grail,4,Nope.,32,Skynet is a WINNER!,Chess,5,8 68 | 4/8/2018 13:06:20,Python,Monty Python and the Holy Grail,2,Yep!,25,Team Humans!,Chess,5,8 69 | 4/8/2018 13:06:21,Python,And Now for Something Completely Different,5,Yep!,31,Team Humans!,Longing for the sweet release of death,4,7 70 | 4/8/2018 13:06:22,Python,Monty Python and the Holy Grail,1,Yep!,24,Team Humans!,Chess,4,8 71 | 4/8/2018 13:06:22,Python,Monty Python and the Holy Grail,5,Nope.,53,Team Humans!,Longing for the sweet release of death,6,8 72 | 4/8/2018 13:06:23,Python,Monty Python and the Holy Grail,5,Yep!,27,Skynet is a WINNER!,Chess,4,7 73 | 4/8/2018 13:06:25,R,And Now for Something Completely Different,0,Nope.,34,Team Humans!,Chess,7,4 74 | 4/8/2018 13:06:26,MATLAB,And Now for Something Completely Different,1,Yep!,36,Team Humans!,Chess,6,5 75 | 4/8/2018 13:06:31,Python,Time Bandits,3,Yep!,20,Team Humans!,Longing for the sweet release of death,0,5 76 | 4/8/2018 13:06:32,Python,And Now for Something Completely Different,1,Yep!,27,Team Humans!,Longing for the sweet release of death,10,5 77 | 4/8/2018 13:06:36,Python,Monty Python's The Meaning of Life,3,Nope.,30,Skynet is a WINNER!,Chess,6,4 78 | 4/8/2018 13:06:37,Python,Monty Python and the Holy Grail,5,Yep!,39,Team Humans!,Chess,10,4 79 | 4/8/2018 13:24:32,Python,And Now for Something Completely Different,0,Yep!,37,Team Humans!,Chess,12,7 80 | 4/8/2018 13:06:42,Python,Monty Python and the Holy Grail,5,Yep!,30,Team Humans!,Longing for the sweet release of death,4,10 81 | 4/8/2018 13:06:52,JavaScript,Monty Python and the Holy Grail,1,Nope.,32,Team Humans!,Chess,3,9 82 | 4/8/2018 13:06:53,Python,Monty Python's The Meaning of Life,8,Yep!,52,Team Humans!,Chess,0,8 83 | 4/8/2018 13:06:53,Whitespace,Monty Python's The Meaning of Life,7,Yep!,12,Team Humans!,Longing for the sweet release of death,-10,7 84 | 4/8/2018 13:06:53,Python,Monty Python and the Holy Grail,3,Yep!,29,Skynet is a WINNER!,Longing for the sweet release of death,6,8 85 | 4/8/2018 13:06:54,MATLAB,Time Bandits,7,Yep!,37,Team Humans!,Chess,10,4 86 | 4/8/2018 13:06:56,Python,Monty Python's Life of Brian,5,Yep!,34,Skynet is a WINNER!,"Tic-tac-toe (Br. Eng. ""noughts and crosses"")",3,10 87 | 4/8/2018 13:06:57,Python,Monty Python's The Meaning of Life,3,Nope.,3,Team Humans!,Chess,10,6 88 | 4/8/2018 13:07:06,Python,And Now for Something Completely Different,2,Yep!,32,Team Humans!,Go,9,6 89 | 4/8/2018 13:07:09,Python,Time Bandits,0,Nope.,31,Team Humans!,Chess,10,7 90 | 4/8/2018 13:07:10,R,Monty Python and the Holy Grail,2,Yep!,27,Team Humans!,"Tic-tac-toe (Br. Eng. ""noughts and crosses"")",4,8 91 | 4/8/2018 13:07:16,Python,Monty Python and the Holy Grail,12,Yep!,46,Team Humans!,Chess,10,7 92 | 4/8/2018 13:07:17,Python,Monty Python and the Holy Grail,4,Nope.,51,Team Humans!,Go,8,9 93 | 4/8/2018 13:07:18,Python,Monty Python Live at the Hollywood Bowl,0,Nope.,41,Team Humans!,Chess,8,8 94 | 4/8/2018 13:07:21,Python,Monty Python and the Holy Grail,1,Nope.,40,Team Humans!,Go,8,8 95 | 4/8/2018 13:07:24,Python,Monty Python's Life of Brian,4,Nope.,40,Skynet is a WINNER!,Chess,5,10 96 | 4/8/2018 13:07:26,Python,Monty Python and the Holy Grail,2,Yep!,39,Team Humans!,"Tic-tac-toe (Br. Eng. 
""noughts and crosses"")",8,4 97 | 4/8/2018 13:07:27,C++,Monty Python and the Holy Grail,1,Nope.,57,Team Humans!,Chess,10,7 98 | 4/8/2018 13:07:30,R,Time Bandits,0.5,Nope.,23,Team Humans!,Go,4,10 99 | 4/8/2018 13:07:31,Python,And Now for Something Completely Different,2,Nope.,23,Team Humans!,Chess,4,5 100 | 4/8/2018 13:07:40,Python,Monty Python and the Holy Grail,8,Nope.,41,Team Humans!,Chess,5,3 101 | 4/8/2018 13:07:42,Python,Monty Python and the Holy Grail,1,Yep!,27,Team Humans!,Go,4,8 102 | 4/8/2018 13:07:49,Python,Monty Python and the Holy Grail,1,Nope.,47,Team Humans!,"Tic-tac-toe (Br. Eng. ""noughts and crosses"")",6,7 103 | 4/8/2018 13:07:49,Python,Monty Python and the Holy Grail,5,Nope.,49,Team Humans!,Chess,4,7 104 | 4/8/2018 13:07:51,Python,Monty Python's The Meaning of Life,2,Nope.,32,Team Humans!,Chess,4,7 105 | 4/8/2018 13:08:02,R,Monty Python's Life of Brian,1,Yep!,26,Team Humans!,Longing for the sweet release of death,6,8 106 | 4/8/2018 13:08:15,Python,Time Bandits,1,Yep!,26,Skynet is a WINNER!,Chess,4,8 107 | 4/8/2018 13:08:15,Python,Monty Python and the Holy Grail,1,Yep!,29,Team Humans!,Chess,6,9 108 | 4/8/2018 13:08:30,Python,Monty Python's The Meaning of Life,3,Yep!,30,Team Humans!,Chess,6,5 109 | 4/8/2018 13:09:45,R,Monty Python and the Holy Grail,0.5,Nope.,52,Team Humans!,Chess,6,4 110 | 4/8/2018 13:10:55,Python,And Now for Something Completely Different,7,Yep!,26,Team Humans!,Longing for the sweet release of death,4,6 111 | 4/8/2018 13:11:08,Python,Monty Python and the Holy Grail,0.5,Yep!,27,Skynet is a WINNER!,Go,4,9 112 | 4/8/2018 13:11:18,Python,And Now for Something Completely Different,3,Yep!,30,Team Humans!,Chess,2,7 113 | 4/8/2018 13:12:05,Python,Monty Python and the Holy Grail,1,Nope.,34,Skynet is a WINNER!,Chess,6,6 114 | 4/8/2018 13:12:11,Python,Monty Python's The Meaning of Life,4,Yep!,35,Team Humans!,Go,6,5 115 | 4/8/2018 13:17:36,Python,Time Bandits,3,Yep!,44,Team Humans!,Chess,6,9 116 | 4/8/2018 13:25:43,Python,And Now for Something Completely Different,25,Yep!,76,Team Humans!,Chess,23,1 117 | 4/10/2018 10:37:16,Python,Monty Python's The Meaning of Life,25,Yep!,75,Skynet is a WINNER!,Longing for the sweet release of death,12,10 -------------------------------------------------------------------------------- /data/cancer-pipeline.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidMertz/ML-Live-Intermediate/a30e1ca84fdab4b84a426a46bb99b20deeec03df/data/cancer-pipeline.pkl -------------------------------------------------------------------------------- /data/pets.csv: -------------------------------------------------------------------------------- 1 | species,sex 2 | cat,M 3 | dog,F 4 | fish,M 5 | dog,M 6 | dog,F 7 | cat,M 8 | fish,M 9 | fish,F 10 | -------------------------------------------------------------------------------- /environment-nographviz.yml: -------------------------------------------------------------------------------- 1 | name: Pearson-ML 2 | channels: 3 | - intel 4 | - conda-forge 5 | dependencies: 6 | - python=3.7 7 | - scikit-learn 8 | - hdbscan 9 | - pandas 10 | - numpy 11 | - matplotlib 12 | - jupyter 13 | - seaborn 14 | - pillow 15 | - pyamg 16 | - jupyterlab 17 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: Pearson-ML 2 | channels: 3 | - intel 4 | - conda-forge 5 | dependencies: 6 | - python=3.7 7 | - scikit-learn 8 
| - hdbscan 9 | - pandas 10 | - numpy 11 | - matplotlib 12 | - jupyter 13 | - seaborn 14 | - pillow 15 | - graphviz 16 | - pygraphviz 17 | - pyamg 18 | - jupyterlab 19 | -------------------------------------------------------------------------------- /img/Anscombe_quartet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidMertz/ML-Live-Intermediate/a30e1ca84fdab4b84a426a46bb99b20deeec03df/img/Anscombe_quartet.png -------------------------------------------------------------------------------- /img/DBSCAN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidMertz/ML-Live-Intermediate/a30e1ca84fdab4b84a426a46bb99b20deeec03df/img/DBSCAN.png -------------------------------------------------------------------------------- /img/DBSCAN.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidMertz/ML-Live-Intermediate/a30e1ca84fdab4b84a426a46bb99b20deeec03df/img/DBSCAN.webp -------------------------------------------------------------------------------- /img/DataDino-600x455.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidMertz/ML-Live-Intermediate/a30e1ca84fdab4b84a426a46bb99b20deeec03df/img/DataDino-600x455.gif -------------------------------------------------------------------------------- /img/KFold.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidMertz/ML-Live-Intermediate/a30e1ca84fdab4b84a426a46bb99b20deeec03df/img/KFold.png -------------------------------------------------------------------------------- /img/L1_and_L2_balls.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidMertz/ML-Live-Intermediate/a30e1ca84fdab4b84a426a46bb99b20deeec03df/img/L1_and_L2_balls.png -------------------------------------------------------------------------------- /img/ML-Wikipedia.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidMertz/ML-Live-Intermediate/a30e1ca84fdab4b84a426a46bb99b20deeec03df/img/ML-Wikipedia.png -------------------------------------------------------------------------------- /img/README: -------------------------------------------------------------------------------- 1 | https://dabuttonfactory.com/ 2 | Open Sans 3 | Bold 4 | 16 pt 5 | Black text 6 | Fit to text 7 | Horiz padding: 10 8 | Vert padding: 6 9 | Fully curved sides 10 | Unicolored 11 | Color: #cfe2f3 12 | Bubble Effect 13 | Border 14 | Size: 1 15 | Color: black 16 | No Shadow 17 | 18 | -------------------------------------------------------------------------------- /img/basic-perceptron.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidMertz/ML-Live-Intermediate/a30e1ca84fdab4b84a426a46bb99b20deeec03df/img/basic-perceptron.png -------------------------------------------------------------------------------- /img/ca-clusters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidMertz/ML-Live-Intermediate/a30e1ca84fdab4b84a426a46bb99b20deeec03df/img/ca-clusters.png -------------------------------------------------------------------------------- /img/inception-v3.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidMertz/ML-Live-Intermediate/a30e1ca84fdab4b84a426a46bb99b20deeec03df/img/inception-v3.png -------------------------------------------------------------------------------- /img/open-notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidMertz/ML-Live-Intermediate/a30e1ca84fdab4b84a426a46bb99b20deeec03df/img/open-notebook.png -------------------------------------------------------------------------------- /img/pipeline-diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidMertz/ML-Live-Intermediate/a30e1ca84fdab4b84a426a46bb99b20deeec03df/img/pipeline-diagram.png -------------------------------------------------------------------------------- /img/ridge_regression_geomteric.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidMertz/ML-Live-Intermediate/a30e1ca84fdab4b84a426a46bb99b20deeec03df/img/ridge_regression_geomteric.png -------------------------------------------------------------------------------- /img/sklearn-topics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidMertz/ML-Live-Intermediate/a30e1ca84fdab4b84a426a46bb99b20deeec03df/img/sklearn-topics.png -------------------------------------------------------------------------------- /img/supervised_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidMertz/ML-Live-Intermediate/a30e1ca84fdab4b84a426a46bb99b20deeec03df/img/supervised_workflow.png -------------------------------------------------------------------------------- /img/train_test_split_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidMertz/ML-Live-Intermediate/a30e1ca84fdab4b84a426a46bb99b20deeec03df/img/train_test_split_matrix.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # For Python 3.6+ 2 | scikit-learn 3 | hdbscan 4 | pandas 5 | numpy 6 | matplotlib 7 | jupyter 8 | seaborn 9 | pillow 10 | pyamg 11 | 12 | # This is only the Python bindings for graphviz 13 | # Need to use system installers for graphviz itself 14 | graphviz 15 | 16 | 17 | -------------------------------------------------------------------------------- /src/.ipynb_checkpoints/classifier_comparison-checkpoint.py: -------------------------------------------------------------------------------- 1 | # Code source: Gaël Varoquaux 2 | # Andreas Müller 3 | # Modified for documentation by Jaques Grobler 4 | # License: BSD 3 clause 5 | 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | from matplotlib.colors import ListedColormap 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.preprocessing import StandardScaler 11 | from sklearn.datasets import make_moons, make_circles, make_classification 12 | from sklearn.neural_network import MLPClassifier 13 | from sklearn.neighbors import KNeighborsClassifier 14 | from sklearn.svm import SVC 15 | from sklearn.gaussian_process import GaussianProcessClassifier 16 | from sklearn.gaussian_process.kernels import RBF 17 | 
from sklearn.tree import DecisionTreeClassifier 18 | from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier 19 | from sklearn.naive_bayes import GaussianNB 20 | from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 21 | 22 | h = .02 # step size in the mesh 23 | 24 | names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process", 25 | "Decision Tree", "Random Forest", "Neural Net", "AdaBoost", 26 | "Naive Bayes", "QDA"] 27 | 28 | classifiers = [ 29 | KNeighborsClassifier(3), 30 | SVC(kernel="linear", C=0.025), 31 | SVC(gamma=2, C=1), 32 | GaussianProcessClassifier(1.0 * RBF(1.0)), 33 | DecisionTreeClassifier(max_depth=5), 34 | RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), 35 | MLPClassifier(alpha=1), 36 | AdaBoostClassifier(), 37 | GaussianNB(), 38 | QuadraticDiscriminantAnalysis()] 39 | 40 | X, y = make_classification(n_features=2, n_redundant=0, n_informative=2, 41 | random_state=1, n_clusters_per_class=1) 42 | rng = np.random.RandomState(2) 43 | X += 2 * rng.uniform(size=X.shape) 44 | linearly_separable = (X, y) 45 | 46 | datasets = [make_moons(noise=0.3, random_state=0), 47 | make_circles(noise=0.2, factor=0.5, random_state=1), 48 | linearly_separable 49 | ] 50 | 51 | figure = plt.figure(figsize=(27, 9)) 52 | i = 1 53 | # iterate over datasets 54 | for ds_cnt, ds in enumerate(datasets): 55 | # preprocess dataset, split into training and test part 56 | X, y = ds 57 | X = StandardScaler().fit_transform(X) 58 | X_train, X_test, y_train, y_test = \ 59 | train_test_split(X, y, test_size=.4, random_state=42) 60 | 61 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 62 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 63 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), 64 | np.arange(y_min, y_max, h)) 65 | 66 | # just plot the dataset first 67 | cm = plt.cm.RdBu 68 | cm_bright = ListedColormap(['#FF0000', '#0000FF']) 69 | ax = plt.subplot(len(datasets), len(classifiers) + 1, i) 70 | if ds_cnt == 0: 71 | ax.set_title("Input data") 72 | # Plot the training points 73 | ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, 74 | edgecolors='k') 75 | # Plot the testing points 76 | ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6, 77 | edgecolors='k') 78 | ax.set_xlim(xx.min(), xx.max()) 79 | ax.set_ylim(yy.min(), yy.max()) 80 | ax.set_xticks(()) 81 | ax.set_yticks(()) 82 | i += 1 83 | 84 | # iterate over classifiers 85 | for name, clf in zip(names, classifiers): 86 | ax = plt.subplot(len(datasets), len(classifiers) + 1, i) 87 | clf.fit(X_train, y_train) 88 | score = clf.score(X_test, y_test) 89 | 90 | # Plot the decision boundary. For that, we will assign a color to each 91 | # point in the mesh [x_min, x_max]x[y_min, y_max]. 
92 | if hasattr(clf, "decision_function"): 93 | Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) 94 | else: 95 | Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] 96 | 97 | # Put the result into a color plot 98 | Z = Z.reshape(xx.shape) 99 | ax.contourf(xx, yy, Z, cmap=cm, alpha=.8) 100 | 101 | # Plot the training points 102 | ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, 103 | edgecolors='k') 104 | # Plot the testing points 105 | ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, 106 | edgecolors='k', alpha=0.6) 107 | 108 | ax.set_xlim(xx.min(), xx.max()) 109 | ax.set_ylim(yy.min(), yy.max()) 110 | ax.set_xticks(()) 111 | ax.set_yticks(()) 112 | if ds_cnt == 0: 113 | ax.set_title(name) 114 | ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'), 115 | size=15, horizontalalignment='right') 116 | i += 1 117 | 118 | plt.tight_layout() 119 | plt.show() -------------------------------------------------------------------------------- /src/.ipynb_checkpoints/decisiontree_regressor-checkpoint.py: -------------------------------------------------------------------------------- 1 | # Import the necessary modules and libraries 2 | import numpy as np 3 | from sklearn.tree import DecisionTreeRegressor 4 | from sklearn.svm import SVR 5 | import matplotlib.pyplot as plt 6 | 7 | # Create a random dataset 8 | rng = np.random.RandomState(1) 9 | X = np.sort(5 * rng.rand(80, 1), axis=0) 10 | y = np.sin(X).ravel() 11 | y[::5] += 3 * (0.5 - rng.rand(16)) 12 | 13 | # Fit regression model 14 | regr_1 = DecisionTreeRegressor(max_depth=2) 15 | regr_2 = DecisionTreeRegressor(max_depth=5) 16 | regr_3 = SVR() 17 | regr_1.fit(X, y) 18 | regr_2.fit(X, y) 19 | regr_3.fit(X, y) 20 | 21 | # Predict 22 | X_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis] 23 | y_1 = regr_1.predict(X_test) 24 | y_2 = regr_2.predict(X_test) 25 | y_3 = regr_3.predict(X_test) 26 | 27 | # Plot the results 28 | plt.figure(figsize=(10,6)) 29 | plt.scatter(X, y, s=20, edgecolor="black", c="darkorange", label="data") 30 | plt.plot(X_test, y_1, color="cornflowerblue", label="max_depth=2", linewidth=2) 31 | plt.plot(X_test, y_2, color="yellowgreen", label="max_depth=5", linewidth=2) 32 | plt.plot(X_test, y_3, color="orangered", label="SVR", linewidth=2) 33 | plt.xlabel("data") 34 | plt.ylabel("target") 35 | plt.title("Decision Tree Regression (cf. 
SVR)") 36 | plt.legend() 37 | plt.show() 38 | -------------------------------------------------------------------------------- /src/.ipynb_checkpoints/linear_failure-checkpoint.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | np.random.seed(1) 3 | theta = np.random.random(1000) * tau 4 | r = np.random.random(1000) 5 | feat1 = np.sin(theta) * r 6 | feat2 = np.cos(theta) * r 7 | z = feat1 * feat2 8 | silly_df = pd.DataFrame({"feature_1": feat1, 9 | "feature_2": feat2, 10 | "TARGET": z }) 11 | silly_df.to_csv('../data/linear_failure.csv') -------------------------------------------------------------------------------- /src/.ipynb_checkpoints/plot_cluster_comparison-checkpoint.py: -------------------------------------------------------------------------------- 1 | """ 2 | ========================================================= 3 | Comparing different clustering algorithms on toy datasets 4 | ========================================================= 5 | 6 | This example shows characteristics of different 7 | clustering algorithms on datasets that are "interesting" 8 | but still in 2D. With the exception of the last dataset, 9 | the parameters of each of these dataset-algorithm pairs 10 | has been tuned to produce good clustering results. Some 11 | algorithms are more sensitive to parameter values than 12 | others. 13 | 14 | The last dataset is an example of a 'null' situation for 15 | clustering: the data is homogeneous, and there is no good 16 | clustering. For this example, the null dataset uses the 17 | same parameters as the dataset in the row above it, which 18 | represents a mismatch in the parameter values and the 19 | data structure. 20 | 21 | While these examples give some intuition about the 22 | algorithms, this intuition might not apply to very high 23 | dimensional data. 24 | """ 25 | print(__doc__) 26 | 27 | import time 28 | import warnings 29 | 30 | import numpy as np 31 | import matplotlib.pyplot as plt 32 | 33 | from sklearn import cluster, datasets, mixture 34 | from sklearn.neighbors import kneighbors_graph 35 | from sklearn.preprocessing import StandardScaler 36 | from itertools import cycle, islice 37 | 38 | np.random.seed(0) 39 | 40 | # ============ 41 | # Generate datasets. 
We choose the size big enough to see the scalability 42 | # of the algorithms, but not too big to avoid too long running times 43 | # ============ 44 | n_samples = 1500 45 | noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5, 46 | noise=.05) 47 | noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05) 48 | blobs = datasets.make_blobs(n_samples=n_samples, random_state=8) 49 | no_structure = np.random.rand(n_samples, 2), None 50 | 51 | # Anisotropicly distributed data 52 | random_state = 170 53 | X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state) 54 | transformation = [[0.6, -0.6], [-0.4, 0.8]] 55 | X_aniso = np.dot(X, transformation) 56 | aniso = (X_aniso, y) 57 | 58 | # blobs with varied variances 59 | varied = datasets.make_blobs(n_samples=n_samples, 60 | cluster_std=[1.0, 2.5, 0.5], 61 | random_state=random_state) 62 | 63 | # ============ 64 | # Set up cluster parameters 65 | # ============ 66 | plt.figure(figsize=(9 * 2 + 3, 12.5)) 67 | plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05, 68 | hspace=.01) 69 | 70 | plot_num = 1 71 | 72 | default_base = {'quantile': .3, 73 | 'eps': .3, 74 | 'damping': .9, 75 | 'preference': -200, 76 | 'n_neighbors': 10, 77 | 'n_clusters': 3} 78 | 79 | datasets = [ 80 | (noisy_circles, {'damping': .77, 'preference': -240, 81 | 'quantile': .2, 'n_clusters': 2}), 82 | (noisy_moons, {'damping': .75, 'preference': -220, 'n_clusters': 2}), 83 | (varied, {'eps': .18, 'n_neighbors': 2}), 84 | (aniso, {'eps': .15, 'n_neighbors': 2}), 85 | (blobs, {}), 86 | (no_structure, {})] 87 | 88 | for i_dataset, (dataset, algo_params) in enumerate(datasets): 89 | # update parameters with dataset-specific values 90 | params = default_base.copy() 91 | params.update(algo_params) 92 | 93 | X, y = dataset 94 | 95 | # normalize dataset for easier parameter selection 96 | X = StandardScaler().fit_transform(X) 97 | 98 | # estimate bandwidth for mean shift 99 | bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile']) 100 | 101 | # connectivity matrix for structured Ward 102 | connectivity = kneighbors_graph( 103 | X, n_neighbors=params['n_neighbors'], include_self=False) 104 | # make connectivity symmetric 105 | connectivity = 0.5 * (connectivity + connectivity.T) 106 | 107 | # ============ 108 | # Create cluster objects 109 | # ============ 110 | ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True) 111 | two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters']) 112 | ward = cluster.AgglomerativeClustering( 113 | n_clusters=params['n_clusters'], linkage='ward', 114 | connectivity=connectivity) 115 | spectral = cluster.SpectralClustering( 116 | n_clusters=params['n_clusters'], eigen_solver='arpack', 117 | affinity="nearest_neighbors") 118 | dbscan = cluster.DBSCAN(eps=params['eps']) 119 | affinity_propagation = cluster.AffinityPropagation( 120 | damping=params['damping'], preference=params['preference']) 121 | average_linkage = cluster.AgglomerativeClustering( 122 | linkage="average", affinity="cityblock", 123 | n_clusters=params['n_clusters'], connectivity=connectivity) 124 | birch = cluster.Birch(n_clusters=params['n_clusters']) 125 | gmm = mixture.GaussianMixture( 126 | n_components=params['n_clusters'], covariance_type='full') 127 | 128 | clustering_algorithms = ( 129 | ('MiniBatchKMeans', two_means), 130 | ('AffinityPropagation', affinity_propagation), 131 | ('MeanShift', ms), 132 | ('SpectralClustering', spectral), 133 | ('Ward', ward), 134 | ('AgglomerativeClustering', 
average_linkage), 135 | ('DBSCAN', dbscan), 136 | ('Birch', birch), 137 | ('GaussianMixture', gmm) 138 | ) 139 | 140 | for name, algorithm in clustering_algorithms: 141 | t0 = time.time() 142 | 143 | # catch warnings related to kneighbors_graph 144 | with warnings.catch_warnings(): 145 | warnings.filterwarnings( 146 | "ignore", 147 | message="the number of connected components of the " + 148 | "connectivity matrix is [0-9]{1,2}" + 149 | " > 1. Completing it to avoid stopping the tree early.", 150 | category=UserWarning) 151 | warnings.filterwarnings( 152 | "ignore", 153 | message="Graph is not fully connected, spectral embedding" + 154 | " may not work as expected.", 155 | category=UserWarning) 156 | algorithm.fit(X) 157 | 158 | t1 = time.time() 159 | if hasattr(algorithm, 'labels_'): 160 | y_pred = algorithm.labels_.astype(np.int) 161 | else: 162 | y_pred = algorithm.predict(X) 163 | 164 | plt.subplot(len(datasets), len(clustering_algorithms), plot_num) 165 | if i_dataset == 0: 166 | plt.title(name, size=18) 167 | 168 | colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a', 169 | '#f781bf', '#a65628', '#984ea3', 170 | '#999999', '#e41a1c', '#dede00']), 171 | int(max(y_pred) + 1)))) 172 | # add black color for outliers (if any) 173 | colors = np.append(colors, ["#000000"]) 174 | plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred]) 175 | 176 | plt.xlim(-2.5, 2.5) 177 | plt.ylim(-2.5, 2.5) 178 | plt.xticks(()) 179 | plt.yticks(()) 180 | plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'), 181 | transform=plt.gca().transAxes, size=15, 182 | horizontalalignment='right') 183 | plot_num += 1 184 | 185 | plt.show() 186 | -------------------------------------------------------------------------------- /src/.ipynb_checkpoints/time_regressors-checkpoint.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from sklearn import datasets 4 | california = datasets.california_housing.fetch_california_housing() 5 | X, y = california.data, california.target 6 | from sklearn.model_selection import train_test_split 7 | from time import time 8 | 9 | from sklearn.neighbors import KNeighborsRegressor 10 | from sklearn.linear_model import LinearRegression, RANSACRegressor 11 | from sklearn.gaussian_process import GaussianProcessRegressor 12 | from sklearn.svm import SVR 13 | from sklearn.svm import LinearSVR 14 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) # hold out a test set for the timing and scoring loop below 15 | regressors = [ 16 | LinearRegression(), 17 | RANSACRegressor(), 18 | KNeighborsRegressor(), 19 | KNeighborsRegressor(n_neighbors=9, metric='manhattan'), 20 | SVR(), 21 | LinearSVR(), 22 | SVR(kernel='linear'), # Cf.
LinearSVR: much slower, might be better or worse: 23 | GaussianProcessRegressor(), 24 | ] 25 | 26 | from sklearn.metrics import explained_variance_score 27 | from sklearn.metrics import mean_absolute_error 28 | from sklearn.metrics import r2_score 29 | 30 | for model in regressors: 31 | start = time() 32 | model.fit(X_train, y_train) 33 | train_time = time() - start 34 | start = time() 35 | predictions = model.predict(X_test) 36 | predict_time = time()-start 37 | print(model) 38 | print("\tTraining time: %0.3fs\n\tFitting time: %0.3fs" % (train_time, predict_time)) 39 | print("\tExplained variance:", explained_variance_score(y_test, predictions)) 40 | print("\tMean absolute error:", mean_absolute_error(y_test, predictions)) 41 | print("\tR2 score:", r2_score(y_test, predictions)) 42 | -------------------------------------------------------------------------------- /src/.ropeproject/config.py: -------------------------------------------------------------------------------- 1 | # The default ``config.py`` 2 | # flake8: noqa 3 | 4 | 5 | def set_prefs(prefs): 6 | """This function is called before opening the project""" 7 | 8 | # Specify which files and folders to ignore in the project. 9 | # Changes to ignored resources are not added to the history and 10 | # VCSs. Also they are not returned in `Project.get_files()`. 11 | # Note that ``?`` and ``*`` match all characters but slashes. 12 | # '*.pyc': matches 'test.pyc' and 'pkg/test.pyc' 13 | # 'mod*.pyc': matches 'test/mod1.pyc' but not 'mod/1.pyc' 14 | # '.svn': matches 'pkg/.svn' and all of its children 15 | # 'build/*.o': matches 'build/lib.o' but not 'build/sub/lib.o' 16 | # 'build//*.o': matches 'build/lib.o' and 'build/sub/lib.o' 17 | prefs['ignored_resources'] = [ 18 | '*.pyc', '*~', '.ropeproject', '.hg', '.svn', '_svn', 19 | '.git', '.tox', '.env', 'env', 'venv', 'node_modules', 20 | 'bower_components' 21 | ] 22 | 23 | # Specifies which files should be considered python files. It is 24 | # useful when you have scripts inside your project. Only files 25 | # ending with ``.py`` are considered to be python files by 26 | # default. 27 | #prefs['python_files'] = ['*.py'] 28 | 29 | # Custom source folders: By default rope searches the project 30 | # for finding source folders (folders that should be searched 31 | # for finding modules). You can add paths to that list. Note 32 | # that rope guesses project source folders correctly most of the 33 | # time; use this if you have any problems. 34 | # The folders should be relative to project root and use '/' for 35 | # separating folders regardless of the platform rope is running on. 36 | # 'src/my_source_folder' for instance. 37 | #prefs.add('source_folders', 'src') 38 | 39 | # You can extend python path for looking up modules 40 | #prefs.add('python_path', '~/python/') 41 | 42 | # Should rope save object information or not. 43 | prefs['save_objectdb'] = True 44 | prefs['compress_objectdb'] = False 45 | 46 | # If `True`, rope analyzes each module when it is being saved. 47 | prefs['automatic_soa'] = True 48 | # The depth of calls to follow in static object analysis 49 | prefs['soa_followed_calls'] = 0 50 | 51 | # If `False` when running modules or unit tests "dynamic object 52 | # analysis" is turned off. This makes them much faster. 53 | prefs['perform_doa'] = True 54 | 55 | # Rope can check the validity of its object DB when running. 56 | prefs['validate_objectdb'] = True 57 | 58 | # How many undos to hold? 
59 | prefs['max_history_items'] = 32 60 | 61 | # Shows whether to save history across sessions. 62 | prefs['save_history'] = True 63 | prefs['compress_history'] = False 64 | 65 | # Set the number spaces used for indenting. According to 66 | # :PEP:`8`, it is best to use 4 spaces. Since most of rope's 67 | # unit-tests use 4 spaces it is more reliable, too. 68 | prefs['indent_size'] = 4 69 | 70 | # Builtin and c-extension modules that are allowed to be imported 71 | # and inspected by rope. 72 | prefs['extension_modules'] = [] 73 | 74 | # Add all standard c-extensions to extension_modules list. 75 | prefs['import_dynload_stdmods'] = True 76 | 77 | # If `True` modules with syntax errors are considered to be empty. 78 | # The default value is `False`; When `False` syntax errors raise 79 | # `rope.base.exceptions.ModuleSyntaxError` exception. 80 | prefs['ignore_syntax_errors'] = False 81 | 82 | # If `True`, rope ignores unresolvable imports. Otherwise, they 83 | # appear in the importing namespace. 84 | prefs['ignore_bad_imports'] = False 85 | 86 | # If `True`, rope will insert new module imports as 87 | # `from import ` by default. 88 | prefs['prefer_module_from_imports'] = False 89 | 90 | # If `True`, rope will transform a comma list of imports into 91 | # multiple separate import statements when organizing 92 | # imports. 93 | prefs['split_imports'] = False 94 | 95 | # If `True`, rope will sort imports alphabetically by module name 96 | # instead of alphabetically by import statement, with from imports 97 | # after normal imports. 98 | prefs['sort_imports_alphabetically'] = False 99 | 100 | 101 | def project_opened(project): 102 | """This function is called after opening the project""" 103 | # Do whatever you like here! 104 | -------------------------------------------------------------------------------- /src/.ropeproject/globalnames: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidMertz/ML-Live-Intermediate/a30e1ca84fdab4b84a426a46bb99b20deeec03df/src/.ropeproject/globalnames -------------------------------------------------------------------------------- /src/.ropeproject/history: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidMertz/ML-Live-Intermediate/a30e1ca84fdab4b84a426a46bb99b20deeec03df/src/.ropeproject/history -------------------------------------------------------------------------------- /src/.ropeproject/objectdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidMertz/ML-Live-Intermediate/a30e1ca84fdab4b84a426a46bb99b20deeec03df/src/.ropeproject/objectdb -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavidMertz/ML-Live-Intermediate/a30e1ca84fdab4b84a426a46bb99b20deeec03df/src/__init__.py -------------------------------------------------------------------------------- /src/classifier_comparison.py: -------------------------------------------------------------------------------- 1 | # Code source: Gaël Varoquaux 2 | # Andreas Müller 3 | # Modified for documentation by Jaques Grobler 4 | # License: BSD 3 clause 5 | 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | from matplotlib.colors import ListedColormap 9 | from sklearn.model_selection import train_test_split 10 | from 
sklearn.preprocessing import StandardScaler 11 | from sklearn.datasets import make_moons, make_circles, make_classification 12 | from sklearn.neural_network import MLPClassifier 13 | from sklearn.neighbors import KNeighborsClassifier 14 | from sklearn.svm import SVC 15 | from sklearn.gaussian_process import GaussianProcessClassifier 16 | from sklearn.gaussian_process.kernels import RBF 17 | from sklearn.tree import DecisionTreeClassifier 18 | from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier 19 | from sklearn.naive_bayes import GaussianNB 20 | from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 21 | 22 | h = .02 # step size in the mesh 23 | 24 | names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process", 25 | "Decision Tree", "Random Forest", "Neural Net", "AdaBoost", 26 | "Naive Bayes", "QDA"] 27 | 28 | classifiers = [ 29 | KNeighborsClassifier(3), 30 | SVC(kernel="linear", C=0.025), 31 | SVC(gamma=2, C=1), 32 | GaussianProcessClassifier(1.0 * RBF(1.0)), 33 | DecisionTreeClassifier(max_depth=5), 34 | RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), 35 | MLPClassifier(alpha=1), 36 | AdaBoostClassifier(), 37 | GaussianNB(), 38 | QuadraticDiscriminantAnalysis()] 39 | 40 | X, y = make_classification(n_features=2, n_redundant=0, n_informative=2, 41 | random_state=1, n_clusters_per_class=1) 42 | rng = np.random.RandomState(2) 43 | X += 2 * rng.uniform(size=X.shape) 44 | linearly_separable = (X, y) 45 | 46 | datasets = [make_moons(noise=0.3, random_state=0), 47 | make_circles(noise=0.2, factor=0.5, random_state=1), 48 | linearly_separable 49 | ] 50 | 51 | figure = plt.figure(figsize=(27, 9)) 52 | i = 1 53 | # iterate over datasets 54 | for ds_cnt, ds in enumerate(datasets): 55 | # preprocess dataset, split into training and test part 56 | X, y = ds 57 | X = StandardScaler().fit_transform(X) 58 | X_train, X_test, y_train, y_test = \ 59 | train_test_split(X, y, test_size=.4, random_state=42) 60 | 61 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 62 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 63 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), 64 | np.arange(y_min, y_max, h)) 65 | 66 | # just plot the dataset first 67 | cm = plt.cm.RdBu 68 | cm_bright = ListedColormap(['#FF0000', '#0000FF']) 69 | ax = plt.subplot(len(datasets), len(classifiers) + 1, i) 70 | if ds_cnt == 0: 71 | ax.set_title("Input data") 72 | # Plot the training points 73 | ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, 74 | edgecolors='k') 75 | # Plot the testing points 76 | ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6, 77 | edgecolors='k') 78 | ax.set_xlim(xx.min(), xx.max()) 79 | ax.set_ylim(yy.min(), yy.max()) 80 | ax.set_xticks(()) 81 | ax.set_yticks(()) 82 | i += 1 83 | 84 | # iterate over classifiers 85 | for name, clf in zip(names, classifiers): 86 | ax = plt.subplot(len(datasets), len(classifiers) + 1, i) 87 | clf.fit(X_train, y_train) 88 | score = clf.score(X_test, y_test) 89 | 90 | # Plot the decision boundary. For that, we will assign a color to each 91 | # point in the mesh [x_min, x_max]x[y_min, y_max]. 
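# SVC exposes decision_function (a signed distance from the separating boundary), while classifiers such as GaussianNB or RandomForestClassifier only expose predict_proba; the branch below uses whichever is available and, in the predict_proba case, keeps column 1 (the probability of the positive class) so that every grid point gets a single score for the contour plot.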
92 | if hasattr(clf, "decision_function"): 93 | Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) 94 | else: 95 | Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] 96 | 97 | # Put the result into a color plot 98 | Z = Z.reshape(xx.shape) 99 | ax.contourf(xx, yy, Z, cmap=cm, alpha=.8) 100 | 101 | # Plot the training points 102 | ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, 103 | edgecolors='k') 104 | # Plot the testing points 105 | ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, 106 | edgecolors='k', alpha=0.6) 107 | 108 | ax.set_xlim(xx.min(), xx.max()) 109 | ax.set_ylim(yy.min(), yy.max()) 110 | ax.set_xticks(()) 111 | ax.set_yticks(()) 112 | if ds_cnt == 0: 113 | ax.set_title(name) 114 | ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'), 115 | size=15, horizontalalignment='right') 116 | i += 1 117 | 118 | plt.tight_layout() 119 | plt.show() -------------------------------------------------------------------------------- /src/decisiontree_regressor.py: -------------------------------------------------------------------------------- 1 | # Import the necessary modules and libraries 2 | import numpy as np 3 | from sklearn.tree import DecisionTreeRegressor 4 | from sklearn.svm import SVR 5 | import matplotlib.pyplot as plt 6 | 7 | # Create a random dataset 8 | rng = np.random.RandomState(1) 9 | X = np.sort(5 * rng.rand(80, 1), axis=0) 10 | y = np.sin(X).ravel() 11 | y[::5] += 3 * (0.5 - rng.rand(16)) 12 | 13 | # Fit regression model 14 | regr_1 = DecisionTreeRegressor(max_depth=2) 15 | regr_2 = DecisionTreeRegressor(max_depth=5) 16 | regr_3 = SVR() 17 | regr_1.fit(X, y) 18 | regr_2.fit(X, y) 19 | regr_3.fit(X, y) 20 | 21 | # Predict 22 | X_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis] 23 | y_1 = regr_1.predict(X_test) 24 | y_2 = regr_2.predict(X_test) 25 | y_3 = regr_3.predict(X_test) 26 | 27 | # Plot the results 28 | plt.figure(figsize=(10,6)) 29 | plt.scatter(X, y, s=20, edgecolor="black", c="darkorange", label="data") 30 | plt.plot(X_test, y_1, color="cornflowerblue", label="max_depth=2", linewidth=2) 31 | plt.plot(X_test, y_2, color="yellowgreen", label="max_depth=5", linewidth=2) 32 | plt.plot(X_test, y_3, color="orangered", label="SVR", linewidth=2) 33 | plt.xlabel("data") 34 | plt.ylabel("target") 35 | plt.title("Decision Tree Regression (cf. SVR)") 36 | plt.legend() 37 | plt.show() 38 | -------------------------------------------------------------------------------- /src/fakify_humans.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from os.path import join 3 | import pandas as pd 4 | 5 | # Read the data 6 | fname = join('..', 'data', "Learning about Humans learning ML.csv") 7 | humans = pd.read_csv(fname) 8 | 9 | # Drop unused column 10 | humans.drop('Timestamp', axis=1, inplace=True) 11 | 12 | # Add an improved column 13 | humans['Education'] = (humans[ 14 | 'Years of post-secondary education (e.g. BA=4; Ph.D.=10)'] 15 | .str.replace(r'.*=','') 16 | .astype(int)) 17 | 18 | # Then drop the one it is based on 19 | humans.drop('Years of post-secondary education (e.g. 
BA=4; Ph.D.=10)', 20 | axis=1, inplace=True) 21 | 22 | # Simplify the column names 23 | humans.columns = ['Fav_lang', 'Fav_movie', 'Experience', 'Sklearn', 24 | 'Age', 'Humans_Machines', 'Fav_Game', 'Success', 'Education'] 25 | 26 | # Get the dummies 27 | human_dummies = pd.get_dummies(humans) 28 | 29 | # Replicate the data 50 times 30 | more_dummies = pd.concat([human_dummies]*50) 31 | more_dummies.index = range(len(more_dummies)) 32 | 33 | # Bias the mild positive predictor movie=something different 34 | samp = more_dummies.sample(frac=0.75) 35 | ndx = samp[samp['Fav_movie_And Now for Something Completely Different']==1].index 36 | more_dummies.loc[ndx, 'Success'] = 11 37 | 38 | # Bias the mild negative predictor movie=holy grail 39 | samp = more_dummies.sample(frac=0.75) 40 | ndx = samp[samp['Fav_movie_Monty Python and the Holy Grail']==1].index 41 | more_dummies.loc[ndx, 'Success'] = -1 42 | 43 | # Bias the mild negative predictor age>38 44 | samp = more_dummies.sample(frac=0.75) 45 | ndx = samp[samp['Age']> 38].index 46 | more_dummies.loc[ndx, 'Success'] = -2 47 | 48 | # Bias the mild positive predictor game=chess 49 | samp = more_dummies.sample(frac=0.75) 50 | ndx = samp[samp['Fav_Game_Chess']==1].index 51 | more_dummies.loc[ndx, 'Success'] = 12 52 | 53 | # Boolean at cutoff value 54 | more_dummies['Success'] = more_dummies.Success >= 8 55 | 56 | # Write the data 57 | out = join('..', 'data', "FakeLearning.csv") 58 | more_dummies.to_csv(out) 59 | 60 | 61 | -------------------------------------------------------------------------------- /src/linear_failure.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import numpy as np 3 | import pandas as pd 4 | from math import tau 5 | np.random.seed(1) 6 | theta = np.random.random(1000) * tau 7 | r = np.random.random(1000) 8 | feat1 = np.sin(theta) * r 9 | feat2 = np.cos(theta) * r 10 | z = feat1 * feat2 11 | silly_df = pd.DataFrame({"feature_1": feat1, 12 | "feature_2": feat2, 13 | "TARGET": z }) 14 | silly_df.to_csv('../data/linear_failure.csv') -------------------------------------------------------------------------------- /src/mglearn/__init__.py: -------------------------------------------------------------------------------- 1 | from . import plots 2 | from .
import tools 3 | from .plots import cm3, cm2 4 | 5 | __all__ = ['tools', 'plots', 'cm3', 'cm2'] 6 | -------------------------------------------------------------------------------- /src/mglearn/datasets.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.datasets import load_boston 4 | from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures 5 | from .make_blobs import make_blobs 6 | 7 | 8 | def make_forge(): 9 | # a carefully hand-designed dataset lol 10 | X, y = make_blobs(centers=2, random_state=4, n_samples=30) 11 | y[np.array([7, 27])] = 0 12 | mask = np.ones(len(X), dtype=np.bool) 13 | mask[np.array([0, 1, 5, 26])] = 0 14 | X, y = X[mask], y[mask] 15 | return X, y 16 | 17 | 18 | def make_wave(n_samples=100): 19 | rnd = np.random.RandomState(42) 20 | x = rnd.uniform(-3, 3, size=n_samples) 21 | y_no_noise = (np.sin(4 * x) + x) 22 | y = (y_no_noise + rnd.normal(size=len(x))) / 2 23 | return x.reshape(-1, 1), y 24 | 25 | 26 | def load_extended_boston(): 27 | boston = load_boston() 28 | X = boston.data 29 | 30 | X = MinMaxScaler().fit_transform(boston.data) 31 | X = PolynomialFeatures(degree=2).fit_transform(X) 32 | return X, boston.target 33 | 34 | 35 | def load_citibike(): 36 | data_mine = pd.read_csv("data/citibike.csv") 37 | data_mine['one'] = 1 38 | data_mine['starttime'] = pd.to_datetime(data_mine.starttime) 39 | data_starttime = data_mine.set_index("starttime") 40 | data_resampled = data_starttime.resample("3h", how="sum").fillna(0) 41 | return data_resampled.one 42 | -------------------------------------------------------------------------------- /src/mglearn/make_blobs.py: -------------------------------------------------------------------------------- 1 | import numbers 2 | import numpy as np 3 | 4 | from sklearn.utils import check_array, check_random_state 5 | from sklearn.utils import shuffle as shuffle_ 6 | 7 | 8 | def make_blobs(n_samples=100, n_features=2, centers=2, cluster_std=1.0, 9 | center_box=(-10.0, 10.0), shuffle=True, random_state=None): 10 | """Generate isotropic Gaussian blobs for clustering. 11 | 12 | Read more in the :ref:`User Guide `. 13 | 14 | Parameters 15 | ---------- 16 | n_samples : int, or tuple, optional (default=100) 17 | The total number of points equally divided among clusters. 18 | 19 | n_features : int, optional (default=2) 20 | The number of features for each sample. 21 | 22 | centers : int or array of shape [n_centers, n_features], optional 23 | (default=3) 24 | The number of centers to generate, or the fixed center locations. 25 | 26 | cluster_std: float or sequence of floats, optional (default=1.0) 27 | The standard deviation of the clusters. 28 | 29 | center_box: pair of floats (min, max), optional (default=(-10.0, 10.0)) 30 | The bounding box for each cluster center when centers are 31 | generated at random. 32 | 33 | shuffle : boolean, optional (default=True) 34 | Shuffle the samples. 35 | 36 | random_state : int, RandomState instance or None, optional (default=None) 37 | If int, random_state is the seed used by the random number generator; 38 | If RandomState instance, random_state is the random number generator; 39 | If None, the random number generator is the RandomState instance used 40 | by `np.random`. 41 | 42 | Returns 43 | ------- 44 | X : array of shape [n_samples, n_features] 45 | The generated samples. 46 | 47 | y : array of shape [n_samples] 48 | The integer labels for cluster membership of each sample. 
49 | 50 | Examples 51 | -------- 52 | >>> from sklearn.datasets.samples_generator import make_blobs 53 | >>> X, y = make_blobs(n_samples=10, centers=3, n_features=2, 54 | ... random_state=0) 55 | >>> print(X.shape) 56 | (10, 2) 57 | >>> y 58 | array([0, 0, 1, 0, 2, 2, 2, 1, 1, 0]) 59 | 60 | See also 61 | -------- 62 | make_classification: a more intricate variant 63 | """ 64 | generator = check_random_state(random_state) 65 | 66 | if isinstance(centers, numbers.Integral): 67 | centers = generator.uniform(center_box[0], center_box[1], 68 | size=(centers, n_features)) 69 | else: 70 | centers = check_array(centers) 71 | n_features = centers.shape[1] 72 | 73 | if isinstance(cluster_std, numbers.Real): 74 | cluster_std = np.ones(len(centers)) * cluster_std 75 | 76 | X = [] 77 | y = [] 78 | 79 | n_centers = centers.shape[0] 80 | if isinstance(n_samples, numbers.Integral): 81 | n_samples_per_center = [int(n_samples // n_centers)] * n_centers 82 | for i in range(n_samples % n_centers): 83 | n_samples_per_center[i] += 1 84 | else: 85 | n_samples_per_center = n_samples 86 | 87 | for i, (n, std) in enumerate(zip(n_samples_per_center, cluster_std)): 88 | X.append(centers[i] + generator.normal(scale=std, 89 | size=(n, n_features))) 90 | y += [i] * n 91 | 92 | X = np.concatenate(X) 93 | y = np.array(y) 94 | 95 | if shuffle: 96 | X, y = shuffle_(X, y, random_state=generator) 97 | 98 | return X, y 99 | -------------------------------------------------------------------------------- /src/mglearn/plot_2d_separator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from .plot_helpers import cm2 4 | 5 | 6 | def plot_2d_classification(classifier, X, fill=False, ax=None, eps=None, alpha=1, cm=None): 7 | # multiclass 8 | if eps is None: 9 | eps = X.std() / 2. 10 | 11 | if ax is None: 12 | ax = plt.gca() 13 | 14 | x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps 15 | y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps 16 | xx = np.linspace(x_min, x_max, 1000) 17 | yy = np.linspace(y_min, y_max, 1000) 18 | 19 | X1, X2 = np.meshgrid(xx, yy) 20 | X_grid = np.c_[X1.ravel(), X2.ravel()] 21 | decision_values = classifier.predict(X_grid) 22 | ax.imshow(decision_values.reshape(X1.shape), extent=(x_min, x_max, 23 | y_min, y_max), 24 | aspect='auto', origin='lower', alpha=alpha, cmap=cm) 25 | ax.set_xlim(x_min, x_max) 26 | ax.set_ylim(y_min, y_max) 27 | ax.set_xticks(()) 28 | ax.set_yticks(()) 29 | 30 | 31 | def plot_2d_scores(classifier, X, ax=None, eps=None, alpha=1, cm=None, function=None): 32 | # binary with fill 33 | if eps is None: 34 | eps = X.std() / 2. 
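# eps is the margin added around the data range when the evaluation grid is built below, so the scored region extends half a standard deviation beyond the outermost points.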
35 | 36 | if ax is None: 37 | ax = plt.gca() 38 | 39 | x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps 40 | y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps 41 | xx = np.linspace(x_min, x_max, 100) 42 | yy = np.linspace(y_min, y_max, 100) 43 | 44 | X1, X2 = np.meshgrid(xx, yy) 45 | X_grid = np.c_[X1.ravel(), X2.ravel()] 46 | if function is None: 47 | function = getattr(classifier, "decision_function", getattr(classifier, "predict_proba")) 48 | else: 49 | function = getattr(classifier, function) 50 | decision_values = function(X_grid) 51 | if decision_values.ndim > 1 and decision_values.shape[1] > 1: 52 | # predict_proba 53 | decision_values = decision_values[:, 1] 54 | grr = ax.imshow(decision_values.reshape(X1.shape), 55 | extent=(x_min, x_max, y_min, y_max), aspect='auto', 56 | origin='lower', alpha=alpha, cmap=cm) 57 | 58 | ax.set_xlim(x_min, x_max) 59 | ax.set_ylim(y_min, y_max) 60 | ax.set_xticks(()) 61 | ax.set_yticks(()) 62 | return grr 63 | 64 | 65 | def plot_2d_separator(classifier, X, fill=False, ax=None, eps=None, alpha=1, 66 | cm=cm2, linewidth=None, threshold=None): 67 | # binary? 68 | if eps is None: 69 | eps = X.std() / 2. 70 | 71 | if ax is None: 72 | ax = plt.gca() 73 | 74 | x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps 75 | y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps 76 | xx = np.linspace(x_min, x_max, 100) 77 | yy = np.linspace(y_min, y_max, 100) 78 | 79 | X1, X2 = np.meshgrid(xx, yy) 80 | X_grid = np.c_[X1.ravel(), X2.ravel()] 81 | try: 82 | decision_values = classifier.decision_function(X_grid) 83 | levels = [0] if threshold is None else [threshold] 84 | fill_levels = [decision_values.min()] + levels + [decision_values.max()] 85 | except AttributeError: 86 | # no decision_function 87 | decision_values = classifier.predict_proba(X_grid)[:, 1] 88 | levels = [.5] if threshold is None else [threshold] 89 | fill_levels = [0] + levels + [1] 90 | if fill: 91 | ax.contourf(X1, X2, decision_values.reshape(X1.shape), 92 | levels=fill_levels, alpha=alpha, cmap=cm) 93 | else: 94 | ax.contour(X1, X2, decision_values.reshape(X1.shape), levels=levels, 95 | colors="black", alpha=alpha, linewidths=linewidth, 96 | linestyles="solid") 97 | 98 | ax.set_xlim(x_min, x_max) 99 | ax.set_ylim(y_min, y_max) 100 | ax.set_xticks(()) 101 | ax.set_yticks(()) 102 | 103 | 104 | if __name__ == '__main__': 105 | from sklearn.datasets import make_blobs 106 | from sklearn.linear_model import LogisticRegression 107 | X, y = make_blobs(centers=2, random_state=42) 108 | clf = LogisticRegression().fit(X, y) 109 | plot_2d_separator(clf, X, fill=True) 110 | plt.scatter(X[:, 0], X[:, 1], c=y, s=60) 111 | plt.show() 112 | -------------------------------------------------------------------------------- /src/mglearn/plot_agglomerative.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn.datasets import make_blobs 4 | from sklearn.cluster import AgglomerativeClustering 5 | from sklearn.neighbors import KernelDensity 6 | 7 | 8 | def plot_agglomerative_algorithm(): 9 | # generate synthetic two-dimensional data 10 | X, y = make_blobs(random_state=0, n_samples=12) 11 | 12 | agg = AgglomerativeClustering(n_clusters=X.shape[0], compute_full_tree=True).fit(X) 13 | 14 | fig, axes = plt.subplots(X.shape[0] // 5, 5, subplot_kw={'xticks': (), 15 | 'yticks': ()}, 16 | figsize=(20, 8)) 17 | 18 | eps = X.std() / 2 19 | 20 | x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps 21 | 
y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps 22 | 23 | xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100)) 24 | gridpoints = np.c_[xx.ravel().reshape(-1, 1), yy.ravel().reshape(-1, 1)] 25 | 26 | for i, ax in enumerate(axes.ravel()): 27 | ax.set_xlim(x_min, x_max) 28 | ax.set_ylim(y_min, y_max) 29 | agg.n_clusters = X.shape[0] - i 30 | agg.fit(X) 31 | ax.set_title("Step %d" % i) 32 | ax.scatter(X[:, 0], X[:, 1], s=60, c='grey') 33 | bins = np.bincount(agg.labels_) 34 | for cluster in range(agg.n_clusters): 35 | if bins[cluster] > 1: 36 | points = X[agg.labels_ == cluster] 37 | other_points = X[agg.labels_ != cluster] 38 | 39 | kde = KernelDensity(bandwidth=.5).fit(points) 40 | scores = kde.score_samples(gridpoints) 41 | score_inside = np.min(kde.score_samples(points)) 42 | score_outside = np.max(kde.score_samples(other_points)) 43 | levels = .8 * score_inside + .2 * score_outside 44 | ax.contour(xx, yy, scores.reshape(100, 100), levels=[levels], 45 | colors='k', linestyles='solid', linewidths=2) 46 | 47 | axes[0, 0].set_title("Initialization") 48 | 49 | 50 | def plot_agglomerative(): 51 | X, y = make_blobs(random_state=0, n_samples=12) 52 | agg = AgglomerativeClustering(n_clusters=3) 53 | 54 | eps = X.std() / 2. 55 | 56 | x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps 57 | y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps 58 | 59 | xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100)) 60 | gridpoints = np.c_[xx.ravel().reshape(-1, 1), yy.ravel().reshape(-1, 1)] 61 | 62 | ax = plt.gca() 63 | for i, x in enumerate(X): 64 | ax.text(x[0] + .1, x[1], "%d" % i, horizontalalignment='left', verticalalignment='center') 65 | 66 | ax.scatter(X[:, 0], X[:, 1], s=60, c='grey') 67 | ax.set_xticks(()) 68 | ax.set_yticks(()) 69 | 70 | for i in range(11): 71 | agg.n_clusters = X.shape[0] - i 72 | agg.fit(X) 73 | 74 | bins = np.bincount(agg.labels_) 75 | for cluster in range(agg.n_clusters): 76 | if bins[cluster] > 1: 77 | points = X[agg.labels_ == cluster] 78 | other_points = X[agg.labels_ != cluster] 79 | 80 | kde = KernelDensity(bandwidth=.5).fit(points) 81 | scores = kde.score_samples(gridpoints) 82 | score_inside = np.min(kde.score_samples(points)) 83 | score_outside = np.max(kde.score_samples(other_points)) 84 | levels = .8 * score_inside + .2 * score_outside 85 | ax.contour(xx, yy, scores.reshape(100, 100), levels=[levels], 86 | colors='k', linestyles='solid', linewidths=1) 87 | 88 | ax.set_xlim(x_min, x_max) 89 | ax.set_ylim(y_min, y_max) 90 | -------------------------------------------------------------------------------- /src/mglearn/plot_animal_tree.py: -------------------------------------------------------------------------------- 1 | 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | def plot_animal_tree(ax=None): 6 | from scipy.misc import imread 7 | import graphviz 8 | if ax is None: 9 | ax = plt.gca() 10 | mygraph = graphviz.Digraph(node_attr={'shape': 'box'}, 11 | edge_attr={'labeldistance': "10.5"}, 12 | format="png") 13 | mygraph.node("0", "Has feathers?") 14 | mygraph.node("1", "Can fly?") 15 | mygraph.node("2", "Has finns?") 16 | mygraph.node("3", "Hawk") 17 | mygraph.node("4", "Penguin") 18 | mygraph.node("5", "Dolphin") 19 | mygraph.node("6", "Bear") 20 | mygraph.edge("0", "1", label="True") 21 | mygraph.edge("0", "2", label="False") 22 | mygraph.edge("1", "3", label="True") 23 | mygraph.edge("1", "4", label="False") 24 | mygraph.edge("2", "5", label="True") 25 | mygraph.edge("2", "6", 
label="False") 26 | mygraph.render("tmp") 27 | ax.imshow(imread("tmp.png")) 28 | ax.set_axis_off() 29 | -------------------------------------------------------------------------------- /src/mglearn/plot_cross_validation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | def plot_label_kfold(): 6 | from sklearn.cross_validation import LabelKFold 7 | labels = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3] 8 | 9 | plt.figure(figsize=(10, 2)) 10 | plt.title("LabelKFold") 11 | 12 | axes = plt.gca() 13 | axes.set_frame_on(False) 14 | 15 | n_folds = 12 16 | n_samples = 12 17 | n_iter = 3 18 | n_samples_per_fold = 1 19 | 20 | cv = LabelKFold(n_folds=3, labels=labels) 21 | mask = np.zeros((n_iter, n_samples)) 22 | for i, (train, test) in enumerate(cv.split(range(12))): 23 | mask[i, train] = 1 24 | mask[i, test] = 2 25 | 26 | 27 | for i in range(n_folds): 28 | # test is grey 29 | colors = ["grey" if x == 2 else "white" for x in mask[:, i]] 30 | # not selected has no hatch 31 | 32 | boxes = axes.barh(bottom=range(n_iter), width=[1 - 0.1] * n_iter, left=i * n_samples_per_fold, height=.6, color=colors, hatch="//") 33 | for j in np.where(mask[:, i] == 0)[0]: 34 | boxes[j].set_hatch("") 35 | 36 | axes.barh(bottom=[n_iter] * n_folds, width=[1 - 0.1] * n_folds, left=np.arange(n_folds) * n_samples_per_fold, height=.6, color="w") 37 | 38 | for i in range(12): 39 | axes.text((i + .5) * n_samples_per_fold, 3.5, "%d" % labels[i], horizontalalignment="center") 40 | #ax.set_ylim(4, -0.1) 41 | 42 | axes.invert_yaxis() 43 | axes.set_xlim(0, n_samples + 1) 44 | axes.set_ylabel("CV iterations") 45 | axes.set_xlabel("Data points") 46 | axes.set_xticks(np.arange(n_samples) + .5) 47 | axes.set_xticklabels(np.arange(1, n_samples + 1)) 48 | axes.set_yticks(np.arange(n_iter) + .3) 49 | axes.set_yticklabels(["Split %d" % x for x in range(1, n_iter + 1)] + ["labels"]); 50 | plt.legend([boxes[0], boxes[1]], ["Training set", "Test set"], loc=(1, .3)); 51 | plt.tight_layout() 52 | 53 | 54 | 55 | def plot_shuffle_split(): 56 | from sklearn.model_selection import ShuffleSplit 57 | plt.figure(figsize=(10, 2)) 58 | plt.title("ShuffleSplit with 10 points, train_size=5, test_size=2, n_splits=4") 59 | 60 | axes = plt.gca() 61 | axes.set_frame_on(False) 62 | 63 | n_folds = 10 64 | n_samples = 10 65 | n_splits = 4 66 | n_samples_per_fold = 1 67 | 68 | ss = ShuffleSplit(n_splits=4, train_size=5, test_size=2, random_state=43) 69 | mask = np.zeros((n_splits, n_samples)) 70 | for i, (train, test) in enumerate(ss.split(range(10))): 71 | mask[i, train] = 1 72 | mask[i, test] = 2 73 | 74 | 75 | for i in range(n_folds): 76 | # test is grey 77 | colors = ["grey" if x == 2 else "white" for x in mask[:, i]] 78 | # not selected has no hatch 79 | 80 | boxes = axes.barh(bottom=range(n_splits), width=[1 - 0.1] * n_splits, left=i * n_samples_per_fold, height=.6, color=colors, hatch="//") 81 | for j in np.where(mask[:, i] == 0)[0]: 82 | boxes[j].set_hatch("") 83 | 84 | axes.invert_yaxis() 85 | axes.set_xlim(0, n_samples + 1) 86 | axes.set_ylabel("CV iterations") 87 | axes.set_xlabel("Data points") 88 | axes.set_xticks(np.arange(n_samples) + .5) 89 | axes.set_xticklabels(np.arange(1, n_samples + 1)) 90 | axes.set_yticks(np.arange(n_splits) + .3) 91 | axes.set_yticklabels(["Split %d" % x for x in range(1, n_splits + 1)]); 92 | # legend hacked for this random state 93 | plt.legend([boxes[1], boxes[0], boxes[2]], ["Training set", "Test set", "Not selected"], 
loc=(1, .3)); 94 | plt.tight_layout() 95 | plt.close() 96 | 97 | 98 | def plot_stratified_cross_validation(): 99 | fig, both_axes = plt.subplots(2, 1, figsize=(12, 5)) 100 | #plt.title("cross_validation_not_stratified") 101 | axes = both_axes[0] 102 | axes.set_title("Standard cross-validation with sorted class labels") 103 | 104 | axes.set_frame_on(False) 105 | 106 | n_folds = 3 107 | n_samples = 150 108 | 109 | n_samples_per_fold = n_samples / float(n_folds) 110 | 111 | 112 | for i in range(n_folds): 113 | colors = ["w"] * n_folds 114 | colors[i] = "grey" 115 | axes.barh(bottom=range(n_folds), width=[n_samples_per_fold - 1] * n_folds, left=i * n_samples_per_fold, height=.6, color=colors, hatch="//") 116 | 117 | axes.barh(bottom=[n_folds] * n_folds, width=[n_samples_per_fold - 1] * n_folds, left=np.arange(3) * n_samples_per_fold, height=.6, color="w") 118 | 119 | axes.invert_yaxis() 120 | axes.set_xlim(0, n_samples + 1) 121 | axes.set_ylabel("CV iterations") 122 | axes.set_xlabel("Data points") 123 | axes.set_xticks(np.arange(n_samples_per_fold / 2., n_samples, n_samples_per_fold)) 124 | axes.set_xticklabels(["Fold %d" % x for x in range(1, n_folds + 1)]) 125 | axes.set_yticks(np.arange(n_folds + 1) + .3) 126 | axes.set_yticklabels(["Split %d" % x for x in range(1, n_folds + 1)] + ["Class label"]) 127 | for i in range(3): 128 | axes.text((i + .5) * n_samples_per_fold, 3.5, "Class %d" % i, horizontalalignment="center") 129 | 130 | 131 | ax = both_axes[1] 132 | ax.set_title("Stratified Cross-validation") 133 | ax.set_frame_on(False) 134 | ax.invert_yaxis() 135 | ax.set_xlim(0, n_samples + 1) 136 | ax.set_ylabel("CV iterations") 137 | ax.set_xlabel("Data points") 138 | 139 | ax.set_yticks(np.arange(n_folds + 1) + .3) 140 | ax.set_yticklabels(["Split %d" % x for x in range(1, n_folds + 1)] + ["Class label"]); 141 | 142 | n_subsplit = n_samples_per_fold / 3. 
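# each of the three sorted class blocks spans n_samples_per_fold points; the loop below shades the i-th third (n_subsplit points) of every class block as the test data for split i, which is how stratified splitting keeps the class proportions equal in every fold.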
143 | for i in range(n_folds): 144 | test_bars = ax.barh(bottom=[i] * n_folds, width=[n_subsplit - 1] * n_folds, left=np.arange(n_folds) * n_samples_per_fold + i * n_subsplit, height=.6, color="grey", hatch="//") 145 | 146 | w = 2 * n_subsplit - 1 147 | ax.barh(bottom=[0] * n_folds, width=[w] * n_folds, left=np.arange(n_folds) * n_samples_per_fold + (0 + 1) * n_subsplit, height=.6, color="w", hatch="//") 148 | ax.barh(bottom=[1] * (n_folds + 1), width=[w / 2., w, w, w / 2.], left=np.maximum(0, np.arange(n_folds + 1) * n_samples_per_fold - n_subsplit), height=.6, color="w", hatch="//") 149 | training_bars = ax.barh(bottom=[2] * n_folds, width=[w] * n_folds, left=np.arange(n_folds) * n_samples_per_fold , height=.6, color="w", hatch="//") 150 | 151 | 152 | ax.barh(bottom=[n_folds] * n_folds, width=[n_samples_per_fold - 1] * n_folds, left=np.arange(n_folds) * n_samples_per_fold, height=.6, color="w") 153 | 154 | for i in range(3): 155 | ax.text((i + .5) * n_samples_per_fold, 3.5, "Class %d" % i, horizontalalignment="center") 156 | ax.set_ylim(4, -0.1) 157 | plt.legend([training_bars[0], test_bars[0]], ['Training data', 'Test data'], loc=(1.05, 1), frameon=False); 158 | 159 | fig.tight_layout() 160 | 161 | 162 | def plot_cross_validation(): 163 | plt.figure(figsize=(12, 2)) 164 | plt.title("cross_validation") 165 | axes = plt.gca() 166 | axes.set_frame_on(False) 167 | 168 | n_folds = 5 169 | n_samples = 25 170 | 171 | n_samples_per_fold = n_samples / float(n_folds) 172 | 173 | 174 | for i in range(n_folds): 175 | colors = ["w"] * n_folds 176 | colors[i] = "grey" 177 | bars = plt.barh(bottom=range(n_folds), width=[n_samples_per_fold - 0.1] * n_folds, 178 | left=i * n_samples_per_fold, height=.6, color=colors, hatch="//") 179 | axes.invert_yaxis() 180 | axes.set_xlim(0, n_samples + 1) 181 | plt.ylabel("CV iterations") 182 | plt.xlabel("Data points") 183 | plt.xticks(np.arange(n_samples_per_fold / 2., n_samples, n_samples_per_fold), ["Fold %d" % x for x in range(1, n_folds + 1)]) 184 | plt.yticks(np.arange(n_folds) + .3, ["Split %d" % x for x in range(1, n_folds + 1)]) 185 | plt.legend([bars[0], bars[4]], ['Training data', 'Test data'], loc=(1.05, 0.4), frameon=False); 186 | 187 | 188 | def plot_threefold_split(): 189 | plt.figure(figsize=(15, 1)) 190 | axis = plt.gca() 191 | bars = axis.barh([0, 0, 0], [11.9, 2.9, 4.9], left=[0, 12, 15], color=['white', 'grey', 'grey'], hatch="//") 192 | bars[2].set_hatch(r"") 193 | axis.set_yticks(()) 194 | axis.set_frame_on(False) 195 | axis.set_ylim(-.1, .8) 196 | axis.set_xlim(-0.1, 20.1) 197 | axis.set_xticks([6, 13.3, 17.5]) 198 | axis.set_xticklabels(["training set", "validation set", "test set"], fontdict={'fontsize': 20}); 199 | axis.tick_params(length=0, labeltop=True, labelbottom=False) 200 | axis.text(6, -.3, "Model fitting", fontdict={'fontsize': 13}, horizontalalignment="center") 201 | axis.text(13.3, -.3, "Parameter selection", fontdict={'fontsize': 13}, horizontalalignment="center") 202 | axis.text(17.5, -.3, "Evaluation", fontdict={'fontsize': 13}, horizontalalignment="center") 203 | -------------------------------------------------------------------------------- /src/mglearn/plot_decomposition.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from matplotlib.offsetbox import OffsetImage, AnnotationBbox 3 | 4 | 5 | def plot_decomposition(people, pca): 6 | image_shape = people.images[0].shape 7 | plt.figure(figsize=(20, 3)) 8 | ax = plt.gca() 9 | 10 | imagebox = 
OffsetImage(people.images[0], zoom=1.5, cmap="gray") 11 | ab = AnnotationBbox(imagebox, (.05, 0.4), pad=0.0, xycoords='data') 12 | ax.add_artist(ab) 13 | 14 | for i in range(4): 15 | imagebox = OffsetImage(pca.components_[i].reshape(image_shape), zoom=1.5, cmap="viridis") 16 | 17 | ab = AnnotationBbox(imagebox, (.3 + .2 * i, 0.4), 18 | pad=0.0, 19 | xycoords='data' 20 | ) 21 | ax.add_artist(ab) 22 | if i == 0: 23 | plt.text(.18, .25, 'x_%d *' % i, fontdict={'fontsize': 50}) 24 | else: 25 | plt.text(.15 + .2 * i, .25, '+ x_%d *' % i, fontdict={'fontsize': 50}) 26 | 27 | plt.text(.95, .25, '+ ...', fontdict={'fontsize': 50}) 28 | 29 | plt.rc('text', usetex=True) 30 | plt.text(.13, .3, r'\approx', fontdict={'fontsize': 50}) 31 | plt.axis("off") 32 | -------------------------------------------------------------------------------- /src/mglearn/plot_grid_search.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | from sklearn.svm import SVC 5 | try: 6 | from sklearn.model_selection import GridSearchCV, train_test_split 7 | except: 8 | from sklearn.grid_search import GridSearchCV 9 | from sklearn.cross_validation import train_test_split 10 | from sklearn.datasets import load_iris 11 | 12 | 13 | def plot_cross_val_selection(): 14 | iris = load_iris() 15 | X_trainval, X_test, y_trainval, y_test = train_test_split(iris.data, 16 | iris.target, 17 | random_state=0) 18 | 19 | param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 20 | 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]} 21 | grid_search = GridSearchCV(SVC(), param_grid, cv=5) 22 | grid_search.fit(X_trainval, y_trainval) 23 | scores = pd.DataFrame(grid_search.cv_results_) 24 | 25 | plt.figure(figsize=(10, 3)) 26 | plt.xlim(-1, len(scores)) 27 | plt.ylim(0, 1.1) 28 | 29 | marker_cv, = plt.plot(scores.mean_train_score + scores.std_train_score, '^', c='gray', markersize=5, alpha=.5) 30 | marker_cv, = plt.plot(scores.mean_train_score - scores.std_train_score, '^', c='gray', markersize=5, alpha=.5) 31 | 32 | marker_mean, = plt.plot(scores.mean_train_score, 'v', c='none', alpha=1, markersize=10) 33 | i = np.where(scores.rank_test_score == 1)[0] 34 | best = scores.iloc[i] 35 | marker_best, = plt.plot(i, best.mean_train_score, 'o', c='red', fillstyle="none", alpha=1, markersize=20, markeredgewidth=3) 36 | 37 | plt.xticks(range(len(scores)), [str(scores.iloc[idx].params).strip("{}").replace("'", "") for idx in range(scores.shape[0])], rotation=90); 38 | plt.ylabel("validation accuracy") 39 | plt.xlabel("parameter settings") 40 | plt.legend([marker_cv, marker_mean, marker_best], ["cv accuracy", "mean accuracy", "best parameter setting"], loc=(1.05, .4)) 41 | 42 | 43 | def plot_grid_search_overview(): 44 | plt.figure(figsize=(10, 3)) 45 | axes = plt.gca() 46 | axes.yaxis.set_visible(False) 47 | axes.xaxis.set_visible(False) 48 | axes.set_frame_on(False) 49 | #axes.invert_yaxis() 50 | def draw(ax, text, start, target=None): 51 | if target is not None: 52 | patchB = target.get_bbox_patch() 53 | end = target.get_position() 54 | else: 55 | end = start 56 | patchB = None 57 | annotation = ax.annotate(text, end, start, xycoords='axes pixels', textcoords='axes pixels', size=20, 58 | arrowprops=dict(arrowstyle="-|>", fc="w", ec="k", patchB=patchB, 59 | connectionstyle="arc3,rad=0.0"), 60 | bbox=dict(boxstyle="round", fc="w"), horizontalalignment="center", verticalalignment="center") 61 | plt.draw() 62 | return annotation 63 | 64 | step = 100 65 | grr = 
400 66 | 67 | final_evaluation = draw(axes, "final evaluation", (5 * step, grr - 3 * step)) 68 | retrained_model = draw(axes, "retrained model", (3 * step, grr - 3 * step), final_evaluation) 69 | best_parameters = draw(axes, "best parameters", (.5 * step, grr - 3 * step), retrained_model) 70 | cross_validation = draw(axes, "cross validation", (.5 * step, grr - 2 * step), best_parameters) 71 | parameters = draw(axes, "parameter grid", (0.0, grr - 0), cross_validation) 72 | training_data = draw(axes, "training data", (2 * step, grr - step), cross_validation) 73 | draw(axes, "training data", (2 * step, grr - step), retrained_model) 74 | test_data = draw(axes, "test data", (5 * step, grr - step), final_evaluation) 75 | draw(axes, "data set", (3.5 * step, grr - 0.0), training_data) 76 | data_set = draw(axes, "data set", (3.5 * step, grr - 0.0), test_data) 77 | plt.ylim(0, 1) 78 | plt.xlim(0, 1.5) 79 | -------------------------------------------------------------------------------- /src/mglearn/plot_helpers.py: -------------------------------------------------------------------------------- 1 | from matplotlib.colors import ListedColormap 2 | 3 | cm3 = ListedColormap(['b', 'r', 'g']) 4 | cm2 = ListedColormap(['b', 'r']) 5 | -------------------------------------------------------------------------------- /src/mglearn/plot_improper_preprocessing.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | 4 | def make_bracket(s, xy, textxy, width, ax): 5 | annotation = ax.annotate( 6 | s, xy, textxy, ha="center", va="center", size=20, 7 | arrowprops=dict(arrowstyle="-[", fc="w", ec="k", 8 | lw=2,), bbox=dict(boxstyle="square", fc="w")) 9 | annotation.arrow_patch.get_arrowstyle().widthB = width 10 | 11 | 12 | def plot_improper_processing(): 13 | fig, axes = plt.subplots(2, 1, figsize=(15, 10)) 14 | 15 | for axis in axes: 16 | bars = axis.barh([0, 0, 0], [11.9, 2.9, 4.9], left=[0, 12, 15], 17 | color=['white', 'grey', 'grey'], hatch="//") 18 | bars[2].set_hatch(r"") 19 | axis.set_yticks(()) 20 | axis.set_frame_on(False) 21 | axis.set_ylim(-.1, 6) 22 | axis.set_xlim(-0.1, 20.1) 23 | axis.set_xticks(()) 24 | axis.tick_params(length=0, labeltop=True, labelbottom=False) 25 | axis.text(6, -.3, "training folds", 26 | fontdict={'fontsize': 14}, horizontalalignment="center") 27 | axis.text(13.5, -.3, "validation fold", 28 | fontdict={'fontsize': 14}, horizontalalignment="center") 29 | axis.text(17.5, -.3, "test set", 30 | fontdict={'fontsize': 14}, horizontalalignment="center") 31 | 32 | make_bracket("scaler fit", (7.5, 1.3), (7.5, 2.), 15, axes[0]) 33 | make_bracket("SVC fit", (6, 3), (6, 4), 12, axes[0]) 34 | make_bracket("SVC predict", (13.4, 3), (13.4, 4), 2.5, axes[0]) 35 | 36 | axes[0].set_title("Cross validation") 37 | axes[1].set_title("Test set prediction") 38 | 39 | make_bracket("scaler fit", (7.5, 1.3), (7.5, 2.), 15, axes[1]) 40 | make_bracket("SVC fit", (7.5, 3), (7.5, 4), 15, axes[1]) 41 | make_bracket("SVC predict", (17.5, 3), (17.5, 4), 4.8, axes[1]) 42 | 43 | 44 | def plot_proper_processing(): 45 | fig, axes = plt.subplots(2, 1, figsize=(15, 8)) 46 | 47 | for axis in axes: 48 | bars = axis.barh([0, 0, 0], [11.9, 2.9, 4.9], 49 | left=[0, 12, 15], color=['white', 'grey', 'grey'], hatch="//") 50 | bars[2].set_hatch(r"") 51 | axis.set_yticks(()) 52 | axis.set_frame_on(False) 53 | axis.set_ylim(-.1, 4.5) 54 | axis.set_xlim(-0.1, 20.1) 55 | axis.set_xticks(()) 56 | axis.tick_params(length=0, labeltop=True, labelbottom=False) 57 | 
axis.text(6, -.3, "training folds", fontdict={'fontsize': 14}, horizontalalignment="center") 58 | axis.text(13.5, -.3, "validation fold", fontdict={'fontsize': 14}, horizontalalignment="center") 59 | axis.text(17.5, -.3, "test set", fontdict={'fontsize': 14}, horizontalalignment="center") 60 | 61 | make_bracket("scaler fit", (6, 1.3), (6, 2.), 12, axes[0]) 62 | make_bracket("SVC fit", (6, 3), (6, 4), 12, axes[0]) 63 | make_bracket("SVC predict", (13.4, 3), (13.4, 4), 2.5, axes[0]) 64 | 65 | axes[0].set_title("Cross validation") 66 | axes[1].set_title("Test set prediction") 67 | 68 | make_bracket("scaler fit", (7.5, 1.3), (7.5, 2.), 15, axes[1]) 69 | make_bracket("SVC fit", (7.5, 3), (7.5, 4), 15, axes[1]) 70 | make_bracket("SVC predict", (17.5, 3), (17.5, 4), 4.8, axes[1]) 71 | fig.subplots_adjust(hspace=.3) 72 | -------------------------------------------------------------------------------- /src/mglearn/plot_interactive_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from sklearn.tree import DecisionTreeClassifier 5 | 6 | from sklearn.externals.six import StringIO # doctest: +SKIP 7 | from sklearn.tree import export_graphviz 8 | 9 | from scipy import ndimage 10 | from sklearn.datasets import make_moons 11 | from .plot_helpers import cm2 12 | 13 | import re 14 | 15 | 16 | def tree_image(tree, fout=None): 17 | from matplotlib.pyplot import imread 18 | try: 19 | import graphviz 20 | except ImportError: 21 | # make a hacky white plot 22 | x = np.ones((10, 10)) 23 | x[0, 0] = 0 24 | return x 25 | dot_data = StringIO() 26 | export_graphviz(tree, out_file=dot_data, max_depth=3, impurity=False) 27 | data = dot_data.getvalue() 28 | #data = re.sub(r"gini = 0\.[0-9]+\\n", "", dot_data.getvalue()) 29 | data = re.sub(r"samples = [0-9]+\\n", "", data) 30 | data = re.sub(r"\\nsamples = [0-9]+", "", data) 31 | data = re.sub(r"value", "counts", data) 32 | 33 | graph = graphviz.Source(data, format="png") 34 | if fout is None: 35 | fout = "tmpfile" 36 | graph.render(fout) 37 | return imread(fout + ".png") 38 | 39 | 40 | def plot_tree_progressive(): 41 | fig, axes = plt.subplots(4, 2, figsize=(15, 25), subplot_kw={'xticks': (), 'yticks': ()}) 42 | X, y = make_moons(n_samples=100, noise=0.25, random_state=3) 43 | 44 | for i, max_depth in enumerate([1, 2, 9]): 45 | tree = plot_tree(X, y, max_depth=max_depth, ax=axes[i + 1, 0]) 46 | axes[i + 1, 1].imshow(tree_image(tree)) 47 | axes[i + 1, 1].set_axis_off() 48 | axes[0, 1].set_visible(False) 49 | for ax in axes[:, 0]: 50 | ax.scatter(X[:, 0], X[:, 1], c=np.array(['r', 'b'])[y], s=60) 51 | X, y = make_moons(noise=0.3, random_state=0) 52 | 53 | 54 | def plot_tree_partition(X, y, tree, ax=None, cm=cm2): 55 | if ax is None: 56 | ax = plt.gca() 57 | eps = X.std() / 2. 
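# the dense grid built below is labelled with tree.predict for the filled contours, while tree.apply returns the index of the leaf each grid point falls into; a Laplace filter on those leaf indices is non-zero exactly where neighbouring grid cells land in different leaves, which traces the partition borders as tiny scatter dots.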
58 | 59 | x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps 60 | y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps 61 | xx = np.linspace(x_min, x_max, 1000) 62 | yy = np.linspace(y_min, y_max, 1000) 63 | 64 | X1, X2 = np.meshgrid(xx, yy) 65 | X_grid = np.c_[X1.ravel(), X2.ravel()] 66 | 67 | Z = tree.predict(X_grid) 68 | Z = Z.reshape(X1.shape) 69 | faces = tree.apply(X_grid) 70 | faces = faces.reshape(X1.shape) 71 | border = ndimage.laplace(faces) != 0 72 | ax.contourf(X1, X2, Z, alpha=.4, levels=[0, .5, 1], cmap=cm) 73 | ax.scatter(X1[border], X2[border], marker='.', s=1) 74 | 75 | ax.scatter(X[:, 0], X[:, 1], c=y, s=60, cmap=cm) 76 | ax.set_xlim(x_min, x_max) 77 | ax.set_ylim(y_min, y_max) 78 | ax.set_xticks(()) 79 | ax.set_yticks(()) 80 | return ax 81 | 82 | 83 | def plot_tree(X, y, max_depth=1, ax=None): 84 | tree = DecisionTreeClassifier(max_depth=max_depth, random_state=0).fit(X, y) 85 | ax = plot_tree_partition(X, y, tree, ax=ax) 86 | ax.set_title("depth = %d" % max_depth) 87 | return tree 88 | -------------------------------------------------------------------------------- /src/mglearn/plot_kmeans.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import make_blobs 2 | from sklearn.cluster import KMeans 3 | import matplotlib.pyplot as plt 4 | from .plot_2d_separator import plot_2d_classification 5 | from .plot_helpers import cm3 6 | 7 | 8 | def plot_kmeans_algorithm(): 9 | 10 | X, y = make_blobs(random_state=1) 11 | 12 | fig, axes = plt.subplots(2, 3, figsize=(10, 5), subplot_kw={'xticks': (), 'yticks': ()}) 13 | center_args = {'marker': '^', 'c': [0, 1, 2], 'cmap': cm3, 's': 100, 'linewidth': 2} 14 | 15 | axes[0, 0].set_title("Input data") 16 | axes[0, 0].scatter(X[:, 0], X[:, 1], c='w', s=60) 17 | 18 | axes[0, 1].set_title("Initialization") 19 | init = X[:3, :] 20 | axes[0, 1].scatter(X[:, 0], X[:, 1], c='w', s=60) 21 | axes[0, 1].scatter(init[:, 0], init[:, 1], **center_args) 22 | 23 | axes[0, 2].set_title("Assign Points (1)") 24 | km = KMeans(n_clusters=3, init=init, max_iter=1, n_init=1).fit(X) 25 | centers = km.cluster_centers_ 26 | axes[0, 2].scatter(X[:, 0], X[:, 1], c=km.labels_, cmap=cm3, alpha=.6, s=60) 27 | axes[0, 2].scatter(init[:, 0], init[:, 1], **center_args) 28 | 29 | axes[1, 0].set_title("Recompute Centers (1)") 30 | axes[1, 0].scatter(X[:, 0], X[:, 1], c=km.labels_, cmap=cm3, alpha=.6, s=60) 31 | axes[1, 0].scatter(centers[:, 0], centers[:, 1], **center_args) 32 | 33 | axes[1, 1].set_title("Reassign Points (2)") 34 | km = KMeans(n_clusters=3, init=init, max_iter=2, n_init=1).fit(X) 35 | axes[1, 1].scatter(X[:, 0], X[:, 1], c=km.labels_, cmap=cm3, alpha=.6, s=60) 36 | axes[1, 1].scatter(centers[:, 0], centers[:, 1], **center_args) 37 | 38 | axes[1, 2].set_title("Recompute Centers (2)") 39 | centers = km.cluster_centers_ 40 | axes[1, 2].scatter(X[:, 0], X[:, 1], c=km.labels_, cmap=cm3, alpha=.6, s=60) 41 | axes[1, 2].scatter(centers[:, 0], centers[:, 1], **center_args) 42 | 43 | 44 | def plot_kmeans_boundaries(): 45 | X, y = make_blobs(random_state=1) 46 | init = X[:3, :] 47 | km = KMeans(n_clusters=3, init=init, max_iter=2, n_init=1).fit(X) 48 | plt.scatter(X[:, 0], X[:, 1], c=km.labels_, cmap=cm3, alpha=.6, s=60) 49 | plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], 50 | marker='^', c=[0, 1, 2], cmap=cm3, s=100, linewidth=2) 51 | plot_2d_classification(km, X, cm=cm3, alpha=.4) 52 | -------------------------------------------------------------------------------- 
/src/mglearn/plot_kneighbors_regularization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from sklearn.neighbors import KNeighborsRegressor 5 | 6 | 7 | 8 | def plot_regression_datasets(): 9 | fig, axes = plt.subplots(1, 3, figsize=(15, 5)) 10 | for n_samples, ax in zip([10, 100, 1000], axes): 11 | x, y = make_dataset(n_samples) 12 | ax.plot(x, y, 'o', alpha=.6) 13 | 14 | 15 | def plot_kneighbors_regularization(): 16 | rnd = np.random.RandomState(42) 17 | x = np.linspace(-3, 3, 100) 18 | y_no_noise = np.sin(4 * x) + x 19 | y = y_no_noise + rnd.normal(size=len(x)) 20 | X = x[:, np.newaxis] 21 | fig, axes = plt.subplots(1, 3, figsize=(15, 5)) 22 | 23 | x_test = np.linspace(-3, 3, 1000) 24 | 25 | for n_neighbors, ax in zip([2, 5, 20], axes.ravel()): 26 | kneighbor_regression = KNeighborsRegressor(n_neighbors=n_neighbors) 27 | kneighbor_regression.fit(X, y) 28 | ax.plot(x, y_no_noise, label="true function") 29 | ax.plot(x, y, "o", label="data") 30 | ax.plot(x_test, kneighbor_regression.predict(x_test[:, np.newaxis]), 31 | label="prediction") 32 | ax.legend() 33 | ax.set_title("n_neighbors = %d" % n_neighbors) 34 | 35 | if __name__ == "__main__": 36 | plot_kneighbors_regularization() 37 | plt.show() 38 | -------------------------------------------------------------------------------- /src/mglearn/plot_knn_classification.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from matplotlib.colors import ListedColormap 4 | 5 | from sklearn.metrics import euclidean_distances 6 | from sklearn.neighbors import KNeighborsClassifier 7 | 8 | from .datasets import make_forge 9 | 10 | cm = ListedColormap(["#FF0000", "#0000FF"]) 11 | 12 | 13 | def plot_knn_classification(n_neighbors=1): 14 | X, y = make_forge() 15 | 16 | X_test = np.array([[8.2, 3.66214339], [9.9, 3.2], [11.2, .5]]) 17 | dist = euclidean_distances(X, X_test) 18 | closest = np.argsort(dist, axis=0) 19 | 20 | for x, neighbors in zip(X_test, closest.T): 21 | for neighbor in neighbors[:n_neighbors]: 22 | plt.arrow(x[0], x[1], X[neighbor, 0] - x[0], 23 | X[neighbor, 1] - x[1], head_width=0, fc='k', ec='k') 24 | 25 | clf = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X, y) 26 | plt.scatter(X_test[:, 0], X_test[:, 1], marker="x", s=60, 27 | c=clf.predict(X_test), cmap=cm) 28 | plt.scatter(X[:, 0], X[:, 1], c=y, s=60, linewidth=0, cmap=cm) 29 | -------------------------------------------------------------------------------- /src/mglearn/plot_knn_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from sklearn.neighbors import KNeighborsRegressor 5 | from sklearn.metrics import euclidean_distances 6 | 7 | from .datasets import make_wave 8 | 9 | 10 | def plot_knn_regression(n_neighbors=1): 11 | X, y = make_wave(n_samples=40) 12 | X_test = np.array([[-1.5], [0.9], [1.5]]) 13 | 14 | dist = euclidean_distances(X, X_test) 15 | closest = np.argsort(dist, axis=0) 16 | 17 | plt.figure(figsize=(10, 6)) 18 | 19 | reg = KNeighborsRegressor(n_neighbors=n_neighbors).fit(X, y) 20 | y_pred = reg.predict(X_test) 21 | 22 | for x, y_, neighbors in zip(X_test, y_pred, closest.T): 23 | for neighbor in neighbors[:n_neighbors]: 24 | plt.arrow(x[0], y_, X[neighbor, 0] - x[0], y[neighbor] - y_, 25 | head_width=0, fc='k', ec='k') 26 | 27 | plt.plot(X, y, 'o') 28 
| plt.plot(X, -3 * np.ones(len(X)), 'o') 29 | plt.plot(X_test, -3 * np.ones(len(X_test)), 'x', c='g', markersize=20) 30 | plt.plot(X_test, y_pred, 'x', c='b', markersize=20) 31 | 32 | plt.ylim(-3.1, 3.1) 33 | -------------------------------------------------------------------------------- /src/mglearn/plot_linear_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from sklearn.linear_model import LinearRegression 5 | try: 6 | from sklearn.model_selection import train_test_split 7 | except: 8 | from sklearn.cross_validation import train_test_split 9 | from .datasets import make_wave 10 | 11 | 12 | def plot_linear_regression_wave(): 13 | X, y = make_wave(n_samples=60) 14 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) 15 | 16 | line = np.linspace(-3, 3, 100).reshape(-1, 1) 17 | 18 | lr = LinearRegression().fit(X_train, y_train) 19 | print("w[0]: %f b: %f" % (lr.coef_[0], lr.intercept_)) 20 | 21 | plt.figure(figsize=(8, 8)) 22 | plt.plot(X, y, 'o') 23 | plt.plot(X, -3 * np.ones(len(X)), 'o') 24 | plt.plot(line, lr.predict(line)) 25 | ax = plt.gca() 26 | ax.spines['left'].set_position('center') 27 | ax.spines['right'].set_color('none') 28 | ax.spines['bottom'].set_position('center') 29 | ax.spines['top'].set_color('none') 30 | ax.set_ylim(-3, 3) 31 | ax.grid(True) 32 | ax.set_aspect('equal') 33 | 34 | -------------------------------------------------------------------------------- /src/mglearn/plot_linear_svc_regularization.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn.svm import SVC 4 | from sklearn.datasets import make_blobs 5 | 6 | 7 | def plot_linear_svc_regularization(): 8 | X, y = make_blobs(centers=2, random_state=4, n_samples=30) 9 | fig, axes = plt.subplots(1, 3, figsize=(12, 4)) 10 | 11 | # a carefully hand-designed dataset lol 12 | y[7] = 0 13 | y[27] = 0 14 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 15 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 16 | 17 | for ax, C in zip(axes, [1e-2, 1, 1e2]): 18 | ax.scatter(X[:, 0], X[:, 1], s=60, c=np.array(['red', 'blue'])[y]) 19 | 20 | svm = SVC(kernel='linear', C=C, tol=0.00001).fit(X, y) 21 | w = svm.coef_[0] 22 | a = -w[0] / w[1] 23 | xx = np.linspace(6, 13) 24 | yy = a * xx - (svm.intercept_[0]) / w[1] 25 | ax.plot(xx, yy, label="C = %.e" % C, c='k') 26 | ax.set_xlim(x_min, x_max) 27 | ax.set_ylim(y_min, y_max) 28 | ax.set_xticks(()) 29 | ax.set_yticks(()) 30 | ax.set_title("C = %f" % C) 31 | 32 | if __name__ == "__main__": 33 | plot_linear_svc_regularization() 34 | plt.show() 35 | -------------------------------------------------------------------------------- /src/mglearn/plot_metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from .tools import plot_2d_separator, plot_2d_scores, cm 5 | 6 | 7 | def plot_confusion_matrix_illustration(): 8 | confusion = np.array([[401, 2], [8, 39]]) 9 | plt.title("confusion_matrix") 10 | plt.text(0.45, .6, confusion[0, 0], size=70, horizontalalignment='right') 11 | plt.text(0.45, .1, confusion[1, 0], size=70, horizontalalignment='right') 12 | plt.text(.95, .6, confusion[0, 1], size=70, horizontalalignment='right') 13 | plt.text(.95, 0.1, confusion[1, 1], size=70, horizontalalignment='right') 14 | plt.xticks([.25, .75], 
["predicted 'not 4'", "predicted '4'"], size=20) 15 | plt.yticks([.25, .75], ["true '4'", "true 'not 4'"], size=20) 16 | plt.plot([.5, .5], [0, 1], '--', c='k') 17 | plt.plot([0, 1], [.5, .5], '--', c='k') 18 | 19 | plt.xlim(0, 1) 20 | plt.ylim(0, 1) 21 | 22 | 23 | def plot_binary_confusion_matrix(): 24 | plt.title("binary_confusion_matrix_tp_fp") 25 | plt.text(0.45, .6, "TN", size=100, horizontalalignment='right') 26 | plt.text(0.45, .1, "FN", size=100, horizontalalignment='right') 27 | plt.text(.95, .6, "FP", size=100, horizontalalignment='right') 28 | plt.text(.95, 0.1, "TP", size=100, horizontalalignment='right') 29 | plt.xticks([.25, .75], ["predicted negative", "predicted positive"], size=15) 30 | plt.yticks([.25, .75], ["positive class", "negative class"], size=15) 31 | plt.plot([.5, .5], [0, 1], '--', c='k') 32 | plt.plot([0, 1], [.5, .5], '--', c='k') 33 | 34 | plt.xlim(0, 1) 35 | plt.ylim(0, 1) 36 | 37 | 38 | def plot_decision_threshold(): 39 | from .datasets import make_blobs 40 | from sklearn.svm import SVC 41 | try: 42 | from sklearn.model_selection import train_test_split 43 | except: 44 | from sklearn.cross_validation import train_test_split 45 | 46 | X, y = make_blobs(n_samples=(400, 50), centers=2, cluster_std=[7.0, 2], 47 | random_state=22) 48 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 49 | 50 | fig, axes = plt.subplots(2, 3, figsize=(15, 8)) 51 | plt.suptitle("decision_threshold") 52 | axes[0, 0].set_title("training data") 53 | axes[0, 0].scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm) 54 | 55 | svc = SVC(gamma=.05).fit(X_train, y_train) 56 | axes[0, 1].set_title("decision with threshold 0") 57 | axes[0, 1].scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm) 58 | plot_2d_scores(svc, X_train, function="decision_function", alpha=.7, 59 | ax=axes[0, 1]) 60 | plot_2d_separator(svc, X_train, linewidth=3, ax=axes[0, 1]) 61 | axes[0, 2].set_title("decision with threshold -0.8") 62 | axes[0, 2].scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm) 63 | plot_2d_separator(svc, X_train, linewidth=3, ax=axes[0, 2], threshold=-.8) 64 | plot_2d_scores(svc, X_train, function="decision_function", alpha=.7, 65 | ax=axes[0, 2]) 66 | 67 | axes[1, 0].set_visible(False) 68 | 69 | mask = np.abs(X_train[:, 1] - 7) < 5 70 | bla = np.sum(mask) 71 | 72 | line = np.linspace(X_train.min(), X_train.max(), 100) 73 | axes[1, 1].set_title("Cross-section with threshold 0") 74 | axes[1, 1].plot(line, svc.decision_function(np.c_[line, 10 * np.ones(100)]), c='k') 75 | contour = (svc.decision_function(np.c_[line, 10 * np.ones(100)]) > 0).reshape(1, -1).repeat(10, axis=0) 76 | axes[1, 1].contourf(line, np.linspace(-1.5, 1.5, 10), contour, alpha=0.2, cmap=cm) 77 | axes[1, 1].scatter(X_train[mask, 0], np.zeros(bla), c=y_train[mask], cmap=cm, alpha=.1, s=100) 78 | axes[1, 1].set_xlim(X_train.min(), X_train.max()) 79 | axes[1, 1].set_ylim(-1.5, 1.5) 80 | axes[1, 1].set_xticks(()) 81 | axes[1, 1].set_ylabel("Decision value") 82 | 83 | contour2 = (svc.decision_function(np.c_[line, 10 * np.ones(100)]) > -.8).reshape(1, -1).repeat(10, axis=0) 84 | axes[1, 2].set_title("Cross-section with threshold -0.8") 85 | axes[1, 2].contourf(line, np.linspace(-1.5, 1.5, 10), contour2, alpha=0.2, cmap=cm) 86 | axes[1, 2].scatter(X_train[mask, 0], np.zeros(bla), c=y_train[mask], cmap=cm, alpha=.1, s=100) 87 | axes[1, 2].plot(line, svc.decision_function(np.c_[line, 10 * np.ones(100)]), c='k') 88 | axes[1, 2].set_xlim(X_train.min(), X_train.max()) 89 | axes[1, 2].set_ylim(-1.5, 
1.5) 90 | axes[1, 2].set_xticks(()) 91 | axes[1, 2].set_ylabel("Decision value") 92 | -------------------------------------------------------------------------------- /src/mglearn/plot_nmf.py: -------------------------------------------------------------------------------- 1 | from sklearn.decomposition import NMF 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | from sklearn.externals.joblib import Memory 6 | 7 | memory = Memory(cachedir="cache") 8 | 9 | 10 | def plot_nmf_illustration(): 11 | rnd = np.random.RandomState(5) 12 | X_ = rnd.normal(size=(300, 2)) 13 | # Add 8 to make sure every point lies in the positive part of the space 14 | X_blob = np.dot(X_, rnd.normal(size=(2, 2))) + rnd.normal(size=2) + 8 15 | 16 | nmf = NMF(random_state=0) 17 | nmf.fit(X_blob) 18 | X_nmf = nmf.transform(X_blob) 19 | 20 | fig, axes = plt.subplots(1, 2, figsize=(15, 5)) 21 | 22 | axes[0].scatter(X_blob[:, 0], X_blob[:, 1], c=X_nmf[:, 0], linewidths=0, s=60, cmap='viridis') 23 | axes[0].set_xlabel("feature 1") 24 | axes[0].set_ylabel("feature 2") 25 | axes[0].arrow(0, 0, nmf.components_[0, 0], nmf.components_[0, 1], width=.1, 26 | head_width=.3, color='k') 27 | axes[0].arrow(0, 0, nmf.components_[1, 0], nmf.components_[1, 1], width=.1, 28 | head_width=.3, color='k') 29 | axes[0].set_aspect('equal') 30 | axes[0].set_title("NMF with two components") 31 | 32 | # second plot 33 | nmf = NMF(random_state=0, n_components=1) 34 | nmf.fit(X_blob) 35 | 36 | axes[1].scatter(X_blob[:, 0], X_blob[:, 1], c=X_nmf[:, 0], linewidths=0, 37 | s=60, cmap='viridis') 38 | axes[1].set_xlabel("feature 1") 39 | axes[1].set_ylabel("feature 2") 40 | axes[1].arrow(0, 0, nmf.components_[0, 0], nmf.components_[0, 1], width=.1, 41 | head_width=.3, color='k') 42 | 43 | axes[1].set_aspect('equal') 44 | axes[1].set_title("NMF with one component") 45 | 46 | 47 | @memory.cache 48 | def nmf_faces(X_train, X_test): 49 | # Build NMF models with 10, 50, 100 and 500 components 50 | # this list will hold the back-transformd test-data 51 | reduced_images = [] 52 | for n_components in [10, 50, 100, 500]: 53 | # build the NMF model 54 | nmf = NMF(n_components=n_components, random_state=0) 55 | nmf.fit(X_train) 56 | # transform the test data (afterwards has n_components many dimensions) 57 | X_test_nmf = nmf.transform(X_test) 58 | # back-transform the transformed test-data 59 | # (afterwards it's in the original space again) 60 | X_test_back = np.dot(X_test_nmf, nmf.components_) 61 | reduced_images.append(X_test_back) 62 | return reduced_images 63 | 64 | 65 | def plot_nmf_faces(X_train, X_test, image_shape): 66 | reduced_images = nmf_faces(X_train, X_test) 67 | 68 | # plot the first three images in the test set: 69 | fix, axes = plt.subplots(3, 5, figsize=(15, 12), 70 | subplot_kw={'xticks': (), 'yticks': ()}) 71 | for i, ax in enumerate(axes): 72 | # plot original image 73 | ax[0].imshow(X_test[i].reshape(image_shape), 74 | vmin=0, vmax=1) 75 | # plot the four back-transformed images 76 | for a, X_test_back in zip(ax[1:], reduced_images): 77 | a.imshow(X_test_back[i].reshape(image_shape), vmin=0, vmax=1) 78 | 79 | # label the top row 80 | axes[0, 0].set_title("original image") 81 | for ax, n_components in zip(axes[0, 1:], [10, 50, 100, 500]): 82 | ax.set_title("%d components" % n_components) 83 | -------------------------------------------------------------------------------- /src/mglearn/plot_nn_graphs.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def 
plot_logistic_regression_graph(): 4 | import graphviz 5 | lr_graph = graphviz.Digraph(node_attr={'shape': 'circle', 'fixedsize': 'True'}, 6 | graph_attr={'rankdir': 'LR', 'splines': 'line'}) 7 | inputs = graphviz.Digraph(node_attr={'shape': 'circle'}, name="cluster_0") 8 | output = graphviz.Digraph(node_attr={'shape': 'circle'}, name="cluster_2") 9 | 10 | for i in range(4): 11 | inputs.node("x[%d]" % i, labelloc="c") 12 | inputs.body.append('label = "inputs"') 13 | inputs.body.append('color = "white"') 14 | 15 | lr_graph.subgraph(inputs) 16 | 17 | output.body.append('label = "output"') 18 | output.body.append('color = "white"') 19 | output.node("y") 20 | 21 | lr_graph.subgraph(output) 22 | 23 | for i in range(4): 24 | lr_graph.edge("x[%d]" % i, "y", label="w[%d]" % i) 25 | return lr_graph 26 | 27 | 28 | def plot_single_hidden_layer_graph(): 29 | import graphviz 30 | nn_graph = graphviz.Digraph(node_attr={'shape': 'circle', 'fixedsize': 'True'}, 31 | graph_attr={'rankdir': 'LR', 'splines': 'line'}) 32 | 33 | inputs = graphviz.Digraph(node_attr={'shape': 'circle'}, name="cluster_0") 34 | hidden = graphviz.Digraph(node_attr={'shape': 'circle'}, name="cluster_1") 35 | output = graphviz.Digraph(node_attr={'shape': 'circle'}, name="cluster_2") 36 | 37 | for i in range(4): 38 | inputs.node("x[%d]" % i) 39 | 40 | inputs.body.append('label = "inputs"') 41 | inputs.body.append('color = "white"') 42 | 43 | hidden.body.append('label = "hidden layer"') 44 | hidden.body.append('color = "white"') 45 | 46 | for i in range(3): 47 | hidden.node("h%d" % i, label="h[%d]" % i) 48 | 49 | output.node("y") 50 | output.body.append('label = "output"') 51 | output.body.append('color = "white"') 52 | 53 | nn_graph.subgraph(inputs) 54 | nn_graph.subgraph(hidden) 55 | nn_graph.subgraph(output) 56 | 57 | for i in range(4): 58 | for j in range(3): 59 | nn_graph.edge("x[%d]" % i, "h%d" % j) 60 | 61 | for i in range(3): 62 | nn_graph.edge("h%d" % i, "y") 63 | return nn_graph 64 | 65 | 66 | def plot_two_hidden_layer_graph(): 67 | import graphviz 68 | nn_graph = graphviz.Digraph(node_attr={'shape': 'circle', 'fixedsize': 'True'}, 69 | graph_attr={'rankdir': 'LR', 'splines': 'line'}) 70 | 71 | inputs = graphviz.Digraph(node_attr={'shape': 'circle'}, name="cluster_0") 72 | hidden = graphviz.Digraph(node_attr={'shape': 'circle'}, name="cluster_1") 73 | hidden2 = graphviz.Digraph(node_attr={'shape': 'circle'}, name="cluster_2") 74 | 75 | output = graphviz.Digraph(node_attr={'shape': 'circle'}, name="cluster_3") 76 | 77 | for i in range(4): 78 | inputs.node("x[%d]" % i) 79 | 80 | inputs.body.append('label = "inputs"') 81 | inputs.body.append('color = "white"') 82 | 83 | for i in range(3): 84 | hidden.node("h1[%d]" % i) 85 | 86 | for i in range(3): 87 | hidden2.node("h2[%d]" % i) 88 | 89 | hidden.body.append('label = "hidden layer 1"') 90 | hidden.body.append('color = "white"') 91 | 92 | hidden2.body.append('label = "hidden layer 2"') 93 | hidden2.body.append('color = "white"') 94 | 95 | output.node("y") 96 | output.body.append('label = "output"') 97 | output.body.append('color = "white"') 98 | 99 | nn_graph.subgraph(inputs) 100 | nn_graph.subgraph(hidden) 101 | nn_graph.subgraph(hidden2) 102 | 103 | nn_graph.subgraph(output) 104 | 105 | for i in range(4): 106 | for j in range(3): 107 | nn_graph.edge("x[%d]" % i, "h1[%d]" % j, label="") 108 | 109 | for i in range(3): 110 | for j in range(3): 111 | nn_graph.edge("h1[%d]" % i, "h2[%d]" % j, label="") 112 | 113 | for i in range(3): 114 | nn_graph.edge("h2[%d]" % i, "y", label="") 115 
| 116 | return nn_graph 117 | -------------------------------------------------------------------------------- /src/mglearn/plot_pca.py: -------------------------------------------------------------------------------- 1 | from sklearn.decomposition import PCA 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | from sklearn.externals.joblib import Memory 6 | 7 | memory = Memory(cachedir="cache") 8 | 9 | 10 | def plot_pca_illustration(): 11 | rnd = np.random.RandomState(5) 12 | X_ = rnd.normal(size=(300, 2)) 13 | X_blob = np.dot(X_, rnd.normal(size=(2, 2))) + rnd.normal(size=2) 14 | 15 | pca = PCA() 16 | pca.fit(X_blob) 17 | X_pca = pca.transform(X_blob) 18 | 19 | S = X_pca.std(axis=0) 20 | 21 | fig, axes = plt.subplots(2, 2, figsize=(10, 10)) 22 | axes = axes.ravel() 23 | 24 | axes[0].set_title("Original data") 25 | axes[0].scatter(X_blob[:, 0], X_blob[:, 1], c=X_pca[:, 0], linewidths=0, 26 | s=60, cmap='viridis') 27 | axes[0].set_xlabel("feature 1") 28 | axes[0].set_ylabel("feature 2") 29 | axes[0].arrow(pca.mean_[0], pca.mean_[1], S[0] * pca.components_[0, 0], 30 | S[0] * pca.components_[0, 1], width=.1, head_width=.3, 31 | color='k') 32 | axes[0].arrow(pca.mean_[0], pca.mean_[1], S[1] * pca.components_[1, 0], 33 | S[1] * pca.components_[1, 1], width=.1, head_width=.3, 34 | color='k') 35 | axes[0].text(-1.5, -.5, "Component 2", size=14) 36 | axes[0].text(-4, -4, "Component 1", size=14) 37 | axes[0].set_aspect('equal') 38 | 39 | axes[1].set_title("Transformed data") 40 | axes[1].scatter(X_pca[:, 0], X_pca[:, 1], c=X_pca[:, 0], linewidths=0, 41 | s=60, cmap='viridis') 42 | axes[1].set_xlabel("First principal component") 43 | axes[1].set_ylabel("Second principal component") 44 | axes[1].set_aspect('equal') 45 | axes[1].set_ylim(-8, 8) 46 | 47 | pca = PCA(n_components=1) 48 | pca.fit(X_blob) 49 | X_inverse = pca.inverse_transform(pca.transform(X_blob)) 50 | 51 | axes[2].set_title("Transformed data w/ second component dropped") 52 | axes[2].scatter(X_pca[:, 0], np.zeros(X_pca.shape[0]), c=X_pca[:, 0], 53 | linewidths=0, s=60, cmap='viridis') 54 | axes[2].set_xlabel("First principal component") 55 | axes[2].set_aspect('equal') 56 | axes[2].set_ylim(-8, 8) 57 | 58 | axes[3].set_title("Back-rotation using only first component") 59 | axes[3].scatter(X_inverse[:, 0], X_inverse[:, 1], c=X_pca[:, 0], 60 | linewidths=0, s=60, cmap='viridis') 61 | axes[3].set_xlabel("feature 1") 62 | axes[3].set_ylabel("feature 2") 63 | axes[3].set_aspect('equal') 64 | axes[3].set_xlim(-8, 4) 65 | axes[3].set_ylim(-8, 4) 66 | 67 | 68 | def plot_pca_whitening(): 69 | rnd = np.random.RandomState(5) 70 | X_ = rnd.normal(size=(300, 2)) 71 | X_blob = np.dot(X_, rnd.normal(size=(2, 2))) + rnd.normal(size=2) 72 | 73 | pca = PCA(whiten=True) 74 | pca.fit(X_blob) 75 | X_pca = pca.transform(X_blob) 76 | 77 | fig, axes = plt.subplots(1, 2, figsize=(10, 10)) 78 | axes = axes.ravel() 79 | 80 | axes[0].set_title("Original data") 81 | axes[0].scatter(X_blob[:, 0], X_blob[:, 1], c=X_pca[:, 0], linewidths=0, s=60, cmap='viridis') 82 | axes[0].set_xlabel("feature 1") 83 | axes[0].set_ylabel("feature 2") 84 | axes[0].set_aspect('equal') 85 | 86 | axes[1].set_title("Whitened data") 87 | axes[1].scatter(X_pca[:, 0], X_pca[:, 1], c=X_pca[:, 0], linewidths=0, s=60, cmap='viridis') 88 | axes[1].set_xlabel("First principal component") 89 | axes[1].set_ylabel("Second principal component") 90 | axes[1].set_aspect('equal') 91 | axes[1].set_xlim(-3, 4) 92 | 93 | 94 | @memory.cache 95 | def pca_faces(X_train, X_test): 96 | # copy and 
pasted from nmf. refactor? 97 | # Build NMF models with 10, 50, 100, 500 and 2000 components 98 | # this list will hold the back-transformd test-data 99 | reduced_images = [] 100 | for n_components in [10, 50, 100, 500, 2000]: 101 | # build the NMF model 102 | pca = PCA(n_components=n_components) 103 | pca.fit(X_train) 104 | # transform the test data (afterwards has n_components many dimensions) 105 | X_test_pca = pca.transform(X_test) 106 | # back-transform the transformed test-data 107 | # (afterwards it's in the original space again) 108 | X_test_back = pca.inverse_transform(X_test_pca) 109 | reduced_images.append(X_test_back) 110 | return reduced_images 111 | 112 | 113 | def plot_pca_faces(X_train, X_test, image_shape): 114 | reduced_images = pca_faces(X_train, X_test) 115 | 116 | # plot the first three images in the test set: 117 | fix, axes = plt.subplots(3, 5, figsize=(15, 12), 118 | subplot_kw={'xticks': (), 'yticks': ()}) 119 | for i, ax in enumerate(axes): 120 | # plot original image 121 | ax[0].imshow(X_test[i].reshape(image_shape), 122 | vmin=0, vmax=1) 123 | # plot the four back-transformed images 124 | for a, X_test_back in zip(ax[1:], reduced_images): 125 | a.imshow(X_test_back[i].reshape(image_shape), vmin=0, vmax=1) 126 | 127 | # label the top row 128 | axes[0, 0].set_title("original image") 129 | for ax, n_components in zip(axes[0, 1:], [10, 50, 100, 500, 2000]): 130 | ax.set_title("%d components" % n_components) 131 | -------------------------------------------------------------------------------- /src/mglearn/plot_rbf_svm_parameters.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn.svm import SVC 4 | from .plot_2d_separator import plot_2d_separator 5 | from .tools import make_handcrafted_dataset 6 | 7 | 8 | def plot_rbf_svm_parameters(): 9 | X, y = make_handcrafted_dataset() 10 | 11 | fig, axes = plt.subplots(1, 3, figsize=(12, 4)) 12 | for ax, C in zip(axes, [1e0, 5, 10, 100]): 13 | ax.scatter(X[:, 0], X[:, 1], s=60, c=np.array(['red', 'blue'])[y]) 14 | 15 | svm = SVC(kernel='rbf', C=C).fit(X, y) 16 | plot_2d_separator(svm, X, ax=ax, eps=.5) 17 | ax.set_title("C = %f" % C) 18 | 19 | fig, axes = plt.subplots(1, 4, figsize=(15, 3)) 20 | for ax, gamma in zip(axes, [0.1, .5, 1, 10]): 21 | ax.scatter(X[:, 0], X[:, 1], s=60, c=np.array(['red', 'blue'])[y]) 22 | svm = SVC(gamma=gamma, kernel='rbf', C=1).fit(X, y) 23 | plot_2d_separator(svm, X, ax=ax, eps=.5) 24 | ax.set_title("gamma = %f" % gamma) 25 | 26 | 27 | def plot_svm(log_C, log_gamma, ax=None): 28 | X, y = make_handcrafted_dataset() 29 | C = 10. ** log_C 30 | gamma = 10. 
** log_gamma 31 | svm = SVC(kernel='rbf', C=C, gamma=gamma).fit(X, y) 32 | if ax is None: 33 | ax = plt.gca() 34 | plot_2d_separator(svm, X, ax=ax, eps=.5) 35 | # plot data 36 | ax.scatter(X[:, 0], X[:, 1], s=60, c=np.array(['red', 'blue'])[y]) 37 | # plot support vectors 38 | sv = svm.support_vectors_ 39 | ax.scatter(sv[:, 0], sv[:, 1], s=200, facecolors='none', zorder=10, linewidth=3) 40 | ax.set_title("C = %.4f gamma = %.4f" % (C, gamma)) 41 | 42 | 43 | def plot_svm_interactive(): 44 | from IPython.html.widgets import interactive, FloatSlider 45 | C_slider = FloatSlider(min=-3, max=3, step=.1, value=0, readout=False) 46 | gamma_slider = FloatSlider(min=-2, max=2, step=.1, value=0, readout=False) 47 | return interactive(plot_svm, log_C=C_slider, log_gamma=gamma_slider) 48 | -------------------------------------------------------------------------------- /src/mglearn/plot_scaling.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn.datasets import make_blobs 4 | from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer, RobustScaler 5 | from .plot_helpers import cm2 6 | 7 | 8 | def plot_scaling(): 9 | X, y = make_blobs(n_samples=50, centers=2, random_state=4, cluster_std=1) 10 | X += 3 11 | 12 | plt.figure(figsize=(15, 8)) 13 | main_ax = plt.subplot2grid((2, 4), (0, 0), rowspan=2, colspan=2) 14 | 15 | main_ax.scatter(X[:, 0], X[:, 1], c=y, cmap=cm2, s=60) 16 | maxx = np.abs(X[:, 0]).max() 17 | maxy = np.abs(X[:, 1]).max() 18 | 19 | main_ax.set_xlim(-maxx + 1, maxx + 1) 20 | main_ax.set_ylim(-maxy + 1, maxy + 1) 21 | main_ax.set_title("Original Data") 22 | other_axes = [plt.subplot2grid((2, 4), (i, j)) for j in range(2, 4) for i in range(2)] 23 | 24 | for ax, scaler in zip(other_axes, [StandardScaler(), RobustScaler(), 25 | MinMaxScaler(), Normalizer(norm='l2')]): 26 | X_ = scaler.fit_transform(X) 27 | ax.scatter(X_[:, 0], X_[:, 1], c=y, cmap=cm2, s=60) 28 | ax.set_xlim(-2, 2) 29 | ax.set_ylim(-2, 2) 30 | ax.set_title(type(scaler).__name__) 31 | 32 | other_axes.append(main_ax) 33 | 34 | for ax in other_axes: 35 | ax.spines['left'].set_position('center') 36 | ax.spines['right'].set_color('none') 37 | ax.spines['bottom'].set_position('center') 38 | ax.spines['top'].set_color('none') 39 | ax.xaxis.set_ticks_position('bottom') 40 | ax.yaxis.set_ticks_position('left') 41 | -------------------------------------------------------------------------------- /src/mglearn/plot_tree_nonmonotonous.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from sklearn.datasets import make_blobs 3 | from sklearn.tree import DecisionTreeClassifier, export_graphviz 4 | from matplotlib.colors import ListedColormap 5 | 6 | 7 | cm = ListedColormap(["#FF0000", "#0000FF"]) 8 | 9 | 10 | def plot_tree_not_monotone(): 11 | import graphviz 12 | # make a simple 2d dataset 13 | X, y = make_blobs(centers=4, random_state=8) 14 | y = y % 2 15 | plt.figure() 16 | plt.scatter(X[:, 0], X[:, 1], c=y, s=60, cmap=cm) 17 | 18 | # learn a decision tree model 19 | tree = DecisionTreeClassifier(random_state=0).fit(X, y) 20 | 21 | # visualize the tree 22 | export_graphviz(tree, out_file="mytree.dot", impurity=False, filled=True) 23 | with open("mytree.dot") as f: 24 | dot_graph = f.read() 25 | print("Feature importances: %s" % tree.feature_importances_) 26 | return graphviz.Source(dot_graph) 27 | 
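# A minimal usage sketch (not part of the original module; it assumes the mglearn
# package directory is importable and that both the graphviz binaries and the
# python-graphviz package are installed). plot_tree_not_monotone() writes "mytree.dot",
# draws a matplotlib scatter of the blobs, and returns a graphviz.Source object, which
# renders inline in a Jupyter notebook; from a plain script it can be saved instead:
#
#     from mglearn.plot_tree_nonmonotonous import plot_tree_not_monotone
#     graph = plot_tree_not_monotone()
#     graph.render("tree_not_monotone")  # hypothetical output name; a PDF by default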
-------------------------------------------------------------------------------- /src/mglearn/plots.py: -------------------------------------------------------------------------------- 1 | from .plot_linear_svc_regularization import plot_linear_svc_regularization 2 | from .plot_interactive_tree import plot_tree_progressive, plot_tree_partition 3 | from .plot_animal_tree import plot_animal_tree 4 | from .plot_rbf_svm_parameters import plot_svm 5 | from .plot_knn_regression import plot_knn_regression 6 | from .plot_knn_classification import plot_knn_classification 7 | from .plot_2d_separator import plot_2d_classification, plot_2d_separator 8 | from .plot_nn_graphs import (plot_logistic_regression_graph, 9 | plot_single_hidden_layer_graph, 10 | plot_two_hidden_layer_graph) 11 | from .plot_linear_regression import plot_linear_regression_wave 12 | from .plot_tree_nonmonotonous import plot_tree_not_monotone 13 | from .plot_scaling import plot_scaling 14 | from .plot_pca import plot_pca_illustration, plot_pca_whitening, plot_pca_faces 15 | from .plot_decomposition import plot_decomposition 16 | from .plot_nmf import plot_nmf_illustration, plot_nmf_faces 17 | from .plot_helpers import cm2, cm3 18 | from .plot_agglomerative import plot_agglomerative, plot_agglomerative_algorithm 19 | from .plot_kmeans import plot_kmeans_algorithm, plot_kmeans_boundaries 20 | from .plot_improper_preprocessing import plot_improper_processing, plot_proper_processing 21 | from .plot_cross_validation import (plot_threefold_split, plot_label_kfold, 22 | plot_shuffle_split, plot_cross_validation, 23 | plot_stratified_cross_validation) 24 | 25 | from .plot_grid_search import plot_grid_search_overview, plot_cross_val_selection 26 | from .plot_metrics import (plot_confusion_matrix_illustration, 27 | plot_binary_confusion_matrix, 28 | plot_decision_threshold) 29 | 30 | __all__ = ['plot_linear_svc_regularization', 31 | "plot_animal_tree", "plot_tree_progressive", 32 | 'plot_tree_partition', 'plot_svm', 33 | 'plot_knn_regression', 34 | 'plot_logistic_regression_graph', 35 | 'plot_single_hidden_layer_graph', 36 | 'plot_two_hidden_layer_graph', 37 | 'plot_2d_classification', 38 | 'plot_2d_separator', 39 | 'plot_knn_classification', 40 | 'plot_linear_regression_wave', 41 | 'plot_tree_not_monotone', 42 | 'plot_scaling', 43 | 'plot_pca_illustration', 44 | 'plot_pca_faces', 45 | 'plot_pca_whitening', 46 | 'plot_decomposition', 47 | 'plot_nmf_illustration', 48 | 'plot_nmf_faces', 49 | 'plot_agglomerative', 50 | 'plot_agglomerative_algorithm', 51 | 'plot_kmeans_boundaries', 52 | 'plot_kmeans_algorithm', 53 | 'cm3', 'cm2', 'plot_improper_processing', 'plot_proper_processing', 54 | 'plot_label_kfold', 55 | 'plot_shuffle_split', 56 | 'plot_stratified_cross_validation', 57 | 'plot_threefold_split', 58 | 'plot_cross_validation', 59 | 'plot_grid_search_overview', 60 | 'plot_cross_val_selection', 61 | 'plot_confusion_matrix_illustration', 62 | 'plot_binary_confusion_matrix', 63 | 'plot_decision_threshold' 64 | ] 65 | -------------------------------------------------------------------------------- /src/mglearn/tools.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import make_blobs 3 | from sklearn.tree import export_graphviz 4 | import matplotlib.pyplot as plt 5 | from .plot_2d_separator import plot_2d_separator, plot_2d_classification, plot_2d_scores 6 | from .plot_helpers import cm2 as cm 7 | 8 | 9 | def visualize_coefficients(coefficients, feature_names, 
n_top_features=25): 10 | # get coefficients with large absolute values 11 | coef = coefficients.ravel() 12 | positive_coefficients = np.argsort(coef)[-n_top_features:] 13 | negative_coefficients = np.argsort(coef)[:n_top_features] 14 | interesting_coefficients = np.hstack([negative_coefficients, positive_coefficients]) 15 | # plot them 16 | plt.figure(figsize=(15, 5)) 17 | colors = ["red" if c < 0 else "blue" for c in coef[interesting_coefficients]] 18 | plt.bar(np.arange(2 * n_top_features), coef[interesting_coefficients], color=colors) 19 | feature_names = np.array(feature_names) 20 | plt.subplots_adjust(bottom=0.3) 21 | plt.xticks(np.arange(1, 1 + 2 * n_top_features), 22 | feature_names[interesting_coefficients], rotation=60, ha="right") 23 | 24 | 25 | def heatmap(values, xlabel, ylabel, xticklabels, yticklabels, cmap=None, 26 | vmin=None, vmax=None, ax=None, fmt="%0.2f"): 27 | if ax is None: 28 | ax = plt.gca() 29 | # plot the mean cross-validation scores 30 | img = ax.pcolor(values, cmap=cmap, vmin=None, vmax=None) 31 | img.update_scalarmappable() 32 | ax.set_xlabel(xlabel) 33 | ax.set_ylabel(ylabel) 34 | ax.set_xticks(np.arange(len(xticklabels)) + .5) 35 | ax.set_yticks(np.arange(len(yticklabels)) + .5) 36 | ax.set_xticklabels(xticklabels) 37 | ax.set_yticklabels(yticklabels) 38 | ax.set_aspect(1) 39 | 40 | for p, color, value in zip(img.get_paths(), img.get_facecolors(), img.get_array()): 41 | x, y = p.vertices[:-2, :].mean(0) 42 | if np.mean(color[:3]) > 0.5: 43 | c = 'k' 44 | else: 45 | c = 'w' 46 | ax.text(x, y, fmt % value, color=c, ha="center", va="center") 47 | return img 48 | 49 | 50 | def make_handcrafted_dataset(): 51 | # a carefully hand-designed dataset lol 52 | X, y = make_blobs(centers=2, random_state=4, n_samples=30) 53 | y[np.array([7, 27])] = 0 54 | mask = np.ones(len(X), dtype=np.bool) 55 | mask[np.array([0, 1, 5, 26])] = 0 56 | X, y = X[mask], y[mask] 57 | return X, y 58 | 59 | 60 | def print_topics(topics, feature_names, sorting, topics_per_chunk=6, n_words=20): 61 | for i in range(0, len(topics), topics_per_chunk): 62 | # for each chunk: 63 | these_topics = topics[i: i + topics_per_chunk] 64 | # maybe we have less than topics_per_chunk left 65 | len_this_chunk = len(these_topics) 66 | # print topic headers 67 | print(("topic {:<8}" * len_this_chunk).format(*these_topics)) 68 | print(("-------- {0:<5}" * len_this_chunk).format("")) 69 | # print top n_words frequent words 70 | for i in range(n_words): 71 | try: 72 | print(("{:<14}" * len_this_chunk).format(*feature_names[sorting[these_topics, i]])) 73 | except: 74 | pass 75 | print("\n") 76 | 77 | 78 | def get_tree(tree, **kwargs): 79 | try: 80 | # python3 81 | from io import StringIO 82 | except ImportError: 83 | # python2 84 | from StringIO import StringIO 85 | f = StringIO() 86 | export_graphviz(tree, f, **kwargs) 87 | import graphviz 88 | return graphviz.Source(f.getvalue()) 89 | 90 | __all__ = ['plot_2d_separator', 'plot_2d_classification', 91 | 'plot_2d_scores', 'cm', 'visualize_coefficients', 'print_topics', 'heatmap'] 92 | -------------------------------------------------------------------------------- /src/over_under_fit.py: -------------------------------------------------------------------------------- 1 | """Slightly modified from: 2 | http://scikit-learn.org/stable/auto_examples/model_selection/plot_underfitting_overfitting.html 3 | http://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html 4 | http://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_iris.html 5 | 6 | This example 
demonstrates the problems of underfitting and overfitting and how we can use 7 | linear regression with polynomial features to approximate nonlinear functions. The plot 8 | shows the function that we want to approximate, which is a part of the cosine function. 9 | In addition, the samples from the real function and the approximations of different models 10 | are displayed. The models have polynomial features of different degrees. We can see that 11 | a linear function (polynomial with degree 1) is not sufficient to fit the training samples. 12 | This is called underfitting. A polynomial of degree 4 approximates the true function almost 13 | perfectly. However, for higher degrees the model will overfit the training data, i.e. it 14 | learns the noise of the training data. We evaluate quantitatively overfitting / underfitting 15 | by using cross-validation. We calculate the mean squared error (MSE) on the validation set, 16 | the higher, the less likely the model generalizes correctly from the training data. 17 | """ 18 | import numpy as np 19 | import matplotlib.pyplot as plt 20 | from sklearn.pipeline import Pipeline 21 | from sklearn.preprocessing import PolynomialFeatures 22 | from sklearn.linear_model import LinearRegression 23 | from sklearn.model_selection import cross_val_score 24 | from sklearn.datasets import make_blobs 25 | from sklearn.cluster import k_means 26 | 27 | def true_fun(X): 28 | return np.cos(1.5 * np.pi * X) 29 | 30 | np.random.seed(0) 31 | 32 | def show(n_samples=25, degrees=[1,4,15]): 33 | X = np.sort(np.random.rand(n_samples)) 34 | y = true_fun(X) + np.random.randn(n_samples) * 0.1 35 | plt.figure(figsize=(8, 12)) 36 | 37 | for i in range(len(degrees)): 38 | ax = plt.subplot(len(degrees), 1, i + 1) 39 | plt.setp(ax, xticks=(), yticks=()) 40 | 41 | polynomial_features = PolynomialFeatures(degree=degrees[i], 42 | include_bias=False) 43 | linear_regression = LinearRegression() 44 | pipeline = Pipeline([("polynomial_features", polynomial_features), 45 | ("linear_regression", linear_regression)]) 46 | pipeline.fit(X[:, np.newaxis], y) 47 | 48 | X_test = np.linspace(0, 1, 100) 49 | plt.plot(X_test, pipeline.predict(X_test[:, np.newaxis]), label="Model") 50 | plt.plot(X_test, true_fun(X_test), label="True function") 51 | plt.scatter(X, y, edgecolor='b', s=20, label="Samples") 52 | plt.xlabel("x") 53 | plt.ylabel("y") 54 | plt.xlim((0, 1)) 55 | plt.ylim((-2, 2)) 56 | plt.legend(loc="best") 57 | plt.title("Poly-fit degree %d" % degrees[i]) 58 | 59 | plt.tight_layout() 60 | plt.show() 61 | 62 | def doc(): 63 | print(__doc__) 64 | 65 | #### the clustering parts 66 | blobs, classes = make_blobs(n_samples=500, 67 | centers=[(-4, -4), (-2, 2), (2, 0)], 68 | random_state=42) 69 | 70 | 71 | def cluster(n_clusters, known=False): 72 | centers, _classes, inertia = k_means(blobs, n_clusters=n_clusters) 73 | if known: 74 | _classes = classes 75 | plt.figure(figsize=(6, 6)) 76 | plt.scatter(blobs[:,0], blobs[:,1], 77 | c=np.array([('bgrcmykw'*10)[n] for n in _classes]), 78 | marker='.') 79 | plt.show() 80 | -------------------------------------------------------------------------------- /src/plot_cluster_comparison.py: -------------------------------------------------------------------------------- 1 | """ 2 | ========================================================= 3 | Comparing different clustering algorithms on toy datasets 4 | ========================================================= 5 | 6 | This example shows characteristics of different 7 | clustering algorithms on datasets that 
are "interesting" 8 | but still in 2D. With the exception of the last dataset, 9 | the parameters of each of these dataset-algorithm pairs 10 | has been tuned to produce good clustering results. Some 11 | algorithms are more sensitive to parameter values than 12 | others. 13 | 14 | The last dataset is an example of a 'null' situation for 15 | clustering: the data is homogeneous, and there is no good 16 | clustering. For this example, the null dataset uses the 17 | same parameters as the dataset in the row above it, which 18 | represents a mismatch in the parameter values and the 19 | data structure. 20 | 21 | While these examples give some intuition about the 22 | algorithms, this intuition might not apply to very high 23 | dimensional data. 24 | """ 25 | print(__doc__) 26 | 27 | import time 28 | import warnings 29 | 30 | import numpy as np 31 | import matplotlib.pyplot as plt 32 | 33 | from sklearn import cluster, datasets, mixture 34 | from sklearn.neighbors import kneighbors_graph 35 | from sklearn.preprocessing import StandardScaler 36 | from itertools import cycle, islice 37 | 38 | np.random.seed(0) 39 | 40 | # ============ 41 | # Generate datasets. We choose the size big enough to see the scalability 42 | # of the algorithms, but not too big to avoid too long running times 43 | # ============ 44 | n_samples = 1500 45 | noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5, 46 | noise=.05) 47 | noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05) 48 | blobs = datasets.make_blobs(n_samples=n_samples, random_state=8) 49 | no_structure = np.random.rand(n_samples, 2), None 50 | 51 | # Anisotropicly distributed data 52 | random_state = 170 53 | X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state) 54 | transformation = [[0.6, -0.6], [-0.4, 0.8]] 55 | X_aniso = np.dot(X, transformation) 56 | aniso = (X_aniso, y) 57 | 58 | # blobs with varied variances 59 | varied = datasets.make_blobs(n_samples=n_samples, 60 | cluster_std=[1.0, 2.5, 0.5], 61 | random_state=random_state) 62 | 63 | # ============ 64 | # Set up cluster parameters 65 | # ============ 66 | plt.figure(figsize=(9 * 2 + 3, 12.5)) 67 | plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05, 68 | hspace=.01) 69 | 70 | plot_num = 1 71 | 72 | default_base = {'quantile': .3, 73 | 'eps': .3, 74 | 'damping': .9, 75 | 'preference': -200, 76 | 'n_neighbors': 10, 77 | 'n_clusters': 3} 78 | 79 | datasets = [ 80 | (noisy_circles, {'damping': .77, 'preference': -240, 81 | 'quantile': .2, 'n_clusters': 2}), 82 | (noisy_moons, {'damping': .75, 'preference': -220, 'n_clusters': 2}), 83 | (varied, {'eps': .18, 'n_neighbors': 2}), 84 | (aniso, {'eps': .15, 'n_neighbors': 2}), 85 | (blobs, {}), 86 | (no_structure, {})] 87 | 88 | for i_dataset, (dataset, algo_params) in enumerate(datasets): 89 | # update parameters with dataset-specific values 90 | params = default_base.copy() 91 | params.update(algo_params) 92 | 93 | X, y = dataset 94 | 95 | # normalize dataset for easier parameter selection 96 | X = StandardScaler().fit_transform(X) 97 | 98 | # estimate bandwidth for mean shift 99 | bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile']) 100 | 101 | # connectivity matrix for structured Ward 102 | connectivity = kneighbors_graph( 103 | X, n_neighbors=params['n_neighbors'], include_self=False) 104 | # make connectivity symmetric 105 | connectivity = 0.5 * (connectivity + connectivity.T) 106 | 107 | # ============ 108 | # Create cluster objects 109 | # ============ 110 | 
ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True) 111 | two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters']) 112 | ward = cluster.AgglomerativeClustering( 113 | n_clusters=params['n_clusters'], linkage='ward', 114 | connectivity=connectivity) 115 | spectral = cluster.SpectralClustering( 116 | n_clusters=params['n_clusters'], eigen_solver='arpack', 117 | affinity="nearest_neighbors") 118 | dbscan = cluster.DBSCAN(eps=params['eps']) 119 | affinity_propagation = cluster.AffinityPropagation( 120 | damping=params['damping'], preference=params['preference']) 121 | average_linkage = cluster.AgglomerativeClustering( 122 | linkage="average", affinity="cityblock", 123 | n_clusters=params['n_clusters'], connectivity=connectivity) 124 | birch = cluster.Birch(n_clusters=params['n_clusters']) 125 | gmm = mixture.GaussianMixture( 126 | n_components=params['n_clusters'], covariance_type='full') 127 | 128 | clustering_algorithms = ( 129 | ('MiniBatchKMeans', two_means), 130 | ('AffinityPropagation', affinity_propagation), 131 | ('MeanShift', ms), 132 | ('SpectralClustering', spectral), 133 | ('Ward', ward), 134 | ('AgglomerativeClustering', average_linkage), 135 | ('DBSCAN', dbscan), 136 | ('Birch', birch), 137 | ('GaussianMixture', gmm) 138 | ) 139 | 140 | for name, algorithm in clustering_algorithms: 141 | t0 = time.time() 142 | 143 | # catch warnings related to kneighbors_graph 144 | with warnings.catch_warnings(): 145 | warnings.filterwarnings( 146 | "ignore", 147 | message="the number of connected components of the " + 148 | "connectivity matrix is [0-9]{1,2}" + 149 | " > 1. Completing it to avoid stopping the tree early.", 150 | category=UserWarning) 151 | warnings.filterwarnings( 152 | "ignore", 153 | message="Graph is not fully connected, spectral embedding" + 154 | " may not work as expected.", 155 | category=UserWarning) 156 | algorithm.fit(X) 157 | 158 | t1 = time.time() 159 | if hasattr(algorithm, 'labels_'): 160 | y_pred = algorithm.labels_.astype(np.int) 161 | else: 162 | y_pred = algorithm.predict(X) 163 | 164 | plt.subplot(len(datasets), len(clustering_algorithms), plot_num) 165 | if i_dataset == 0: 166 | plt.title(name, size=18) 167 | 168 | colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a', 169 | '#f781bf', '#a65628', '#984ea3', 170 | '#999999', '#e41a1c', '#dede00']), 171 | int(max(y_pred) + 1)))) 172 | # add black color for outliers (if any) 173 | colors = np.append(colors, ["#000000"]) 174 | plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred]) 175 | 176 | plt.xlim(-2.5, 2.5) 177 | plt.ylim(-2.5, 2.5) 178 | plt.xticks(()) 179 | plt.yticks(()) 180 | plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'), 181 | transform=plt.gca().transAxes, size=15, 182 | horizontalalignment='right') 183 | plot_num += 1 184 | 185 | plt.show() 186 | -------------------------------------------------------------------------------- /src/time_regressors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from sklearn import datasets 4 | california = datasets.california_housing.fetch_california_housing() 5 | X, y = california.data, california.target 6 | 7 | from sklearn.model_selection import train_test_split 8 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) 9 | 10 | from sklearn.neighbors import KNeighborsRegressor 11 | from sklearn.linear_model import LinearRegression, RANSACRegressor 12 | from sklearn.gaussian_process import GaussianProcessRegressor 13 | from 
sklearn.svm import SVR 14 | from sklearn.svm import LinearSVR 15 | regressors = [ 16 | LinearRegression(), 17 | RANSACRegressor(), 18 | KNeighborsRegressor(), 19 | KNeighborsRegressor(n_neighbors=9, metric='manhattan'), 20 | SVR(), 21 | LinearSVR(), 22 | SVR(kernel='linear'), # Cf. LinearSVR: much slower, might be better or worse: 23 | GaussianProcessRegressor(), 24 | ] 25 | 26 | from sklearn.metrics import explained_variance_score 27 | from sklearn.metrics import mean_absolute_error 28 | from sklearn.metrics import r2_score 29 | from time import time 30 | 31 | for model in regressors: 32 | start = time() 33 | model.fit(X_train, y_train) 34 | train_time = time() - start 35 | start = time() 36 | predictions = model.predict(X_test) 37 | predict_time = time()-start 38 | print(model) 39 | print("\tTraining time: %0.3fs" % train_time) 40 | print("\tPrediction time: %0.3fs" % predict_time) 41 | print("\tExplained variance:", explained_variance_score(y_test, predictions)) 42 | print("\tMean absolute error:", mean_absolute_error(y_test, predictions)) 43 | print("\tR2 score:", r2_score(y_test, predictions)) 44 | --------------------------------------------------------------------------------
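A note on src/time_regressors.py: the California housing training split holds roughly
15,000 rows, and GaussianProcessRegressor builds and factorizes an n-by-n kernel
matrix, so its fit step dominates the script's runtime and memory use by a wide
margin. A minimal workaround sketch, assuming X_train, y_train, and X_test, y_test as
defined in the script (the subsample size is an illustrative choice, not a tuned
recommendation):

    from sklearn.gaussian_process import GaussianProcessRegressor
    gpr = GaussianProcessRegressor()
    gpr.fit(X_train[:2000], y_train[:2000])  # subsample keeps the kernel matrix small
    print(gpr.score(X_test, y_test))         # R^2 on the untouched test split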